001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.SharedConstants;
005import org.jsoup.nodes.Attribute;
006import org.jsoup.nodes.Attributes;
007import org.jsoup.nodes.CDataNode;
008import org.jsoup.nodes.Comment;
009import org.jsoup.nodes.DataNode;
010import org.jsoup.nodes.Document;
011import org.jsoup.nodes.DocumentType;
012import org.jsoup.nodes.Element;
013import org.jsoup.nodes.Entities;
014import org.jsoup.nodes.LeafNode;
015import org.jsoup.nodes.Node;
016import org.jsoup.nodes.TextNode;
017import org.jsoup.nodes.XmlDeclaration;
018import org.jsoup.select.Elements;
019import org.jspecify.annotations.Nullable;
020
021import java.io.Reader;
022import java.io.StringReader;
023import java.util.ArrayDeque;
024import java.util.HashMap;
025import java.util.List;
026import java.util.Map;
027
028import static org.jsoup.parser.Parser.NamespaceXml;
029
030/**
031 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the
032 * document.
033 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
034 *
035 * @author Jonathan Hedley
036 */
037public class XmlTreeBuilder extends TreeBuilder {
038    static final String XmlnsKey = "xmlns";
039    static final String XmlnsPrefix = "xmlns:";
040    private final ArrayDeque<HashMap<String, String>> namespacesStack = new ArrayDeque<>(); // stack of namespaces, prefix => urn
041
042    @Override ParseSettings defaultSettings() {
043        return ParseSettings.preserveCase;
044    }
045
046    @Override
047    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
048        super.initialiseParse(input, baseUri, parser);
049        doc.outputSettings()
050            .syntax(Document.OutputSettings.Syntax.xml)
051            .escapeMode(Entities.EscapeMode.xhtml)
052            .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not
053
054        namespacesStack.clear();
055        HashMap<String, String> ns = new HashMap<>();
056        ns.put("xml", NamespaceXml);
057        ns.put("", NamespaceXml);
058        namespacesStack.push(ns);
059    }
060
061    @Override
062    void initialiseParseFragment(@Nullable Element context) {
063        super.initialiseParseFragment(context);
064        if (context == null) return;
065
066        // transition to the tag's text state if available
067        TokeniserState textState = context.tag().textState();
068        if (textState != null) tokeniser.transition(textState);
069
070        // reconstitute the namespace stack by traversing the element and its parents (top down)
071        Elements chain = context.parents();
072        chain.add(0, context);
073        for (int i = chain.size() - 1; i >= 0; i--) {
074            Element el = chain.get(i);
075            HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek());
076            namespacesStack.push(namespaces);
077            if (el.attributesSize() > 0) {
078                processNamespaces(el.attributes(), namespaces);
079            }
080        }
081    }
082
083    Document parse(Reader input, String baseUri) {
084        return parse(input, baseUri, new Parser(this));
085    }
086
087    Document parse(String input, String baseUri) {
088        return parse(new StringReader(input), baseUri, new Parser(this));
089    }
090
091    @Override List<Node> completeParseFragment() {
092        return doc.childNodes();
093    }
094
095    @Override
096    XmlTreeBuilder newInstance() {
097        return new XmlTreeBuilder();
098    }
099
100    @Override public String defaultNamespace() {
101        return NamespaceXml;
102    }
103
104    @Override
105    TagSet defaultTagSet() {
106        return new TagSet(); // an empty tagset
107    }
108
109    @Override
110    protected boolean process(Token token) {
111        currentToken = token;
112
113        // start tag, end tag, doctype, xmldecl, comment, character, eof
114        switch (token.type) {
115            case StartTag:
116                insertElementFor(token.asStartTag());
117                break;
118            case EndTag:
119                popStackToClose(token.asEndTag());
120                break;
121            case Comment:
122                insertCommentFor(token.asComment());
123                break;
124            case Character:
125                insertCharacterFor(token.asCharacter());
126                break;
127            case Doctype:
128                insertDoctypeFor(token.asDoctype());
129                break;
130            case XmlDecl:
131                insertXmlDeclarationFor(token.asXmlDecl());
132                break;
133            case EOF: // could put some normalisation here if desired
134                break;
135            default:
136                Validate.fail("Unexpected token type: " + token.type);
137        }
138        return true;
139    }
140
141    void insertElementFor(Token.StartTag startTag) {
142        // handle namespace for tag
143        HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek());
144        namespacesStack.push(namespaces);
145
146        Attributes attributes = startTag.attributes;
147        if (attributes != null) {
148            attributes.deduplicate(settings);
149            processNamespaces(attributes, namespaces);
150            applyNamespacesToAttributes(attributes, namespaces);
151        }
152
153        String tagName = startTag.tagName.value();
154        String ns = resolveNamespace(tagName, namespaces);
155        Tag tag = tagFor(tagName, startTag.normalName, ns, settings);
156        Element el = new Element(tag, null, settings.normalizeAttributes(attributes));
157        currentElement().appendChild(el);
158        push(el);
159
160        if (startTag.isSelfClosing()) {
161            tag.setSeenSelfClose();
162            pop(); // push & pop ensures onNodeInserted & onNodeClosed
163        } else if (tag.isEmpty()) {
164            pop(); // custom defined void tag
165        } else {
166            TokeniserState textState = tag.textState();
167            if (textState != null) tokeniser.transition(textState);
168        }
169    }
170
171    private static void processNamespaces(Attributes attributes, HashMap<String, String> namespaces) {
172        // process attributes for namespaces (xmlns, xmlns:)
173        for (Attribute attr : attributes) {
174            String key = attr.getKey();
175            String value = attr.getValue();
176            if (key.equals(XmlnsKey)) {
177                namespaces.put("", value); // new default for this level
178            } else if (key.startsWith(XmlnsPrefix)) {
179                String nsPrefix = key.substring(XmlnsPrefix.length());
180                namespaces.put(nsPrefix, value);
181            }
182        }
183    }
184
185    private static void applyNamespacesToAttributes(Attributes attributes, HashMap<String, String> namespaces) {
186        // second pass, apply namespace to attributes. Collects them first then adds (as userData is an attribute)
187        Map<String, String> attrPrefix = new HashMap<>();
188        for (Attribute attr: attributes) {
189            String prefix = attr.prefix();
190            if (!prefix.isEmpty()) {
191                if (prefix.equals(XmlnsKey)) continue;
192                String ns = namespaces.get(prefix);
193                if (ns != null) attrPrefix.put(SharedConstants.XmlnsAttr + prefix, ns);
194            }
195        }
196        for (Map.Entry<String, String> entry : attrPrefix.entrySet())
197            attributes.userData(entry.getKey(), entry.getValue());
198    }
199
200    private static String resolveNamespace(String tagName, HashMap<String, String> namespaces) {
201        String ns = namespaces.get("");
202        int pos = tagName.indexOf(':');
203        if (pos > 0) {
204            String prefix = tagName.substring(0, pos);
205            if (namespaces.containsKey(prefix))
206                ns = namespaces.get(prefix);
207        }
208        return ns;
209    }
210
211    void insertLeafNode(LeafNode node) {
212        currentElement().appendChild(node);
213        onNodeInserted(node);
214    }
215
216    void insertCommentFor(Token.Comment commentToken) {
217        Comment comment = new Comment(commentToken.getData());
218        insertLeafNode(comment);
219    }
220
221    void insertCharacterFor(Token.Character token) {
222        final String data = token.getData();
223        LeafNode node;
224        if      (token.isCData())                       node = new CDataNode(data);
225        else if (currentElement().tag().is(Tag.Data))   node = new DataNode(data);
226        else                                            node = new TextNode(data);
227        insertLeafNode(node);
228    }
229
230    void insertDoctypeFor(Token.Doctype token) {
231        DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier());
232        doctypeNode.setPubSysKey(token.getPubSysKey());
233        insertLeafNode(doctypeNode);
234    }
235
236    void insertXmlDeclarationFor(Token.XmlDecl token) {
237        XmlDeclaration decl = new XmlDeclaration(token.name(), token.isDeclaration);
238        if (token.attributes != null) decl.attributes().addAll(token.attributes);
239        insertLeafNode(decl);
240    }
241
242    @Override
243    Element pop() {
244        namespacesStack.pop();
245        return super.pop();
246    }
247
248    /**
249     * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
250     * found, skips.
251     *
252     * @param endTag tag to close
253     */
254    protected void popStackToClose(Token.EndTag endTag) {
255        // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks
256        String elName = settings.normalizeTag(endTag.name());
257        Element firstFound = null;
258
259        final int bottom = stack.size() - 1;
260        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
261
262        for (int pos = stack.size() -1; pos >= upper; pos--) {
263            Element next = stack.get(pos);
264            if (next.nodeName().equals(elName)) {
265                firstFound = next;
266                break;
267            }
268        }
269        if (firstFound == null)
270            return; // not found, skip
271
272        for (int pos = stack.size() -1; pos >= 0; pos--) {
273            Element next = pop();
274            if (next == firstFound) {
275                break;
276            }
277        }
278    }
279    private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain
280}