001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.SharedConstants;
005import org.jsoup.nodes.Attribute;
006import org.jsoup.nodes.Attributes;
007import org.jsoup.nodes.CDataNode;
008import org.jsoup.nodes.Comment;
009import org.jsoup.nodes.DataNode;
010import org.jsoup.nodes.Document;
011import org.jsoup.nodes.DocumentType;
012import org.jsoup.nodes.Element;
013import org.jsoup.nodes.Entities;
014import org.jsoup.nodes.LeafNode;
015import org.jsoup.nodes.Node;
016import org.jsoup.nodes.TextNode;
017import org.jsoup.nodes.XmlDeclaration;
018import org.jsoup.select.Elements;
019import org.jspecify.annotations.Nullable;
020
021import java.io.Reader;
022import java.io.StringReader;
023import java.util.ArrayDeque;
024import java.util.HashMap;
025import java.util.List;
026import java.util.Map;
027
028import static org.jsoup.parser.Parser.NamespaceXml;
029
030/**
031 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the
032 * document.
033 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
034 *
035 * @author Jonathan Hedley
036 */
037public class XmlTreeBuilder extends TreeBuilder {
038    static final String XmlnsKey = "xmlns";
039    static final String XmlnsPrefix = "xmlns:";
040    private final ArrayDeque<HashMap<String, String>> namespacesStack = new ArrayDeque<>(); // stack of namespaces, prefix => urn
041
042    @Override ParseSettings defaultSettings() {
043        return ParseSettings.preserveCase;
044    }
045
046    @Override
047    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
048        super.initialiseParse(input, baseUri, parser);
049        doc.outputSettings()
050            .syntax(Document.OutputSettings.Syntax.xml)
051            .escapeMode(Entities.EscapeMode.xhtml)
052            .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not
053
054        namespacesStack.clear();
055        HashMap<String, String> ns = new HashMap<>();
056        ns.put("xml", NamespaceXml);
057        ns.put("", NamespaceXml);
058        namespacesStack.push(ns);
059    }
060
061    @Override
062    void initialiseParseFragment(@Nullable Element context) {
063        super.initialiseParseFragment(context);
064        if (context == null) return;
065
066        // transition to the tag's text state if available
067        TokeniserState textState = context.tag().textState();
068        if (textState != null) tokeniser.transition(textState);
069
070        // reconstitute the namespace stack by traversing the element and its parents (top down)
071        Elements chain = context.parents();
072        chain.add(0, context);
073        for (int i = chain.size() - 1; i >= 0; i--) {
074            Element el = chain.get(i);
075            HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek());
076            namespacesStack.push(namespaces);
077            if (el.attributesSize() > 0) {
078                processNamespaces(el.attributes(), namespaces);
079            }
080        }
081    }
082
083    Document parse(Reader input, String baseUri) {
084        return parse(input, baseUri, new Parser(this));
085    }
086
087    Document parse(String input, String baseUri) {
088        return parse(new StringReader(input), baseUri, new Parser(this));
089    }
090
091    @Override List<Node> completeParseFragment() {
092        return doc.childNodes();
093    }
094
095    @Override
096    XmlTreeBuilder newInstance() {
097        return new XmlTreeBuilder();
098    }
099
100    @Override public String defaultNamespace() {
101        return NamespaceXml;
102    }
103
104    @Override
105    TagSet defaultTagSet() {
106        return new TagSet(); // an empty tagset
107    }
108
109    @Override
110    protected boolean process(Token token) {
111        currentToken = token;
112
113        // start tag, end tag, doctype, xmldecl, comment, character, eof
114        switch (token.type) {
115            case StartTag:
116                insertElementFor(token.asStartTag());
117                break;
118            case EndTag:
119                popStackToClose(token.asEndTag());
120                break;
121            case Comment:
122                insertCommentFor(token.asComment());
123                break;
124            case Character:
125                insertCharacterFor(token.asCharacter());
126                break;
127            case Doctype:
128                insertDoctypeFor(token.asDoctype());
129                break;
130            case XmlDecl:
131                insertXmlDeclarationFor(token.asXmlDecl());
132                break;
133            case EOF: // could put some normalisation here if desired
134                break;
135            default:
136                Validate.fail("Unexpected token type: " + token.type);
137        }
138        return true;
139    }
140
141    void insertElementFor(Token.StartTag startTag) {
142        // handle namespace for tag
143        HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek());
144        namespacesStack.push(namespaces);
145
146        Attributes attributes = startTag.attributes;
147        if (attributes != null) {
148            settings.normalizeAttributes(attributes);
149            attributes.deduplicate(settings);
150            processNamespaces(attributes, namespaces);
151            applyNamespacesToAttributes(attributes, namespaces);
152        }
153
154        String tagName = startTag.tagName.value();
155        String ns = resolveNamespace(tagName, namespaces);
156        Tag tag = tagFor(tagName, startTag.normalName, ns, settings);
157        Element el = new Element(tag, null, attributes);
158        currentElement().appendChild(el);
159        push(el);
160
161        if (startTag.isSelfClosing()) {
162            tag.setSeenSelfClose();
163            pop(); // push & pop ensures onNodeInserted & onNodeClosed
164        } else if (tag.isEmpty()) {
165            pop(); // custom defined void tag
166        } else {
167            TokeniserState textState = tag.textState();
168            if (textState != null) tokeniser.transition(textState);
169        }
170    }
171
172    private static void processNamespaces(Attributes attributes, HashMap<String, String> namespaces) {
173        // process attributes for namespaces (xmlns, xmlns:)
174        for (Attribute attr : attributes) {
175            String key = attr.getKey();
176            String value = attr.getValue();
177            if (key.equals(XmlnsKey)) {
178                namespaces.put("", value); // new default for this level
179            } else if (key.startsWith(XmlnsPrefix)) {
180                String nsPrefix = key.substring(XmlnsPrefix.length());
181                namespaces.put(nsPrefix, value);
182            }
183        }
184    }
185
186    private static void applyNamespacesToAttributes(Attributes attributes, HashMap<String, String> namespaces) {
187        // second pass, apply namespace to attributes. Collects them first then adds (as userData is an attribute)
188        Map<String, String> attrPrefix = new HashMap<>();
189        for (Attribute attr: attributes) {
190            String prefix = attr.prefix();
191            if (!prefix.isEmpty()) {
192                if (prefix.equals(XmlnsKey)) continue;
193                String ns = namespaces.get(prefix);
194                if (ns != null) attrPrefix.put(SharedConstants.XmlnsAttr + prefix, ns);
195            }
196        }
197        for (Map.Entry<String, String> entry : attrPrefix.entrySet())
198            attributes.userData(entry.getKey(), entry.getValue());
199    }
200
201    private static String resolveNamespace(String tagName, HashMap<String, String> namespaces) {
202        String ns = namespaces.get("");
203        int pos = tagName.indexOf(':');
204        if (pos > 0) {
205            String prefix = tagName.substring(0, pos);
206            if (namespaces.containsKey(prefix))
207                ns = namespaces.get(prefix);
208        }
209        return ns;
210    }
211
212    void insertLeafNode(LeafNode node) {
213        currentElement().appendChild(node);
214        onNodeInserted(node);
215    }
216
217    void insertCommentFor(Token.Comment commentToken) {
218        Comment comment = new Comment(commentToken.getData());
219        insertLeafNode(comment);
220    }
221
222    void insertCharacterFor(Token.Character token) {
223        final String data = token.getData();
224        LeafNode node;
225        if      (token.isCData())                       node = new CDataNode(data);
226        else if (currentElement().tag().is(Tag.Data))   node = new DataNode(data);
227        else                                            node = new TextNode(data);
228        insertLeafNode(node);
229    }
230
231    void insertDoctypeFor(Token.Doctype token) {
232        DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier());
233        doctypeNode.setPubSysKey(token.getPubSysKey());
234        insertLeafNode(doctypeNode);
235    }
236
237    void insertXmlDeclarationFor(Token.XmlDecl token) {
238        XmlDeclaration decl = new XmlDeclaration(token.name(), token.isDeclaration);
239        if (token.attributes != null) decl.attributes().addAll(token.attributes);
240        insertLeafNode(decl);
241    }
242
243    @Override
244    Element pop() {
245        namespacesStack.pop();
246        return super.pop();
247    }
248
249    /**
250     * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
251     * found, skips.
252     *
253     * @param endTag tag to close
254     */
255    protected void popStackToClose(Token.EndTag endTag) {
256        // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks
257        String elName = settings.normalizeTag(endTag.name());
258        Element firstFound = null;
259
260        final int bottom = stack.size() - 1;
261        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
262
263        for (int pos = stack.size() -1; pos >= upper; pos--) {
264            Element next = stack.get(pos);
265            if (next.nodeName().equals(elName)) {
266                firstFound = next;
267                break;
268            }
269        }
270        if (firstFound == null)
271            return; // not found, skip
272
273        for (int pos = stack.size() -1; pos >= 0; pos--) {
274            Element next = pop();
275            if (next == firstFound) {
276                break;
277            }
278        }
279    }
280    private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain
281}