001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.nodes.CDataNode;
005import org.jsoup.nodes.Comment;
006import org.jsoup.nodes.Document;
007import org.jsoup.nodes.DocumentType;
008import org.jsoup.nodes.Element;
009import org.jsoup.nodes.Entities;
010import org.jsoup.nodes.LeafNode;
011import org.jsoup.nodes.Node;
012import org.jsoup.nodes.TextNode;
013import org.jsoup.nodes.XmlDeclaration;
014import org.jspecify.annotations.Nullable;
015
016import java.io.Reader;
017import java.io.StringReader;
018import java.util.List;
019
020import static org.jsoup.parser.Parser.NamespaceXml;
021
022/**
023 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the
024 * document.
025 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
026 *
027 * @author Jonathan Hedley
028 */
029public class XmlTreeBuilder extends TreeBuilder {
030    @Override ParseSettings defaultSettings() {
031        return ParseSettings.preserveCase;
032    }
033
034    @Override
035    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
036        super.initialiseParse(input, baseUri, parser);
037        doc.outputSettings()
038            .syntax(Document.OutputSettings.Syntax.xml)
039            .escapeMode(Entities.EscapeMode.xhtml)
040            .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not
041    }
042
043    Document parse(Reader input, String baseUri) {
044        return parse(input, baseUri, new Parser(this));
045    }
046
047    Document parse(String input, String baseUri) {
048        return parse(new StringReader(input), baseUri, new Parser(this));
049    }
050
051    @Override List<Node> completeParseFragment() {
052        return doc.childNodes();
053    }
054
055    @Override
056    XmlTreeBuilder newInstance() {
057        return new XmlTreeBuilder();
058    }
059
060    @Override public String defaultNamespace() {
061        return NamespaceXml;
062    }
063
064    @Override
065    protected boolean process(Token token) {
066        currentToken = token;
067
068        // start tag, end tag, doctype, comment, character, eof
069        switch (token.type) {
070            case StartTag:
071                insertElementFor(token.asStartTag());
072                break;
073            case EndTag:
074                popStackToClose(token.asEndTag());
075                break;
076            case Comment:
077                insertCommentFor(token.asComment());
078                break;
079            case Character:
080                insertCharacterFor(token.asCharacter());
081                break;
082            case Doctype:
083                insertDoctypeFor(token.asDoctype());
084                break;
085            case EOF: // could put some normalisation here if desired
086                break;
087            default:
088                Validate.fail("Unexpected token type: " + token.type);
089        }
090        return true;
091    }
092
093    void insertElementFor(Token.StartTag startTag) {
094        Tag tag = tagFor(startTag.name(), settings);
095        if (startTag.attributes != null)
096            startTag.attributes.deduplicate(settings);
097
098        Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes));
099        currentElement().appendChild(el);
100        push(el);
101
102        if (startTag.isSelfClosing()) {
103            tag.setSelfClosing();
104            pop(); // push & pop ensures onNodeInserted & onNodeClosed
105        }
106    }
107
108    void insertLeafNode(LeafNode node) {
109        currentElement().appendChild(node);
110        onNodeInserted(node);
111    }
112
113    void insertCommentFor(Token.Comment commentToken) {
114        Comment comment = new Comment(commentToken.getData());
115        LeafNode insert = comment;
116        if (commentToken.bogus && comment.isXmlDeclaration()) {
117            // xml declarations are emitted as bogus comments (which is right for html, but not xml)
118            // so we do a bit of a hack and parse the data as an element to pull the attributes out
119            // todo - refactor this to parse more appropriately
120            XmlDeclaration decl = comment.asXmlDeclaration(); // else, we couldn't parse it as a decl, so leave as a comment
121            if (decl != null)
122                insert = decl;
123        }
124        insertLeafNode(insert);
125    }
126
127    void insertCharacterFor(Token.Character token) {
128        final String data = token.getData();
129        insertLeafNode(token.isCData() ? new CDataNode(data) : new TextNode(data));
130    }
131
132    void insertDoctypeFor(Token.Doctype token) {
133        DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier());
134        doctypeNode.setPubSysKey(token.getPubSysKey());
135        insertLeafNode(doctypeNode);
136    }
137
138    /**
139     * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
140     * found, skips.
141     *
142     * @param endTag tag to close
143     */
144    protected void popStackToClose(Token.EndTag endTag) {
145        // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks
146        String elName = settings.normalizeTag(endTag.tagName);
147        Element firstFound = null;
148
149        final int bottom = stack.size() - 1;
150        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
151
152        for (int pos = stack.size() -1; pos >= upper; pos--) {
153            Element next = stack.get(pos);
154            if (next.nodeName().equals(elName)) {
155                firstFound = next;
156                break;
157            }
158        }
159        if (firstFound == null)
160            return; // not found, skip
161
162        for (int pos = stack.size() -1; pos >= 0; pos--) {
163            Element next = pop();
164            if (next == firstFound) {
165                break;
166            }
167        }
168    }
169    private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain
170}