001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.nodes.CDataNode;
005import org.jsoup.nodes.Comment;
006import org.jsoup.nodes.Document;
007import org.jsoup.nodes.DocumentType;
008import org.jsoup.nodes.Element;
009import org.jsoup.nodes.Entities;
010import org.jsoup.nodes.LeafNode;
011import org.jsoup.nodes.Node;
012import org.jsoup.nodes.TextNode;
013import org.jsoup.nodes.XmlDeclaration;
014
015import java.io.Reader;
016import java.io.StringReader;
017import java.util.List;
018
019import static org.jsoup.parser.Parser.NamespaceXml;
020
021/**
022 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the
023 * document.
024 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
025 *
026 * @author Jonathan Hedley
027 */
028public class XmlTreeBuilder extends TreeBuilder {
029    @Override ParseSettings defaultSettings() {
030        return ParseSettings.preserveCase;
031    }
032
033    @Override
034    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
035        super.initialiseParse(input, baseUri, parser);
036        doc.outputSettings()
037            .syntax(Document.OutputSettings.Syntax.xml)
038            .escapeMode(Entities.EscapeMode.xhtml)
039            .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not
040    }
041
042    Document parse(Reader input, String baseUri) {
043        return parse(input, baseUri, new Parser(this));
044    }
045
046    Document parse(String input, String baseUri) {
047        return parse(new StringReader(input), baseUri, new Parser(this));
048    }
049
050    @Override List<Node> completeParseFragment() {
051        return doc.childNodes();
052    }
053
054    @Override
055    XmlTreeBuilder newInstance() {
056        return new XmlTreeBuilder();
057    }
058
059    @Override public String defaultNamespace() {
060        return NamespaceXml;
061    }
062
063    @Override
064    protected boolean process(Token token) {
065        currentToken = token;
066
067        // start tag, end tag, doctype, xmldecl, comment, character, eof
068        switch (token.type) {
069            case StartTag:
070                insertElementFor(token.asStartTag());
071                break;
072            case EndTag:
073                popStackToClose(token.asEndTag());
074                break;
075            case Comment:
076                insertCommentFor(token.asComment());
077                break;
078            case Character:
079                insertCharacterFor(token.asCharacter());
080                break;
081            case Doctype:
082                insertDoctypeFor(token.asDoctype());
083                break;
084            case XmlDecl:
085                insertXmlDeclarationFor(token.asXmlDecl());
086                break;
087            case EOF: // could put some normalisation here if desired
088                break;
089            default:
090                Validate.fail("Unexpected token type: " + token.type);
091        }
092        return true;
093    }
094
095    void insertElementFor(Token.StartTag startTag) {
096        Tag tag = tagFor(startTag.name(), startTag.normalName(), defaultNamespace(), settings);
097        if (startTag.attributes != null)
098            startTag.attributes.deduplicate(settings);
099
100        Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes));
101        currentElement().appendChild(el);
102        push(el);
103
104        if (startTag.isSelfClosing()) {
105            tag.setSelfClosing();
106            pop(); // push & pop ensures onNodeInserted & onNodeClosed
107        }
108    }
109
110    void insertLeafNode(LeafNode node) {
111        currentElement().appendChild(node);
112        onNodeInserted(node);
113    }
114
115    void insertCommentFor(Token.Comment commentToken) {
116        Comment comment = new Comment(commentToken.getData());
117        LeafNode insert = comment;
118        if (commentToken.bogus && comment.isXmlDeclaration()) {
119            // xml declarations are emitted as bogus comments (which is right for html, but not xml)
120            // so we do a bit of a hack and parse the data as an element to pull the attributes out
121            // todo - refactor this to parse more appropriately
122            XmlDeclaration decl = comment.asXmlDeclaration(); // else, we couldn't parse it as a decl, so leave as a comment
123            if (decl != null)
124                insert = decl;
125        }
126        insertLeafNode(insert);
127    }
128
129    void insertCharacterFor(Token.Character token) {
130        final String data = token.getData();
131        insertLeafNode(token.isCData() ? new CDataNode(data) : new TextNode(data));
132    }
133
134    void insertDoctypeFor(Token.Doctype token) {
135        DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier());
136        doctypeNode.setPubSysKey(token.getPubSysKey());
137        insertLeafNode(doctypeNode);
138    }
139
140    void insertXmlDeclarationFor(Token.XmlDecl token) {
141        XmlDeclaration decl = new XmlDeclaration(token.name(), token.isDeclaration);
142        if (token.attributes != null) decl.attributes().addAll(token.attributes);
143        insertLeafNode(decl);
144    }
145
146    /**
147     * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
148     * found, skips.
149     *
150     * @param endTag tag to close
151     */
152    protected void popStackToClose(Token.EndTag endTag) {
153        // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks
154        String elName = settings.normalizeTag(endTag.tagName);
155        Element firstFound = null;
156
157        final int bottom = stack.size() - 1;
158        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
159
160        for (int pos = stack.size() -1; pos >= upper; pos--) {
161            Element next = stack.get(pos);
162            if (next.nodeName().equals(elName)) {
163                firstFound = next;
164                break;
165            }
166        }
167        if (firstFound == null)
168            return; // not found, skip
169
170        for (int pos = stack.size() -1; pos >= 0; pos--) {
171            Element next = pop();
172            if (next == firstFound) {
173                break;
174            }
175        }
176    }
177    private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain
178}