001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.nodes.CDataNode; 005import org.jsoup.nodes.Comment; 006import org.jsoup.nodes.Document; 007import org.jsoup.nodes.DocumentType; 008import org.jsoup.nodes.Element; 009import org.jsoup.nodes.Entities; 010import org.jsoup.nodes.LeafNode; 011import org.jsoup.nodes.Node; 012import org.jsoup.nodes.TextNode; 013import org.jsoup.nodes.XmlDeclaration; 014import org.jspecify.annotations.Nullable; 015 016import java.io.Reader; 017import java.io.StringReader; 018import java.util.List; 019 020import static org.jsoup.parser.Parser.NamespaceXml; 021 022/** 023 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the 024 * document. 025 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p> 026 * 027 * @author Jonathan Hedley 028 */ 029public class XmlTreeBuilder extends TreeBuilder { 030 @Override ParseSettings defaultSettings() { 031 return ParseSettings.preserveCase; 032 } 033 034 @Override 035 protected void initialiseParse(Reader input, String baseUri, Parser parser) { 036 super.initialiseParse(input, baseUri, parser); 037 doc.outputSettings() 038 .syntax(Document.OutputSettings.Syntax.xml) 039 .escapeMode(Entities.EscapeMode.xhtml) 040 .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not 041 } 042 043 Document parse(Reader input, String baseUri) { 044 return parse(input, baseUri, new Parser(this)); 045 } 046 047 Document parse(String input, String baseUri) { 048 return parse(new StringReader(input), baseUri, new Parser(this)); 049 } 050 051 @Override List<Node> completeParseFragment() { 052 return doc.childNodes(); 053 } 054 055 @Override 056 XmlTreeBuilder newInstance() { 057 return new XmlTreeBuilder(); 058 } 059 060 @Override public String defaultNamespace() { 061 return NamespaceXml; 062 } 063 064 @Override 065 protected boolean process(Token token) { 066 currentToken = token; 067 068 // start tag, end tag, doctype, comment, character, eof 069 switch (token.type) { 070 case StartTag: 071 insertElementFor(token.asStartTag()); 072 break; 073 case EndTag: 074 popStackToClose(token.asEndTag()); 075 break; 076 case Comment: 077 insertCommentFor(token.asComment()); 078 break; 079 case Character: 080 insertCharacterFor(token.asCharacter()); 081 break; 082 case Doctype: 083 insertDoctypeFor(token.asDoctype()); 084 break; 085 case EOF: // could put some normalisation here if desired 086 break; 087 default: 088 Validate.fail("Unexpected token type: " + token.type); 089 } 090 return true; 091 } 092 093 void insertElementFor(Token.StartTag startTag) { 094 Tag tag = tagFor(startTag.name(), settings); 095 if (startTag.attributes != null) 096 startTag.attributes.deduplicate(settings); 097 098 Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes)); 099 currentElement().appendChild(el); 100 push(el); 101 102 if (startTag.isSelfClosing()) { 103 tag.setSelfClosing(); 104 pop(); // push & pop ensures onNodeInserted & onNodeClosed 105 } 106 } 107 108 void insertLeafNode(LeafNode node) { 109 currentElement().appendChild(node); 110 onNodeInserted(node); 111 } 112 113 void insertCommentFor(Token.Comment commentToken) { 114 Comment comment = new Comment(commentToken.getData()); 115 LeafNode insert = comment; 116 if (commentToken.bogus && comment.isXmlDeclaration()) { 117 // xml declarations are emitted as bogus comments (which is right for html, but not xml) 118 // so we do a bit of a hack and parse the data as an element to pull the attributes out 119 // todo - refactor this to parse more appropriately 120 XmlDeclaration decl = comment.asXmlDeclaration(); // else, we couldn't parse it as a decl, so leave as a comment 121 if (decl != null) 122 insert = decl; 123 } 124 insertLeafNode(insert); 125 } 126 127 void insertCharacterFor(Token.Character token) { 128 final String data = token.getData(); 129 insertLeafNode(token.isCData() ? new CDataNode(data) : new TextNode(data)); 130 } 131 132 void insertDoctypeFor(Token.Doctype token) { 133 DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier()); 134 doctypeNode.setPubSysKey(token.getPubSysKey()); 135 insertLeafNode(doctypeNode); 136 } 137 138 /** 139 * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not 140 * found, skips. 141 * 142 * @param endTag tag to close 143 */ 144 protected void popStackToClose(Token.EndTag endTag) { 145 // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks 146 String elName = settings.normalizeTag(endTag.tagName); 147 Element firstFound = null; 148 149 final int bottom = stack.size() - 1; 150 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 151 152 for (int pos = stack.size() -1; pos >= upper; pos--) { 153 Element next = stack.get(pos); 154 if (next.nodeName().equals(elName)) { 155 firstFound = next; 156 break; 157 } 158 } 159 if (firstFound == null) 160 return; // not found, skip 161 162 for (int pos = stack.size() -1; pos >= 0; pos--) { 163 Element next = pop(); 164 if (next == firstFound) { 165 break; 166 } 167 } 168 } 169 private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain 170}