001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.nodes.CDataNode; 005import org.jsoup.nodes.Comment; 006import org.jsoup.nodes.Document; 007import org.jsoup.nodes.DocumentType; 008import org.jsoup.nodes.Element; 009import org.jsoup.nodes.Entities; 010import org.jsoup.nodes.LeafNode; 011import org.jsoup.nodes.Node; 012import org.jsoup.nodes.TextNode; 013import org.jsoup.nodes.XmlDeclaration; 014 015import java.io.Reader; 016import java.io.StringReader; 017import java.util.List; 018 019import static org.jsoup.parser.Parser.NamespaceXml; 020 021/** 022 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the 023 * document. 024 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p> 025 * 026 * @author Jonathan Hedley 027 */ 028public class XmlTreeBuilder extends TreeBuilder { 029 @Override ParseSettings defaultSettings() { 030 return ParseSettings.preserveCase; 031 } 032 033 @Override 034 protected void initialiseParse(Reader input, String baseUri, Parser parser) { 035 super.initialiseParse(input, baseUri, parser); 036 doc.outputSettings() 037 .syntax(Document.OutputSettings.Syntax.xml) 038 .escapeMode(Entities.EscapeMode.xhtml) 039 .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not 040 } 041 042 Document parse(Reader input, String baseUri) { 043 return parse(input, baseUri, new Parser(this)); 044 } 045 046 Document parse(String input, String baseUri) { 047 return parse(new StringReader(input), baseUri, new Parser(this)); 048 } 049 050 @Override List<Node> completeParseFragment() { 051 return doc.childNodes(); 052 } 053 054 @Override 055 XmlTreeBuilder newInstance() { 056 return new XmlTreeBuilder(); 057 } 058 059 @Override public String defaultNamespace() { 060 return NamespaceXml; 061 } 062 063 @Override 064 protected boolean process(Token token) { 065 currentToken = token; 066 067 // start tag, end tag, doctype, xmldecl, comment, character, eof 068 switch (token.type) { 069 case StartTag: 070 insertElementFor(token.asStartTag()); 071 break; 072 case EndTag: 073 popStackToClose(token.asEndTag()); 074 break; 075 case Comment: 076 insertCommentFor(token.asComment()); 077 break; 078 case Character: 079 insertCharacterFor(token.asCharacter()); 080 break; 081 case Doctype: 082 insertDoctypeFor(token.asDoctype()); 083 break; 084 case XmlDecl: 085 insertXmlDeclarationFor(token.asXmlDecl()); 086 break; 087 case EOF: // could put some normalisation here if desired 088 break; 089 default: 090 Validate.fail("Unexpected token type: " + token.type); 091 } 092 return true; 093 } 094 095 void insertElementFor(Token.StartTag startTag) { 096 Tag tag = tagFor(startTag.name(), startTag.normalName(), defaultNamespace(), settings); 097 if (startTag.attributes != null) 098 startTag.attributes.deduplicate(settings); 099 100 Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes)); 101 currentElement().appendChild(el); 102 push(el); 103 104 if (startTag.isSelfClosing()) { 105 tag.setSelfClosing(); 106 pop(); // push & pop ensures onNodeInserted & onNodeClosed 107 } 108 } 109 110 void insertLeafNode(LeafNode node) { 111 currentElement().appendChild(node); 112 onNodeInserted(node); 113 } 114 115 void insertCommentFor(Token.Comment commentToken) { 116 Comment comment = new Comment(commentToken.getData()); 117 LeafNode insert = comment; 118 if (commentToken.bogus && comment.isXmlDeclaration()) { 119 // xml declarations are emitted as bogus comments (which is right for html, but not xml) 120 // so we do a bit of a hack and parse the data as an element to pull the attributes out 121 // todo - refactor this to parse more appropriately 122 XmlDeclaration decl = comment.asXmlDeclaration(); // else, we couldn't parse it as a decl, so leave as a comment 123 if (decl != null) 124 insert = decl; 125 } 126 insertLeafNode(insert); 127 } 128 129 void insertCharacterFor(Token.Character token) { 130 final String data = token.getData(); 131 insertLeafNode(token.isCData() ? new CDataNode(data) : new TextNode(data)); 132 } 133 134 void insertDoctypeFor(Token.Doctype token) { 135 DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier()); 136 doctypeNode.setPubSysKey(token.getPubSysKey()); 137 insertLeafNode(doctypeNode); 138 } 139 140 void insertXmlDeclarationFor(Token.XmlDecl token) { 141 XmlDeclaration decl = new XmlDeclaration(token.name(), token.isDeclaration); 142 if (token.attributes != null) decl.attributes().addAll(token.attributes); 143 insertLeafNode(decl); 144 } 145 146 /** 147 * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not 148 * found, skips. 149 * 150 * @param endTag tag to close 151 */ 152 protected void popStackToClose(Token.EndTag endTag) { 153 // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks 154 String elName = settings.normalizeTag(endTag.tagName); 155 Element firstFound = null; 156 157 final int bottom = stack.size() - 1; 158 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 159 160 for (int pos = stack.size() -1; pos >= upper; pos--) { 161 Element next = stack.get(pos); 162 if (next.nodeName().equals(elName)) { 163 firstFound = next; 164 break; 165 } 166 } 167 if (firstFound == null) 168 return; // not found, skip 169 170 for (int pos = stack.size() -1; pos >= 0; pos--) { 171 Element next = pop(); 172 if (next == firstFound) { 173 break; 174 } 175 } 176 } 177 private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain 178}