001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.SharedConstants; 005import org.jsoup.nodes.Attribute; 006import org.jsoup.nodes.Attributes; 007import org.jsoup.nodes.CDataNode; 008import org.jsoup.nodes.Comment; 009import org.jsoup.nodes.DataNode; 010import org.jsoup.nodes.Document; 011import org.jsoup.nodes.DocumentType; 012import org.jsoup.nodes.Element; 013import org.jsoup.nodes.Entities; 014import org.jsoup.nodes.LeafNode; 015import org.jsoup.nodes.Node; 016import org.jsoup.nodes.TextNode; 017import org.jsoup.nodes.XmlDeclaration; 018import org.jsoup.select.Elements; 019import org.jspecify.annotations.Nullable; 020 021import java.io.Reader; 022import java.io.StringReader; 023import java.util.ArrayDeque; 024import java.util.HashMap; 025import java.util.List; 026import java.util.Map; 027 028import static org.jsoup.parser.Parser.NamespaceXml; 029 030/** 031 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the 032 * document. 033 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p> 034 * 035 * @author Jonathan Hedley 036 */ 037public class XmlTreeBuilder extends TreeBuilder { 038 static final String XmlnsKey = "xmlns"; 039 static final String XmlnsPrefix = "xmlns:"; 040 private final ArrayDeque<HashMap<String, String>> namespacesStack = new ArrayDeque<>(); // stack of namespaces, prefix => urn 041 042 @Override ParseSettings defaultSettings() { 043 return ParseSettings.preserveCase; 044 } 045 046 @Override 047 protected void initialiseParse(Reader input, String baseUri, Parser parser) { 048 super.initialiseParse(input, baseUri, parser); 049 doc.outputSettings() 050 .syntax(Document.OutputSettings.Syntax.xml) 051 .escapeMode(Entities.EscapeMode.xhtml) 052 .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not 053 054 namespacesStack.clear(); 055 HashMap<String, String> ns = new HashMap<>(); 056 ns.put("xml", NamespaceXml); 057 ns.put("", NamespaceXml); 058 namespacesStack.push(ns); 059 } 060 061 @Override 062 void initialiseParseFragment(@Nullable Element context) { 063 super.initialiseParseFragment(context); 064 if (context == null) return; 065 066 // transition to the tag's text state if available 067 TokeniserState textState = context.tag().textState(); 068 if (textState != null) tokeniser.transition(textState); 069 070 // reconstitute the namespace stack by traversing the element and its parents (top down) 071 Elements chain = context.parents(); 072 chain.add(0, context); 073 for (int i = chain.size() - 1; i >= 0; i--) { 074 Element el = chain.get(i); 075 HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek()); 076 namespacesStack.push(namespaces); 077 if (el.attributesSize() > 0) { 078 processNamespaces(el.attributes(), namespaces); 079 } 080 } 081 } 082 083 Document parse(Reader input, String baseUri) { 084 return parse(input, baseUri, new Parser(this)); 085 } 086 087 Document parse(String input, String baseUri) { 088 return parse(new StringReader(input), baseUri, new Parser(this)); 089 } 090 091 @Override List<Node> completeParseFragment() { 092 return doc.childNodes(); 093 } 094 095 @Override 096 XmlTreeBuilder newInstance() { 097 return new XmlTreeBuilder(); 098 } 099 100 @Override public String defaultNamespace() { 101 return NamespaceXml; 102 } 103 104 @Override 105 TagSet defaultTagSet() { 106 return new TagSet(); // an empty tagset 107 } 108 109 @Override 110 protected boolean process(Token token) { 111 currentToken = token; 112 113 // start tag, end tag, doctype, xmldecl, comment, character, eof 114 switch (token.type) { 115 case StartTag: 116 insertElementFor(token.asStartTag()); 117 break; 118 case EndTag: 119 popStackToClose(token.asEndTag()); 120 break; 121 case Comment: 122 insertCommentFor(token.asComment()); 123 break; 124 case Character: 125 insertCharacterFor(token.asCharacter()); 126 break; 127 case Doctype: 128 insertDoctypeFor(token.asDoctype()); 129 break; 130 case XmlDecl: 131 insertXmlDeclarationFor(token.asXmlDecl()); 132 break; 133 case EOF: // could put some normalisation here if desired 134 break; 135 default: 136 Validate.fail("Unexpected token type: " + token.type); 137 } 138 return true; 139 } 140 141 void insertElementFor(Token.StartTag startTag) { 142 // handle namespace for tag 143 HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek()); 144 namespacesStack.push(namespaces); 145 146 Attributes attributes = startTag.attributes; 147 if (attributes != null) { 148 attributes.deduplicate(settings); 149 processNamespaces(attributes, namespaces); 150 applyNamespacesToAttributes(attributes, namespaces); 151 } 152 153 String tagName = startTag.tagName.value(); 154 String ns = resolveNamespace(tagName, namespaces); 155 Tag tag = tagFor(tagName, startTag.normalName, ns, settings); 156 Element el = new Element(tag, null, settings.normalizeAttributes(attributes)); 157 currentElement().appendChild(el); 158 push(el); 159 160 if (startTag.isSelfClosing()) { 161 tag.setSeenSelfClose(); 162 pop(); // push & pop ensures onNodeInserted & onNodeClosed 163 } else if (tag.isEmpty()) { 164 pop(); // custom defined void tag 165 } else { 166 TokeniserState textState = tag.textState(); 167 if (textState != null) tokeniser.transition(textState); 168 } 169 } 170 171 private static void processNamespaces(Attributes attributes, HashMap<String, String> namespaces) { 172 // process attributes for namespaces (xmlns, xmlns:) 173 for (Attribute attr : attributes) { 174 String key = attr.getKey(); 175 String value = attr.getValue(); 176 if (key.equals(XmlnsKey)) { 177 namespaces.put("", value); // new default for this level 178 } else if (key.startsWith(XmlnsPrefix)) { 179 String nsPrefix = key.substring(XmlnsPrefix.length()); 180 namespaces.put(nsPrefix, value); 181 } 182 } 183 } 184 185 private static void applyNamespacesToAttributes(Attributes attributes, HashMap<String, String> namespaces) { 186 // second pass, apply namespace to attributes. Collects them first then adds (as userData is an attribute) 187 Map<String, String> attrPrefix = new HashMap<>(); 188 for (Attribute attr: attributes) { 189 String prefix = attr.prefix(); 190 if (!prefix.isEmpty()) { 191 if (prefix.equals(XmlnsKey)) continue; 192 String ns = namespaces.get(prefix); 193 if (ns != null) attrPrefix.put(SharedConstants.XmlnsAttr + prefix, ns); 194 } 195 } 196 for (Map.Entry<String, String> entry : attrPrefix.entrySet()) 197 attributes.userData(entry.getKey(), entry.getValue()); 198 } 199 200 private static String resolveNamespace(String tagName, HashMap<String, String> namespaces) { 201 String ns = namespaces.get(""); 202 int pos = tagName.indexOf(':'); 203 if (pos > 0) { 204 String prefix = tagName.substring(0, pos); 205 if (namespaces.containsKey(prefix)) 206 ns = namespaces.get(prefix); 207 } 208 return ns; 209 } 210 211 void insertLeafNode(LeafNode node) { 212 currentElement().appendChild(node); 213 onNodeInserted(node); 214 } 215 216 void insertCommentFor(Token.Comment commentToken) { 217 Comment comment = new Comment(commentToken.getData()); 218 insertLeafNode(comment); 219 } 220 221 void insertCharacterFor(Token.Character token) { 222 final String data = token.getData(); 223 LeafNode node; 224 if (token.isCData()) node = new CDataNode(data); 225 else if (currentElement().tag().is(Tag.Data)) node = new DataNode(data); 226 else node = new TextNode(data); 227 insertLeafNode(node); 228 } 229 230 void insertDoctypeFor(Token.Doctype token) { 231 DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier()); 232 doctypeNode.setPubSysKey(token.getPubSysKey()); 233 insertLeafNode(doctypeNode); 234 } 235 236 void insertXmlDeclarationFor(Token.XmlDecl token) { 237 XmlDeclaration decl = new XmlDeclaration(token.name(), token.isDeclaration); 238 if (token.attributes != null) decl.attributes().addAll(token.attributes); 239 insertLeafNode(decl); 240 } 241 242 @Override 243 Element pop() { 244 namespacesStack.pop(); 245 return super.pop(); 246 } 247 248 /** 249 * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not 250 * found, skips. 251 * 252 * @param endTag tag to close 253 */ 254 protected void popStackToClose(Token.EndTag endTag) { 255 // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks 256 String elName = settings.normalizeTag(endTag.name()); 257 Element firstFound = null; 258 259 final int bottom = stack.size() - 1; 260 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 261 262 for (int pos = stack.size() -1; pos >= upper; pos--) { 263 Element next = stack.get(pos); 264 if (next.nodeName().equals(elName)) { 265 firstFound = next; 266 break; 267 } 268 } 269 if (firstFound == null) 270 return; // not found, skip 271 272 for (int pos = stack.size() -1; pos >= 0; pos--) { 273 Element next = pop(); 274 if (next == firstFound) { 275 break; 276 } 277 } 278 } 279 private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain 280}