001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.SharedConstants; 005import org.jsoup.nodes.Attribute; 006import org.jsoup.nodes.Attributes; 007import org.jsoup.nodes.CDataNode; 008import org.jsoup.nodes.Comment; 009import org.jsoup.nodes.DataNode; 010import org.jsoup.nodes.Document; 011import org.jsoup.nodes.DocumentType; 012import org.jsoup.nodes.Element; 013import org.jsoup.nodes.Entities; 014import org.jsoup.nodes.LeafNode; 015import org.jsoup.nodes.Node; 016import org.jsoup.nodes.TextNode; 017import org.jsoup.nodes.XmlDeclaration; 018import org.jsoup.select.Elements; 019import org.jspecify.annotations.Nullable; 020 021import java.io.Reader; 022import java.io.StringReader; 023import java.util.ArrayDeque; 024import java.util.HashMap; 025import java.util.List; 026import java.util.Map; 027 028import static org.jsoup.parser.Parser.NamespaceXml; 029 030/** 031 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the 032 * document. 033 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p> 034 * 035 * @author Jonathan Hedley 036 */ 037public class XmlTreeBuilder extends TreeBuilder { 038 static final String XmlnsKey = "xmlns"; 039 static final String XmlnsPrefix = "xmlns:"; 040 private final ArrayDeque<HashMap<String, String>> namespacesStack = new ArrayDeque<>(); // stack of namespaces, prefix => urn 041 042 @Override ParseSettings defaultSettings() { 043 return ParseSettings.preserveCase; 044 } 045 046 @Override 047 protected void initialiseParse(Reader input, String baseUri, Parser parser) { 048 super.initialiseParse(input, baseUri, parser); 049 doc.outputSettings() 050 .syntax(Document.OutputSettings.Syntax.xml) 051 .escapeMode(Entities.EscapeMode.xhtml) 052 .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not 053 054 namespacesStack.clear(); 055 HashMap<String, String> ns = new HashMap<>(); 056 ns.put("xml", NamespaceXml); 057 ns.put("", NamespaceXml); 058 namespacesStack.push(ns); 059 } 060 061 @Override 062 void initialiseParseFragment(@Nullable Element context) { 063 super.initialiseParseFragment(context); 064 if (context == null) return; 065 066 // transition to the tag's text state if available 067 TokeniserState textState = context.tag().textState(); 068 if (textState != null) tokeniser.transition(textState); 069 070 // reconstitute the namespace stack by traversing the element and its parents (top down) 071 Elements chain = context.parents(); 072 chain.add(0, context); 073 for (int i = chain.size() - 1; i >= 0; i--) { 074 Element el = chain.get(i); 075 HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek()); 076 namespacesStack.push(namespaces); 077 if (el.attributesSize() > 0) { 078 processNamespaces(el.attributes(), namespaces); 079 } 080 } 081 } 082 083 Document parse(Reader input, String baseUri) { 084 return parse(input, baseUri, new Parser(this)); 085 } 086 087 Document parse(String input, String baseUri) { 088 return parse(new StringReader(input), baseUri, new Parser(this)); 089 } 090 091 @Override List<Node> completeParseFragment() { 092 return doc.childNodes(); 093 } 094 095 @Override 096 XmlTreeBuilder newInstance() { 097 return new XmlTreeBuilder(); 098 } 099 100 @Override public String defaultNamespace() { 101 return NamespaceXml; 102 } 103 104 @Override 105 TagSet defaultTagSet() { 106 return new TagSet(); // an empty tagset 107 } 108 109 @Override 110 protected boolean process(Token token) { 111 currentToken = token; 112 113 // start tag, end tag, doctype, xmldecl, comment, character, eof 114 switch (token.type) { 115 case StartTag: 116 insertElementFor(token.asStartTag()); 117 break; 118 case EndTag: 119 popStackToClose(token.asEndTag()); 120 break; 121 case Comment: 122 insertCommentFor(token.asComment()); 123 break; 124 case Character: 125 insertCharacterFor(token.asCharacter()); 126 break; 127 case Doctype: 128 insertDoctypeFor(token.asDoctype()); 129 break; 130 case XmlDecl: 131 insertXmlDeclarationFor(token.asXmlDecl()); 132 break; 133 case EOF: // could put some normalisation here if desired 134 break; 135 default: 136 Validate.fail("Unexpected token type: " + token.type); 137 } 138 return true; 139 } 140 141 void insertElementFor(Token.StartTag startTag) { 142 // handle namespace for tag 143 HashMap<String, String> namespaces = new HashMap<>(namespacesStack.peek()); 144 namespacesStack.push(namespaces); 145 146 Attributes attributes = startTag.attributes; 147 if (attributes != null) { 148 settings.normalizeAttributes(attributes); 149 attributes.deduplicate(settings); 150 processNamespaces(attributes, namespaces); 151 applyNamespacesToAttributes(attributes, namespaces); 152 } 153 154 String tagName = startTag.tagName.value(); 155 String ns = resolveNamespace(tagName, namespaces); 156 Tag tag = tagFor(tagName, startTag.normalName, ns, settings); 157 Element el = new Element(tag, null, attributes); 158 currentElement().appendChild(el); 159 push(el); 160 161 if (startTag.isSelfClosing()) { 162 tag.setSeenSelfClose(); 163 pop(); // push & pop ensures onNodeInserted & onNodeClosed 164 } else if (tag.isEmpty()) { 165 pop(); // custom defined void tag 166 } else { 167 TokeniserState textState = tag.textState(); 168 if (textState != null) tokeniser.transition(textState); 169 } 170 } 171 172 private static void processNamespaces(Attributes attributes, HashMap<String, String> namespaces) { 173 // process attributes for namespaces (xmlns, xmlns:) 174 for (Attribute attr : attributes) { 175 String key = attr.getKey(); 176 String value = attr.getValue(); 177 if (key.equals(XmlnsKey)) { 178 namespaces.put("", value); // new default for this level 179 } else if (key.startsWith(XmlnsPrefix)) { 180 String nsPrefix = key.substring(XmlnsPrefix.length()); 181 namespaces.put(nsPrefix, value); 182 } 183 } 184 } 185 186 private static void applyNamespacesToAttributes(Attributes attributes, HashMap<String, String> namespaces) { 187 // second pass, apply namespace to attributes. Collects them first then adds (as userData is an attribute) 188 Map<String, String> attrPrefix = new HashMap<>(); 189 for (Attribute attr: attributes) { 190 String prefix = attr.prefix(); 191 if (!prefix.isEmpty()) { 192 if (prefix.equals(XmlnsKey)) continue; 193 String ns = namespaces.get(prefix); 194 if (ns != null) attrPrefix.put(SharedConstants.XmlnsAttr + prefix, ns); 195 } 196 } 197 for (Map.Entry<String, String> entry : attrPrefix.entrySet()) 198 attributes.userData(entry.getKey(), entry.getValue()); 199 } 200 201 private static String resolveNamespace(String tagName, HashMap<String, String> namespaces) { 202 String ns = namespaces.get(""); 203 int pos = tagName.indexOf(':'); 204 if (pos > 0) { 205 String prefix = tagName.substring(0, pos); 206 if (namespaces.containsKey(prefix)) 207 ns = namespaces.get(prefix); 208 } 209 return ns; 210 } 211 212 void insertLeafNode(LeafNode node) { 213 currentElement().appendChild(node); 214 onNodeInserted(node); 215 } 216 217 void insertCommentFor(Token.Comment commentToken) { 218 Comment comment = new Comment(commentToken.getData()); 219 insertLeafNode(comment); 220 } 221 222 void insertCharacterFor(Token.Character token) { 223 final String data = token.getData(); 224 LeafNode node; 225 if (token.isCData()) node = new CDataNode(data); 226 else if (currentElement().tag().is(Tag.Data)) node = new DataNode(data); 227 else node = new TextNode(data); 228 insertLeafNode(node); 229 } 230 231 void insertDoctypeFor(Token.Doctype token) { 232 DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier()); 233 doctypeNode.setPubSysKey(token.getPubSysKey()); 234 insertLeafNode(doctypeNode); 235 } 236 237 void insertXmlDeclarationFor(Token.XmlDecl token) { 238 XmlDeclaration decl = new XmlDeclaration(token.name(), token.isDeclaration); 239 if (token.attributes != null) decl.attributes().addAll(token.attributes); 240 insertLeafNode(decl); 241 } 242 243 @Override 244 Element pop() { 245 namespacesStack.pop(); 246 return super.pop(); 247 } 248 249 /** 250 * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not 251 * found, skips. 252 * 253 * @param endTag tag to close 254 */ 255 protected void popStackToClose(Token.EndTag endTag) { 256 // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks 257 String elName = settings.normalizeTag(endTag.name()); 258 Element firstFound = null; 259 260 final int bottom = stack.size() - 1; 261 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 262 263 for (int pos = stack.size() -1; pos >= upper; pos--) { 264 Element next = stack.get(pos); 265 if (next.nodeName().equals(elName)) { 266 firstFound = next; 267 break; 268 } 269 } 270 if (firstFound == null) 271 return; // not found, skip 272 273 for (int pos = stack.size() -1; pos >= 0; pos--) { 274 Element next = pop(); 275 if (next == firstFound) { 276 break; 277 } 278 } 279 } 280 private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain 281}