001package org.jsoup.helper; 002 003import org.jsoup.internal.Normalizer; 004import org.jsoup.internal.StringUtil; 005import org.jsoup.nodes.Attribute; 006import org.jsoup.nodes.Attributes; 007import org.jsoup.parser.HtmlTreeBuilder; 008import org.jsoup.parser.Parser; 009import org.jsoup.select.NodeTraversor; 010import org.jsoup.select.NodeVisitor; 011import org.jsoup.select.Selector; 012import org.w3c.dom.Comment; 013import org.w3c.dom.DOMException; 014import org.w3c.dom.DOMImplementation; 015import org.w3c.dom.Document; 016import org.w3c.dom.DocumentType; 017import org.w3c.dom.Element; 018import org.w3c.dom.Node; 019import org.w3c.dom.NodeList; 020import org.w3c.dom.Text; 021import org.jspecify.annotations.Nullable; 022 023import javax.xml.parsers.DocumentBuilder; 024import javax.xml.parsers.DocumentBuilderFactory; 025import javax.xml.parsers.ParserConfigurationException; 026import javax.xml.transform.OutputKeys; 027import javax.xml.transform.Transformer; 028import javax.xml.transform.TransformerException; 029import javax.xml.transform.TransformerFactory; 030import javax.xml.transform.dom.DOMSource; 031import javax.xml.transform.stream.StreamResult; 032import javax.xml.xpath.XPathConstants; 033import javax.xml.xpath.XPathExpression; 034import javax.xml.xpath.XPathExpressionException; 035import javax.xml.xpath.XPathFactory; 036import javax.xml.xpath.XPathFactoryConfigurationException; 037import java.io.StringWriter; 038import java.util.ArrayDeque; 039import java.util.ArrayList; 040import java.util.HashMap; 041import java.util.List; 042import java.util.Map; 043import java.util.Properties; 044 045import static javax.xml.transform.OutputKeys.METHOD; 046import static org.jsoup.nodes.Document.OutputSettings.Syntax; 047 048/** 049 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document}, 050 * for integration with toolsets that use the W3C DOM. 051 */ 052public class W3CDom { 053 /** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */ 054 public static final String SourceProperty = "jsoupSource"; 055 private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc 056 private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context 057 058 /** 059 To get support for XPath versions > 1, set this property to the classname of an alternate XPathFactory 060 implementation. (For e.g. {@code net.sf.saxon.xpath.XPathFactoryImpl}). 061 */ 062 public static final String XPathFactoryProperty = "javax.xml.xpath.XPathFactory:jsoup"; 063 064 protected DocumentBuilderFactory factory; 065 private boolean namespaceAware = true; // false when using selectXpath, for user's query convenience 066 067 public W3CDom() { 068 factory = DocumentBuilderFactory.newInstance(); 069 factory.setNamespaceAware(true); 070 } 071 072 /** 073 Returns if this W3C DOM is namespace aware. By default, this will be {@code true}, but is disabled for simplicity 074 when using XPath selectors in {@link org.jsoup.nodes.Element#selectXpath(String)}. 075 @return the current namespace aware setting. 076 */ 077 public boolean namespaceAware() { 078 return namespaceAware; 079 } 080 081 /** 082 Update the namespace aware setting. This impacts the factory that is used to create W3C nodes from jsoup nodes. 083 <p>For HTML documents, controls if the document will be in the default {@code http://www.w3.org/1999/xhtml} 084 namespace if otherwise unset.</p>. 085 @param namespaceAware the updated setting 086 @return this W3CDom, for chaining. 087 */ 088 public W3CDom namespaceAware(boolean namespaceAware) { 089 this.namespaceAware = namespaceAware; 090 factory.setNamespaceAware(namespaceAware); 091 return this; 092 } 093 094 /** 095 * Converts a jsoup DOM to a W3C DOM. 096 * 097 * @param in jsoup Document 098 * @return W3C Document 099 */ 100 public static Document convert(org.jsoup.nodes.Document in) { 101 return (new W3CDom().fromJsoup(in)); 102 } 103 104 /** 105 * Serialize a W3C document to a String. Provide Properties to define output settings including if HTML or XML. If 106 * you don't provide the properties ({@code null}), the output will be auto-detected based on the content of the 107 * document. 108 * 109 * @param doc Document 110 * @param properties (optional/nullable) the output properties to use. See {@link 111 * Transformer#setOutputProperties(Properties)} and {@link OutputKeys} 112 * @return Document as string 113 * @see #OutputHtml 114 * @see #OutputXml 115 * @see OutputKeys#ENCODING 116 * @see OutputKeys#OMIT_XML_DECLARATION 117 * @see OutputKeys#STANDALONE 118 * @see OutputKeys#STANDALONE 119 * @see OutputKeys#DOCTYPE_PUBLIC 120 * @see OutputKeys#CDATA_SECTION_ELEMENTS 121 * @see OutputKeys#INDENT 122 * @see OutputKeys#MEDIA_TYPE 123 */ 124 public static String asString(Document doc, @Nullable Map<String, String> properties) { 125 try { 126 DOMSource domSource = new DOMSource(doc); 127 StringWriter writer = new StringWriter(); 128 StreamResult result = new StreamResult(writer); 129 TransformerFactory tf = TransformerFactory.newInstance(); 130 Transformer transformer = tf.newTransformer(); 131 if (properties != null) 132 transformer.setOutputProperties(propertiesFromMap(properties)); 133 134 if (doc.getDoctype() != null) { 135 DocumentType doctype = doc.getDoctype(); 136 if (!StringUtil.isBlank(doctype.getPublicId())) 137 transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, doctype.getPublicId()); 138 if (!StringUtil.isBlank(doctype.getSystemId())) 139 transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, doctype.getSystemId()); 140 // handle <!doctype html> for legacy dom. TODO: nicer if <!doctype html> 141 else if (doctype.getName().equalsIgnoreCase("html") 142 && StringUtil.isBlank(doctype.getPublicId()) 143 && StringUtil.isBlank(doctype.getSystemId())) 144 transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "about:legacy-compat"); 145 } 146 147 transformer.transform(domSource, result); 148 return writer.toString(); 149 150 } catch (TransformerException e) { 151 throw new IllegalStateException(e); 152 } 153 } 154 155 static Properties propertiesFromMap(Map<String, String> map) { 156 Properties props = new Properties(); 157 props.putAll(map); 158 return props; 159 } 160 161 /** Canned default for HTML output. */ 162 public static HashMap<String, String> OutputHtml() { 163 return methodMap("html"); 164 } 165 166 /** Canned default for XML output. */ 167 public static HashMap<String, String> OutputXml() { 168 return methodMap("xml"); 169 } 170 171 private static HashMap<String, String> methodMap(String method) { 172 HashMap<String, String> map = new HashMap<>(); 173 map.put(METHOD, method); 174 return map; 175 } 176 177 /** 178 * Convert a jsoup Document to a W3C Document. The created nodes will link back to the original 179 * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not 180 * flow to the other). 181 * 182 * @param in jsoup doc 183 * @return a W3C DOM Document representing the jsoup Document or Element contents. 184 */ 185 public Document fromJsoup(org.jsoup.nodes.Document in) { 186 // just method API backcompat 187 return fromJsoup((org.jsoup.nodes.Element) in); 188 } 189 190 /** 191 * Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original 192 * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not 193 * flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is 194 * converted. (If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.) 195 * 196 * @param in jsoup element or doc 197 * @return a W3C DOM Document representing the jsoup Document or Element contents. 198 * @see #sourceNodes(NodeList, Class) 199 * @see #contextNode(Document) 200 */ 201 public Document fromJsoup(org.jsoup.nodes.Element in) { 202 Validate.notNull(in); 203 DocumentBuilder builder; 204 try { 205 builder = factory.newDocumentBuilder(); 206 DOMImplementation impl = builder.getDOMImplementation(); 207 Document out = builder.newDocument(); 208 org.jsoup.nodes.Document inDoc = in.ownerDocument(); 209 org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null; 210 if (doctype != null) { 211 try { 212 org.w3c.dom.DocumentType documentType = impl.createDocumentType(doctype.name(), doctype.publicId(), doctype.systemId()); 213 out.appendChild(documentType); 214 } catch (DOMException ignored) { 215 // invalid / empty doctype dropped 216 } 217 } 218 out.setXmlStandalone(true); 219 // if in is Document, use the root element, not the wrapping document, as the context: 220 org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.firstElementChild() : in; 221 out.setUserData(ContextProperty, context, null); 222 convert(inDoc != null ? inDoc : in, out); 223 return out; 224 } catch (ParserConfigurationException e) { 225 throw new IllegalStateException(e); 226 } 227 } 228 229 /** 230 * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output 231 * document before converting. 232 * 233 * @param in jsoup doc 234 * @param out w3c doc 235 * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element) 236 */ 237 public void convert(org.jsoup.nodes.Document in, Document out) { 238 // just provides method API backcompat 239 convert((org.jsoup.nodes.Element) in, out); 240 } 241 242 /** 243 * Converts a jsoup element into the provided W3C Document. If required, you can set options on the output 244 * document before converting. 245 * 246 * @param in jsoup element 247 * @param out w3c doc 248 * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element) 249 */ 250 public void convert(org.jsoup.nodes.Element in, Document out) { 251 W3CBuilder builder = new W3CBuilder(out); 252 builder.namespaceAware = namespaceAware; 253 org.jsoup.nodes.Document inDoc = in.ownerDocument(); 254 if (inDoc != null) { 255 if (!StringUtil.isBlank(inDoc.location())) { 256 out.setDocumentURI(inDoc.location()); 257 } 258 builder.syntax = inDoc.outputSettings().syntax(); 259 } 260 org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.firstElementChild() : in; // skip the #root node if a Document 261 NodeTraversor.traverse(builder, rootEl); 262 } 263 264 /** 265 Evaluate an XPath query against the supplied document, and return the results. 266 @param xpath an XPath query 267 @param doc the document to evaluate against 268 @return the matches nodes 269 */ 270 public NodeList selectXpath(String xpath, Document doc) { 271 return selectXpath(xpath, (Node) doc); 272 } 273 274 /** 275 Evaluate an XPath query against the supplied context node, and return the results. 276 @param xpath an XPath query 277 @param contextNode the context node to evaluate against 278 @return the matches nodes 279 */ 280 public NodeList selectXpath(String xpath, Node contextNode) { 281 Validate.notEmptyParam(xpath, "xpath"); 282 Validate.notNullParam(contextNode, "contextNode"); 283 284 NodeList nodeList; 285 try { 286 // if there is a configured XPath factory, use that instead of the Java base impl: 287 String property = System.getProperty(XPathFactoryProperty); 288 final XPathFactory xPathFactory = property != null ? 289 XPathFactory.newInstance("jsoup") : 290 XPathFactory.newInstance(); 291 292 XPathExpression expression = xPathFactory.newXPath().compile(xpath); 293 nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s 294 Validate.notNull(nodeList); 295 } catch (XPathExpressionException | XPathFactoryConfigurationException e) { 296 throw new Selector.SelectorParseException( 297 e, "Could not evaluate XPath query [%s]: %s", xpath, e.getMessage()); 298 } 299 return nodeList; 300 } 301 302 /** 303 Retrieves the original jsoup DOM nodes from a nodelist created by this convertor. 304 @param nodeList the W3C nodes to get the original jsoup nodes from 305 @param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc) 306 @param <T> node type 307 @return a list of the original nodes 308 */ 309 public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, Class<T> nodeType) { 310 Validate.notNull(nodeList); 311 Validate.notNull(nodeType); 312 List<T> nodes = new ArrayList<>(nodeList.getLength()); 313 314 for (int i = 0; i < nodeList.getLength(); i++) { 315 org.w3c.dom.Node node = nodeList.item(i); 316 Object source = node.getUserData(W3CDom.SourceProperty); 317 if (nodeType.isInstance(source)) 318 nodes.add(nodeType.cast(source)); 319 } 320 321 return nodes; 322 } 323 324 /** 325 For a Document created by {@link #fromJsoup(org.jsoup.nodes.Element)}, retrieves the W3C context node. 326 @param wDoc Document created by this class 327 @return the corresponding W3C Node to the jsoup Element that was used as the creating context. 328 */ 329 public Node contextNode(Document wDoc) { 330 return (Node) wDoc.getUserData(ContextNodeProperty); 331 } 332 333 /** 334 * Serialize a W3C document to a String. The output format will be XML or HTML depending on the content of the doc. 335 * 336 * @param doc Document 337 * @return Document as string 338 * @see W3CDom#asString(Document, Map) 339 */ 340 public String asString(Document doc) { 341 return asString(doc, null); 342 } 343 344 /** 345 * Implements the conversion by walking the input. 346 */ 347 protected static class W3CBuilder implements NodeVisitor { 348 // TODO: move the namespace handling stuff into XmlTreeBuilder / HtmlTreeBuilder, now that Tags have namespaces 349 private static final String xmlnsKey = "xmlns"; 350 private static final String xmlnsPrefix = "xmlns:"; 351 352 private final Document doc; 353 private boolean namespaceAware = true; 354 private final ArrayDeque<HashMap<String, String>> namespacesStack = new ArrayDeque<>(); // stack of namespaces, prefix => urn 355 private Node dest; 356 private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available. 357 /*@Nullable*/ private final org.jsoup.nodes.Element contextElement; // todo - unsure why this can't be marked nullable? 358 359 public W3CBuilder(Document doc) { 360 this.doc = doc; 361 namespacesStack.push(new HashMap<>()); 362 dest = doc; 363 contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty); // Track the context jsoup Element, so we can save the corresponding w3c element 364 if (contextElement != null) { 365 final org.jsoup.nodes.Document inDoc = contextElement.ownerDocument(); 366 if ( namespaceAware && inDoc != null && inDoc.parser().getTreeBuilder() instanceof HtmlTreeBuilder ) { 367 // as per the WHATWG HTML5 spec ยง 2.1.3, elements are in the HTML namespace by default 368 namespacesStack.peek().put("", Parser.NamespaceHtml); 369 } 370 } 371 } 372 373 @Override 374 public void head(org.jsoup.nodes.Node source, int depth) { 375 namespacesStack.push(new HashMap<>(namespacesStack.peek())); // inherit from above on the stack 376 if (source instanceof org.jsoup.nodes.Element) { 377 org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source; 378 379 String prefix = updateNamespaces(sourceEl); 380 String namespace = namespaceAware ? namespacesStack.peek().get(prefix) : null; 381 String tagName = Normalizer.xmlSafeTagName(sourceEl.tagName()); 382 try { 383 // use an empty namespace if none is present but the tag name has a prefix 384 String imputedNamespace = namespace == null && tagName.contains(":") ? "" : namespace; 385 Element el = doc.createElementNS(imputedNamespace, tagName); 386 copyAttributes(sourceEl, el); 387 append(el, sourceEl); 388 if (sourceEl == contextElement) 389 doc.setUserData(ContextNodeProperty, el, null); 390 dest = el; // descend 391 } catch (DOMException e) { 392 // If the Normalize didn't get it XML / W3C safe, inserts as plain text 393 append(doc.createTextNode("<" + tagName + ">"), sourceEl); 394 } 395 } else if (source instanceof org.jsoup.nodes.TextNode) { 396 org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source; 397 Text text = doc.createTextNode(sourceText.getWholeText()); 398 append(text, sourceText); 399 } else if (source instanceof org.jsoup.nodes.Comment) { 400 org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source; 401 Comment comment = doc.createComment(sourceComment.getData()); 402 append(comment, sourceComment); 403 } else if (source instanceof org.jsoup.nodes.DataNode) { 404 org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source; 405 Text node = doc.createTextNode(sourceData.getWholeData()); 406 append(node, sourceData); 407 } else { 408 // unhandled. note that doctype is not handled here - rather it is used in the initial doc creation 409 } 410 } 411 412 private void append(Node append, org.jsoup.nodes.Node source) { 413 append.setUserData(SourceProperty, source, null); 414 dest.appendChild(append); 415 } 416 417 @Override 418 public void tail(org.jsoup.nodes.Node source, int depth) { 419 if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) { 420 dest = dest.getParentNode(); // undescend 421 } 422 namespacesStack.pop(); 423 } 424 425 private void copyAttributes(org.jsoup.nodes.Node source, Element el) { 426 for (Attribute attribute : source.attributes()) { 427 // the W3C DOM has a different allowed set of characters than HTML5 (that Attribute.getValidKey return, partic does not allow ';'). So if we except when using HTML, go to more restricted XML 428 try { 429 String key = Attribute.getValidKey(attribute.getKey(), syntax); 430 if (key != null) // null if couldn't be coerced to validity 431 el.setAttribute(key, attribute.getValue()); 432 } catch (DOMException e) { 433 if (syntax != Syntax.xml) { 434 String key = Attribute.getValidKey(attribute.getKey(), Syntax.xml); 435 if (key != null) 436 el.setAttribute(key, attribute.getValue()); // otherwise, will skip attribute 437 } 438 } 439 } 440 } 441 442 /** 443 * Finds any namespaces defined in this element. Returns any tag prefix. 444 */ 445 private String updateNamespaces(org.jsoup.nodes.Element el) { 446 // scan the element for namespace declarations 447 // like: xmlns="blah" or xmlns:prefix="blah" 448 Attributes attributes = el.attributes(); 449 for (Attribute attr : attributes) { 450 String key = attr.getKey(); 451 String prefix; 452 if (key.equals(xmlnsKey)) { 453 prefix = ""; 454 } else if (key.startsWith(xmlnsPrefix)) { 455 prefix = key.substring(xmlnsPrefix.length()); 456 } else { 457 continue; 458 } 459 namespacesStack.peek().put(prefix, attr.getValue()); 460 } 461 462 // get the element prefix if any 463 int pos = el.tagName().indexOf(':'); 464 return pos > 0 ? el.tagName().substring(0, pos) : ""; 465 } 466 467 } 468}