001package org.jsoup.helper; 002 003import org.jsoup.internal.Normalizer; 004import org.jsoup.internal.StringUtil; 005import org.jsoup.nodes.Attribute; 006import org.jsoup.nodes.Attributes; 007import org.jsoup.parser.HtmlTreeBuilder; 008import org.jsoup.parser.XmlTreeBuilder; 009import org.jsoup.select.NodeTraversor; 010import org.jsoup.select.NodeVisitor; 011import org.jsoup.select.Selector; 012import org.w3c.dom.Comment; 013import org.w3c.dom.DOMException; 014import org.w3c.dom.DOMImplementation; 015import org.w3c.dom.Document; 016import org.w3c.dom.DocumentType; 017import org.w3c.dom.Element; 018import org.w3c.dom.Node; 019import org.w3c.dom.NodeList; 020import org.w3c.dom.Text; 021import org.jspecify.annotations.Nullable; 022 023import javax.xml.parsers.DocumentBuilder; 024import javax.xml.parsers.DocumentBuilderFactory; 025import javax.xml.parsers.ParserConfigurationException; 026import javax.xml.transform.OutputKeys; 027import javax.xml.transform.Transformer; 028import javax.xml.transform.TransformerException; 029import javax.xml.transform.TransformerFactory; 030import javax.xml.transform.dom.DOMSource; 031import javax.xml.transform.stream.StreamResult; 032import javax.xml.xpath.XPathConstants; 033import javax.xml.xpath.XPathExpression; 034import javax.xml.xpath.XPathExpressionException; 035import javax.xml.xpath.XPathFactory; 036import javax.xml.xpath.XPathFactoryConfigurationException; 037import java.io.StringWriter; 038import java.util.ArrayList; 039import java.util.HashMap; 040import java.util.List; 041import java.util.Map; 042import java.util.Properties; 043 044import static javax.xml.transform.OutputKeys.METHOD; 045import static org.jsoup.nodes.Document.OutputSettings.Syntax; 046 047/** 048 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document}, 049 * for integration with toolsets that use the W3C DOM. 050 */ 051public class W3CDom { 052 /** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */ 053 public static final String SourceProperty = "jsoupSource"; 054 private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc 055 private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context 056 057 /** 058 To get support for XPath versions > 1, set this property to the classname of an alternate XPathFactory 059 implementation. (For e.g. {@code net.sf.saxon.xpath.XPathFactoryImpl}). 060 */ 061 public static final String XPathFactoryProperty = "javax.xml.xpath.XPathFactory:jsoup"; 062 063 protected DocumentBuilderFactory factory; 064 private boolean namespaceAware = true; // false when using selectXpath, for user's query convenience 065 066 public W3CDom() { 067 factory = DocumentBuilderFactory.newInstance(); 068 factory.setNamespaceAware(true); 069 } 070 071 /** 072 Returns if this W3C DOM is namespace aware. By default, this will be {@code true}, but is disabled for simplicity 073 when using XPath selectors in {@link org.jsoup.nodes.Element#selectXpath(String)}. 074 @return the current namespace aware setting. 075 */ 076 public boolean namespaceAware() { 077 return namespaceAware; 078 } 079 080 /** 081 Update the namespace aware setting. This impacts the factory that is used to create W3C nodes from jsoup nodes. 082 <p>For HTML documents, controls if the document will be in the default {@code http://www.w3.org/1999/xhtml} 083 namespace if otherwise unset.</p>. 084 @param namespaceAware the updated setting 085 @return this W3CDom, for chaining. 086 */ 087 public W3CDom namespaceAware(boolean namespaceAware) { 088 this.namespaceAware = namespaceAware; 089 factory.setNamespaceAware(namespaceAware); 090 return this; 091 } 092 093 /** 094 * Converts a jsoup DOM to a W3C DOM. 095 * 096 * @param in jsoup Document 097 * @return W3C Document 098 */ 099 public static Document convert(org.jsoup.nodes.Document in) { 100 return (new W3CDom().fromJsoup(in)); 101 } 102 103 /** 104 * Serialize a W3C document to a String. Provide Properties to define output settings including if HTML or XML. If 105 * you don't provide the properties ({@code null}), the output will be auto-detected based on the content of the 106 * document. 107 * 108 * @param doc Document 109 * @param properties (optional/nullable) the output properties to use. See {@link 110 * Transformer#setOutputProperties(Properties)} and {@link OutputKeys} 111 * @return Document as string 112 * @see #OutputHtml 113 * @see #OutputXml 114 * @see OutputKeys#ENCODING 115 * @see OutputKeys#OMIT_XML_DECLARATION 116 * @see OutputKeys#STANDALONE 117 * @see OutputKeys#DOCTYPE_PUBLIC 118 * @see OutputKeys#CDATA_SECTION_ELEMENTS 119 * @see OutputKeys#INDENT 120 * @see OutputKeys#MEDIA_TYPE 121 */ 122 public static String asString(Document doc, @Nullable Map<String, String> properties) { 123 try { 124 DOMSource domSource = new DOMSource(doc); 125 StringWriter writer = new StringWriter(); 126 StreamResult result = new StreamResult(writer); 127 TransformerFactory tf = TransformerFactory.newInstance(); 128 Transformer transformer = tf.newTransformer(); 129 if (properties != null) 130 transformer.setOutputProperties(propertiesFromMap(properties)); 131 132 if (doc.getDoctype() != null) { 133 DocumentType doctype = doc.getDoctype(); 134 if (!StringUtil.isBlank(doctype.getPublicId())) 135 transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, doctype.getPublicId()); 136 if (!StringUtil.isBlank(doctype.getSystemId())) 137 transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, doctype.getSystemId()); 138 // handle <!doctype html> for legacy dom. TODO: nicer if <!doctype html> 139 else if (doctype.getName().equalsIgnoreCase("html") 140 && StringUtil.isBlank(doctype.getPublicId()) 141 && StringUtil.isBlank(doctype.getSystemId())) 142 transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "about:legacy-compat"); 143 } 144 145 transformer.transform(domSource, result); 146 return writer.toString(); 147 148 } catch (TransformerException e) { 149 throw new IllegalStateException(e); 150 } 151 } 152 153 static Properties propertiesFromMap(Map<String, String> map) { 154 Properties props = new Properties(); 155 props.putAll(map); 156 return props; 157 } 158 159 /** Canned default for HTML output. */ 160 public static HashMap<String, String> OutputHtml() { 161 return methodMap("html"); 162 } 163 164 /** Canned default for XML output. */ 165 public static HashMap<String, String> OutputXml() { 166 return methodMap("xml"); 167 } 168 169 private static HashMap<String, String> methodMap(String method) { 170 HashMap<String, String> map = new HashMap<>(); 171 map.put(METHOD, method); 172 return map; 173 } 174 175 /** 176 * Convert a jsoup Document to a W3C Document. The created nodes will link back to the original 177 * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not 178 * flow to the other). 179 * 180 * @param in jsoup doc 181 * @return a W3C DOM Document representing the jsoup Document or Element contents. 182 */ 183 public Document fromJsoup(org.jsoup.nodes.Document in) { 184 // just method API backcompat 185 return fromJsoup((org.jsoup.nodes.Element) in); 186 } 187 188 /** 189 * Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original 190 * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not 191 * flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is 192 * converted. (If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.) 193 * 194 * @param in jsoup element or doc 195 * @return a W3C DOM Document representing the jsoup Document or Element contents. 196 * @see #sourceNodes(NodeList, Class) 197 * @see #contextNode(Document) 198 */ 199 public Document fromJsoup(org.jsoup.nodes.Element in) { 200 Validate.notNull(in); 201 DocumentBuilder builder; 202 try { 203 builder = factory.newDocumentBuilder(); 204 DOMImplementation impl = builder.getDOMImplementation(); 205 Document out = builder.newDocument(); 206 org.jsoup.nodes.Document inDoc = in.ownerDocument(); 207 org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null; 208 if (doctype != null) { 209 try { 210 org.w3c.dom.DocumentType documentType = impl.createDocumentType(doctype.name(), doctype.publicId(), doctype.systemId()); 211 out.appendChild(documentType); 212 } catch (DOMException ignored) { 213 // invalid / empty doctype dropped 214 } 215 } 216 out.setXmlStandalone(true); 217 // if in is Document, use the root element, not the wrapping document, as the context: 218 org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.firstElementChild() : in; 219 out.setUserData(ContextProperty, context, null); 220 convert(inDoc != null ? inDoc : in, out); 221 return out; 222 } catch (ParserConfigurationException e) { 223 throw new IllegalStateException(e); 224 } 225 } 226 227 /** 228 * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output 229 * document before converting. 230 * 231 * @param in jsoup doc 232 * @param out w3c doc 233 * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element) 234 */ 235 public void convert(org.jsoup.nodes.Document in, Document out) { 236 // just provides method API backcompat 237 convert((org.jsoup.nodes.Element) in, out); 238 } 239 240 /** 241 * Converts a jsoup element into the provided W3C Document. If required, you can set options on the output 242 * document before converting. 243 * 244 * @param in jsoup element 245 * @param out w3c doc 246 * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element) 247 */ 248 public void convert(org.jsoup.nodes.Element in, Document out) { 249 W3CBuilder builder = new W3CBuilder(out); 250 builder.namespaceAware = namespaceAware; 251 org.jsoup.nodes.Document inDoc = in.ownerDocument(); 252 if (inDoc != null) { 253 if (!StringUtil.isBlank(inDoc.location())) { 254 out.setDocumentURI(inDoc.location()); 255 } 256 builder.syntax = inDoc.outputSettings().syntax(); 257 } 258 org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.firstElementChild() : in; // skip the #root node if a Document 259 NodeTraversor.traverse(builder, rootEl); 260 } 261 262 /** 263 Evaluate an XPath query against the supplied document, and return the results. 264 @param xpath an XPath query 265 @param doc the document to evaluate against 266 @return the matches nodes 267 */ 268 public NodeList selectXpath(String xpath, Document doc) { 269 return selectXpath(xpath, (Node) doc); 270 } 271 272 /** 273 Evaluate an XPath query against the supplied context node, and return the results. 274 @param xpath an XPath query 275 @param contextNode the context node to evaluate against 276 @return the matches nodes 277 */ 278 public NodeList selectXpath(String xpath, Node contextNode) { 279 Validate.notEmptyParam(xpath, "xpath"); 280 Validate.notNullParam(contextNode, "contextNode"); 281 282 NodeList nodeList; 283 try { 284 // if there is a configured XPath factory, use that instead of the Java base impl: 285 String property = System.getProperty(XPathFactoryProperty); 286 final XPathFactory xPathFactory = property != null ? 287 XPathFactory.newInstance("jsoup") : 288 XPathFactory.newInstance(); 289 290 XPathExpression expression = xPathFactory.newXPath().compile(xpath); 291 nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s 292 Validate.notNull(nodeList); 293 } catch (XPathExpressionException | XPathFactoryConfigurationException e) { 294 throw new Selector.SelectorParseException( 295 e, "Could not evaluate XPath query [%s]: %s", xpath, e.getMessage()); 296 } 297 return nodeList; 298 } 299 300 /** 301 Retrieves the original jsoup DOM nodes from a nodelist created by this convertor. 302 @param nodeList the W3C nodes to get the original jsoup nodes from 303 @param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc) 304 @param <T> node type 305 @return a list of the original nodes 306 */ 307 public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, Class<T> nodeType) { 308 Validate.notNull(nodeList); 309 Validate.notNull(nodeType); 310 List<T> nodes = new ArrayList<>(nodeList.getLength()); 311 312 for (int i = 0; i < nodeList.getLength(); i++) { 313 org.w3c.dom.Node node = nodeList.item(i); 314 Object source = node.getUserData(W3CDom.SourceProperty); 315 if (nodeType.isInstance(source)) 316 nodes.add(nodeType.cast(source)); 317 } 318 319 return nodes; 320 } 321 322 /** 323 For a Document created by {@link #fromJsoup(org.jsoup.nodes.Element)}, retrieves the W3C context node. 324 @param wDoc Document created by this class 325 @return the corresponding W3C Node to the jsoup Element that was used as the creating context. 326 */ 327 public Node contextNode(Document wDoc) { 328 return (Node) wDoc.getUserData(ContextNodeProperty); 329 } 330 331 /** 332 * Serialize a W3C document that was created by {@link #fromJsoup(org.jsoup.nodes.Element)} to a String. 333 * The output format will be XML or HTML depending on the content of the doc. 334 * 335 * @param doc Document 336 * @return Document as string 337 * @see W3CDom#asString(Document, Map) 338 */ 339 public String asString(Document doc) { 340 return asString(doc, null); 341 } 342 343 /** 344 * Implements the conversion by walking the input. 345 */ 346 protected static class W3CBuilder implements NodeVisitor { 347 private final Document doc; 348 private boolean namespaceAware = true; 349 private Node dest; 350 private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available. 351 /*@Nullable*/ private final org.jsoup.nodes.Element contextElement; // todo - unsure why this can't be marked nullable? 352 353 public W3CBuilder(Document doc) { 354 this.doc = doc; 355 dest = doc; 356 contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty); // Track the context jsoup Element, so we can save the corresponding w3c element 357 } 358 359 @Override 360 public void head(org.jsoup.nodes.Node source, int depth) { 361 if (source instanceof org.jsoup.nodes.Element) { 362 org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source; 363 String namespace = namespaceAware ? sourceEl.tag().namespace() : null; 364 String tagName = Normalizer.xmlSafeTagName(sourceEl.tagName()); 365 try { 366 // use an empty namespace if none is present but the tag name has a prefix 367 String imputedNamespace = namespace == null && tagName.contains(":") ? "" : namespace; 368 Element el = doc.createElementNS(imputedNamespace, tagName); 369 copyAttributes(sourceEl, el); 370 append(el, sourceEl); 371 if (sourceEl == contextElement) 372 doc.setUserData(ContextNodeProperty, el, null); 373 dest = el; // descend 374 } catch (DOMException e) { 375 // If the Normalize didn't get it XML / W3C safe, inserts as plain text 376 append(doc.createTextNode("<" + tagName + ">"), sourceEl); 377 } 378 } else if (source instanceof org.jsoup.nodes.TextNode) { 379 org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source; 380 Text text = doc.createTextNode(sourceText.getWholeText()); 381 append(text, sourceText); 382 } else if (source instanceof org.jsoup.nodes.Comment) { 383 org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source; 384 Comment comment = doc.createComment(sourceComment.getData()); 385 append(comment, sourceComment); 386 } else if (source instanceof org.jsoup.nodes.DataNode) { 387 org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source; 388 Text node = doc.createTextNode(sourceData.getWholeData()); 389 append(node, sourceData); 390 } else { 391 // unhandled. note that doctype is not handled here - rather it is used in the initial doc creation 392 } 393 } 394 395 private void append(Node append, org.jsoup.nodes.Node source) { 396 append.setUserData(SourceProperty, source, null); 397 dest.appendChild(append); 398 } 399 400 @Override 401 public void tail(org.jsoup.nodes.Node source, int depth) { 402 if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) { 403 dest = dest.getParentNode(); // undescend 404 } 405 } 406 407 private void copyAttributes(org.jsoup.nodes.Element jEl, Element wEl) { 408 for (Attribute attribute : jEl.attributes()) { 409 try { 410 setAttribute(jEl, wEl, attribute, syntax); 411 } catch (DOMException e) { 412 if (syntax != Syntax.xml) 413 setAttribute(jEl, wEl, attribute, Syntax.xml); 414 } 415 } 416 } 417 418 private void setAttribute(org.jsoup.nodes.Element jEl, Element wEl, Attribute attribute, Syntax syntax) throws DOMException { 419 String key = Attribute.getValidKey(attribute.getKey(), syntax); 420 if (key != null) { 421 String namespace = attribute.namespace(); 422 if (namespaceAware && !namespace.isEmpty()) 423 wEl.setAttributeNS(namespace, key, attribute.getValue()); 424 else 425 wEl.setAttribute(key, attribute.getValue()); 426 maybeAddUndeclaredNs(namespace, key, jEl, wEl); 427 } 428 } 429 430 /** 431 Add a namespace declaration for an attribute with a prefix if it is not already present. Ensures that attributes 432 with prefixes have the corresponding namespace declared, E.g. attribute "v-bind:foo" gets another attribute 433 "xmlns:v-bind='undefined'. So that the asString() transformation pass is valid. 434 If the parser was HTML we don't have a discovered namespace but we are trying to coerce it, so walk up the 435 element stack and find it. 436 */ 437 private void maybeAddUndeclaredNs(String namespace, String attrKey, org.jsoup.nodes.Element jEl, Element wEl) { 438 if (!namespaceAware || !namespace.isEmpty()) return; 439 int pos = attrKey.indexOf(':'); 440 if (pos != -1) { // prefixed but no namespace defined during parse, add a fake so that w3c serialization doesn't blow up 441 String prefix = attrKey.substring(0, pos); 442 if (prefix.equals("xmlns")) return; 443 org.jsoup.nodes.Document doc = jEl.ownerDocument(); 444 if (doc != null && doc.parser().getTreeBuilder() instanceof HtmlTreeBuilder) { 445 // try walking up the stack and seeing if there is a namespace declared for this prefix (and that we didn't parse because HTML) 446 for (org.jsoup.nodes.Element el = jEl; el != null; el = el.parent()) { 447 String ns = el.attr("xmlns:" + prefix); 448 if (!ns.isEmpty()) { 449 namespace = ns; 450 // found it, set it 451 wEl.setAttributeNS(namespace, attrKey, jEl.attr(attrKey)); 452 return; 453 } 454 } 455 } 456 457 // otherwise, put in a fake one 458 wEl.setAttribute("xmlns:" + prefix, undefinedNs); 459 } 460 } 461 private static final String undefinedNs = "undefined"; 462 } 463 464}