001package org.jsoup.helper; 002 003import org.jsoup.internal.StringUtil; 004import org.jsoup.nodes.Attribute; 005import org.jsoup.nodes.Attributes; 006import org.jsoup.parser.HtmlTreeBuilder; 007import org.jsoup.parser.Parser; 008import org.jsoup.select.NodeTraversor; 009import org.jsoup.select.NodeVisitor; 010import org.jsoup.select.Selector; 011import org.w3c.dom.Comment; 012import org.w3c.dom.DOMException; 013import org.w3c.dom.DOMImplementation; 014import org.w3c.dom.Document; 015import org.w3c.dom.DocumentType; 016import org.w3c.dom.Element; 017import org.w3c.dom.Node; 018import org.w3c.dom.NodeList; 019import org.w3c.dom.Text; 020import org.jspecify.annotations.Nullable; 021 022import javax.xml.parsers.DocumentBuilder; 023import javax.xml.parsers.DocumentBuilderFactory; 024import javax.xml.parsers.ParserConfigurationException; 025import javax.xml.transform.OutputKeys; 026import javax.xml.transform.Transformer; 027import javax.xml.transform.TransformerException; 028import javax.xml.transform.TransformerFactory; 029import javax.xml.transform.dom.DOMSource; 030import javax.xml.transform.stream.StreamResult; 031import javax.xml.xpath.XPathConstants; 032import javax.xml.xpath.XPathExpression; 033import javax.xml.xpath.XPathExpressionException; 034import javax.xml.xpath.XPathFactory; 035import javax.xml.xpath.XPathFactoryConfigurationException; 036import java.io.StringWriter; 037import java.util.ArrayList; 038import java.util.HashMap; 039import java.util.List; 040import java.util.Map; 041import java.util.Properties; 042import java.util.Stack; 043 044import static javax.xml.transform.OutputKeys.METHOD; 045import static org.jsoup.nodes.Document.OutputSettings.Syntax; 046 047/** 048 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document}, 049 * for integration with toolsets that use the W3C DOM. 050 */ 051public class W3CDom { 052 /** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */ 053 public static final String SourceProperty = "jsoupSource"; 054 private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc 055 private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context 056 057 /** 058 To get support for XPath versions > 1, set this property to the classname of an alternate XPathFactory 059 implementation. (For e.g. {@code net.sf.saxon.xpath.XPathFactoryImpl}). 060 */ 061 public static final String XPathFactoryProperty = "javax.xml.xpath.XPathFactory:jsoup"; 062 063 protected DocumentBuilderFactory factory; 064 private boolean namespaceAware = true; // false when using selectXpath, for user's query convenience 065 066 public W3CDom() { 067 factory = DocumentBuilderFactory.newInstance(); 068 factory.setNamespaceAware(true); 069 } 070 071 /** 072 Returns if this W3C DOM is namespace aware. By default, this will be {@code true}, but is disabled for simplicity 073 when using XPath selectors in {@link org.jsoup.nodes.Element#selectXpath(String)}. 074 @return the current namespace aware setting. 075 */ 076 public boolean namespaceAware() { 077 return namespaceAware; 078 } 079 080 /** 081 Update the namespace aware setting. This impacts the factory that is used to create W3C nodes from jsoup nodes. 082 <p>For HTML documents, controls if the document will be in the default {@code http://www.w3.org/1999/xhtml} 083 namespace if otherwise unset.</p>. 084 @param namespaceAware the updated setting 085 @return this W3CDom, for chaining. 086 */ 087 public W3CDom namespaceAware(boolean namespaceAware) { 088 this.namespaceAware = namespaceAware; 089 factory.setNamespaceAware(namespaceAware); 090 return this; 091 } 092 093 /** 094 * Converts a jsoup DOM to a W3C DOM. 095 * 096 * @param in jsoup Document 097 * @return W3C Document 098 */ 099 public static Document convert(org.jsoup.nodes.Document in) { 100 return (new W3CDom().fromJsoup(in)); 101 } 102 103 /** 104 * Serialize a W3C document to a String. Provide Properties to define output settings including if HTML or XML. If 105 * you don't provide the properties ({@code null}), the output will be auto-detected based on the content of the 106 * document. 107 * 108 * @param doc Document 109 * @param properties (optional/nullable) the output properties to use. See {@link 110 * Transformer#setOutputProperties(Properties)} and {@link OutputKeys} 111 * @return Document as string 112 * @see #OutputHtml 113 * @see #OutputXml 114 * @see OutputKeys#ENCODING 115 * @see OutputKeys#OMIT_XML_DECLARATION 116 * @see OutputKeys#STANDALONE 117 * @see OutputKeys#STANDALONE 118 * @see OutputKeys#DOCTYPE_PUBLIC 119 * @see OutputKeys#CDATA_SECTION_ELEMENTS 120 * @see OutputKeys#INDENT 121 * @see OutputKeys#MEDIA_TYPE 122 */ 123 public static String asString(Document doc, @Nullable Map<String, String> properties) { 124 try { 125 DOMSource domSource = new DOMSource(doc); 126 StringWriter writer = new StringWriter(); 127 StreamResult result = new StreamResult(writer); 128 TransformerFactory tf = TransformerFactory.newInstance(); 129 Transformer transformer = tf.newTransformer(); 130 if (properties != null) 131 transformer.setOutputProperties(propertiesFromMap(properties)); 132 133 if (doc.getDoctype() != null) { 134 DocumentType doctype = doc.getDoctype(); 135 if (!StringUtil.isBlank(doctype.getPublicId())) 136 transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, doctype.getPublicId()); 137 if (!StringUtil.isBlank(doctype.getSystemId())) 138 transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, doctype.getSystemId()); 139 // handle <!doctype html> for legacy dom. TODO: nicer if <!doctype html> 140 else if (doctype.getName().equalsIgnoreCase("html") 141 && StringUtil.isBlank(doctype.getPublicId()) 142 && StringUtil.isBlank(doctype.getSystemId())) 143 transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "about:legacy-compat"); 144 } 145 146 transformer.transform(domSource, result); 147 return writer.toString(); 148 149 } catch (TransformerException e) { 150 throw new IllegalStateException(e); 151 } 152 } 153 154 static Properties propertiesFromMap(Map<String, String> map) { 155 Properties props = new Properties(); 156 props.putAll(map); 157 return props; 158 } 159 160 /** Canned default for HTML output. */ 161 public static HashMap<String, String> OutputHtml() { 162 return methodMap("html"); 163 } 164 165 /** Canned default for XML output. */ 166 public static HashMap<String, String> OutputXml() { 167 return methodMap("xml"); 168 } 169 170 private static HashMap<String, String> methodMap(String method) { 171 HashMap<String, String> map = new HashMap<>(); 172 map.put(METHOD, method); 173 return map; 174 } 175 176 /** 177 * Convert a jsoup Document to a W3C Document. The created nodes will link back to the original 178 * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not 179 * flow to the other). 180 * 181 * @param in jsoup doc 182 * @return a W3C DOM Document representing the jsoup Document or Element contents. 183 */ 184 public Document fromJsoup(org.jsoup.nodes.Document in) { 185 // just method API backcompat 186 return fromJsoup((org.jsoup.nodes.Element) in); 187 } 188 189 /** 190 * Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original 191 * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not 192 * flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is 193 * converted. (If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.) 194 * 195 * @param in jsoup element or doc 196 * @return a W3C DOM Document representing the jsoup Document or Element contents. 197 * @see #sourceNodes(NodeList, Class) 198 * @see #contextNode(Document) 199 */ 200 public Document fromJsoup(org.jsoup.nodes.Element in) { 201 Validate.notNull(in); 202 DocumentBuilder builder; 203 try { 204 builder = factory.newDocumentBuilder(); 205 DOMImplementation impl = builder.getDOMImplementation(); 206 Document out = builder.newDocument(); 207 org.jsoup.nodes.Document inDoc = in.ownerDocument(); 208 org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null; 209 if (doctype != null) { 210 try { 211 org.w3c.dom.DocumentType documentType = impl.createDocumentType(doctype.name(), doctype.publicId(), doctype.systemId()); 212 out.appendChild(documentType); 213 } catch (DOMException ignored) { 214 // invalid / empty doctype dropped 215 } 216 } 217 out.setXmlStandalone(true); 218 // if in is Document, use the root element, not the wrapping document, as the context: 219 org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.firstElementChild() : in; 220 out.setUserData(ContextProperty, context, null); 221 convert(inDoc != null ? inDoc : in, out); 222 return out; 223 } catch (ParserConfigurationException e) { 224 throw new IllegalStateException(e); 225 } 226 } 227 228 /** 229 * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output 230 * document before converting. 231 * 232 * @param in jsoup doc 233 * @param out w3c doc 234 * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element) 235 */ 236 public void convert(org.jsoup.nodes.Document in, Document out) { 237 // just provides method API backcompat 238 convert((org.jsoup.nodes.Element) in, out); 239 } 240 241 /** 242 * Converts a jsoup element into the provided W3C Document. If required, you can set options on the output 243 * document before converting. 244 * 245 * @param in jsoup element 246 * @param out w3c doc 247 * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element) 248 */ 249 public void convert(org.jsoup.nodes.Element in, Document out) { 250 W3CBuilder builder = new W3CBuilder(out); 251 builder.namespaceAware = namespaceAware; 252 org.jsoup.nodes.Document inDoc = in.ownerDocument(); 253 if (inDoc != null) { 254 if (!StringUtil.isBlank(inDoc.location())) { 255 out.setDocumentURI(inDoc.location()); 256 } 257 builder.syntax = inDoc.outputSettings().syntax(); 258 } 259 org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.firstElementChild() : in; // skip the #root node if a Document 260 NodeTraversor.traverse(builder, rootEl); 261 } 262 263 /** 264 Evaluate an XPath query against the supplied document, and return the results. 265 @param xpath an XPath query 266 @param doc the document to evaluate against 267 @return the matches nodes 268 */ 269 public NodeList selectXpath(String xpath, Document doc) { 270 return selectXpath(xpath, (Node) doc); 271 } 272 273 /** 274 Evaluate an XPath query against the supplied context node, and return the results. 275 @param xpath an XPath query 276 @param contextNode the context node to evaluate against 277 @return the matches nodes 278 */ 279 public NodeList selectXpath(String xpath, Node contextNode) { 280 Validate.notEmptyParam(xpath, "xpath"); 281 Validate.notNullParam(contextNode, "contextNode"); 282 283 NodeList nodeList; 284 try { 285 // if there is a configured XPath factory, use that instead of the Java base impl: 286 String property = System.getProperty(XPathFactoryProperty); 287 final XPathFactory xPathFactory = property != null ? 288 XPathFactory.newInstance("jsoup") : 289 XPathFactory.newInstance(); 290 291 XPathExpression expression = xPathFactory.newXPath().compile(xpath); 292 nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s 293 Validate.notNull(nodeList); 294 } catch (XPathExpressionException | XPathFactoryConfigurationException e) { 295 throw new Selector.SelectorParseException( 296 e, "Could not evaluate XPath query [%s]: %s", xpath, e.getMessage()); 297 } 298 return nodeList; 299 } 300 301 /** 302 Retrieves the original jsoup DOM nodes from a nodelist created by this convertor. 303 @param nodeList the W3C nodes to get the original jsoup nodes from 304 @param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc) 305 @param <T> node type 306 @return a list of the original nodes 307 */ 308 public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, Class<T> nodeType) { 309 Validate.notNull(nodeList); 310 Validate.notNull(nodeType); 311 List<T> nodes = new ArrayList<>(nodeList.getLength()); 312 313 for (int i = 0; i < nodeList.getLength(); i++) { 314 org.w3c.dom.Node node = nodeList.item(i); 315 Object source = node.getUserData(W3CDom.SourceProperty); 316 if (nodeType.isInstance(source)) 317 nodes.add(nodeType.cast(source)); 318 } 319 320 return nodes; 321 } 322 323 /** 324 For a Document created by {@link #fromJsoup(org.jsoup.nodes.Element)}, retrieves the W3C context node. 325 @param wDoc Document created by this class 326 @return the corresponding W3C Node to the jsoup Element that was used as the creating context. 327 */ 328 public Node contextNode(Document wDoc) { 329 return (Node) wDoc.getUserData(ContextNodeProperty); 330 } 331 332 /** 333 * Serialize a W3C document to a String. The output format will be XML or HTML depending on the content of the doc. 334 * 335 * @param doc Document 336 * @return Document as string 337 * @see W3CDom#asString(Document, Map) 338 */ 339 public String asString(Document doc) { 340 return asString(doc, null); 341 } 342 343 /** 344 * Implements the conversion by walking the input. 345 */ 346 protected static class W3CBuilder implements NodeVisitor { 347 // TODO: move the namespace handling stuff into XmlTreeBuilder / HtmlTreeBuilder, now that Tags have namespaces 348 private static final String xmlnsKey = "xmlns"; 349 private static final String xmlnsPrefix = "xmlns:"; 350 351 private final Document doc; 352 private boolean namespaceAware = true; 353 private final Stack<HashMap<String, String>> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn 354 private Node dest; 355 private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available. 356 /*@Nullable*/ private final org.jsoup.nodes.Element contextElement; // todo - unsure why this can't be marked nullable? 357 358 public W3CBuilder(Document doc) { 359 this.doc = doc; 360 namespacesStack.push(new HashMap<>()); 361 dest = doc; 362 contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty); // Track the context jsoup Element, so we can save the corresponding w3c element 363 if (contextElement != null) { 364 final org.jsoup.nodes.Document inDoc = contextElement.ownerDocument(); 365 if ( namespaceAware && inDoc != null && inDoc.parser().getTreeBuilder() instanceof HtmlTreeBuilder ) { 366 // as per the WHATWG HTML5 spec ยง 2.1.3, elements are in the HTML namespace by default 367 namespacesStack.peek().put("", Parser.NamespaceHtml); 368 } 369 } 370 } 371 372 public void head(org.jsoup.nodes.Node source, int depth) { 373 namespacesStack.push(new HashMap<>(namespacesStack.peek())); // inherit from above on the stack 374 if (source instanceof org.jsoup.nodes.Element) { 375 org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source; 376 377 String prefix = updateNamespaces(sourceEl); 378 String namespace = namespaceAware ? namespacesStack.peek().get(prefix) : null; 379 String tagName = sourceEl.tagName(); 380 381 /* Tag names in XML are quite permissive, but less permissive than HTML. Rather than reimplement the validation, 382 we just try to use it as-is. If it fails, insert as a text node instead. We don't try to normalize the 383 tagname to something safe, because that isn't going to be meaningful downstream. This seems(?) to be 384 how browsers handle the situation, also. https://github.com/jhy/jsoup/issues/1093 */ 385 try { 386 // use an empty namespace if none is present but the tag name has a prefix 387 String imputedNamespace = namespace == null && tagName.contains(":") ? "" : namespace; 388 Element el = doc.createElementNS(imputedNamespace, tagName); 389 copyAttributes(sourceEl, el); 390 append(el, sourceEl); 391 if (sourceEl == contextElement) 392 doc.setUserData(ContextNodeProperty, el, null); 393 dest = el; // descend 394 } catch (DOMException e) { 395 append(doc.createTextNode("<" + tagName + ">"), sourceEl); 396 } 397 } else if (source instanceof org.jsoup.nodes.TextNode) { 398 org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source; 399 Text text = doc.createTextNode(sourceText.getWholeText()); 400 append(text, sourceText); 401 } else if (source instanceof org.jsoup.nodes.Comment) { 402 org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source; 403 Comment comment = doc.createComment(sourceComment.getData()); 404 append(comment, sourceComment); 405 } else if (source instanceof org.jsoup.nodes.DataNode) { 406 org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source; 407 Text node = doc.createTextNode(sourceData.getWholeData()); 408 append(node, sourceData); 409 } else { 410 // unhandled. note that doctype is not handled here - rather it is used in the initial doc creation 411 } 412 } 413 414 private void append(Node append, org.jsoup.nodes.Node source) { 415 append.setUserData(SourceProperty, source, null); 416 dest.appendChild(append); 417 } 418 419 public void tail(org.jsoup.nodes.Node source, int depth) { 420 if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) { 421 dest = dest.getParentNode(); // undescend 422 } 423 namespacesStack.pop(); 424 } 425 426 private void copyAttributes(org.jsoup.nodes.Node source, Element el) { 427 for (Attribute attribute : source.attributes()) { 428 String key = Attribute.getValidKey(attribute.getKey(), syntax); 429 if (key != null) { // null if couldn't be coerced to validity 430 el.setAttribute(key, attribute.getValue()); 431 } 432 } 433 } 434 435 /** 436 * Finds any namespaces defined in this element. Returns any tag prefix. 437 */ 438 private String updateNamespaces(org.jsoup.nodes.Element el) { 439 // scan the element for namespace declarations 440 // like: xmlns="blah" or xmlns:prefix="blah" 441 Attributes attributes = el.attributes(); 442 for (Attribute attr : attributes) { 443 String key = attr.getKey(); 444 String prefix; 445 if (key.equals(xmlnsKey)) { 446 prefix = ""; 447 } else if (key.startsWith(xmlnsPrefix)) { 448 prefix = key.substring(xmlnsPrefix.length()); 449 } else { 450 continue; 451 } 452 namespacesStack.peek().put(prefix, attr.getValue()); 453 } 454 455 // get the element prefix if any 456 int pos = el.tagName().indexOf(':'); 457 return pos > 0 ? el.tagName().substring(0, pos) : ""; 458 } 459 460 } 461}