001package org.jsoup.helper;
002
003import org.jsoup.internal.Normalizer;
004import org.jsoup.internal.StringUtil;
005import org.jsoup.nodes.Attribute;
006import org.jsoup.nodes.Attributes;
007import org.jsoup.parser.HtmlTreeBuilder;
008import org.jsoup.parser.Parser;
009import org.jsoup.select.NodeTraversor;
010import org.jsoup.select.NodeVisitor;
011import org.jsoup.select.Selector;
012import org.w3c.dom.Comment;
013import org.w3c.dom.DOMException;
014import org.w3c.dom.DOMImplementation;
015import org.w3c.dom.Document;
016import org.w3c.dom.DocumentType;
017import org.w3c.dom.Element;
018import org.w3c.dom.Node;
019import org.w3c.dom.NodeList;
020import org.w3c.dom.Text;
021import org.jspecify.annotations.Nullable;
022
023import javax.xml.parsers.DocumentBuilder;
024import javax.xml.parsers.DocumentBuilderFactory;
025import javax.xml.parsers.ParserConfigurationException;
026import javax.xml.transform.OutputKeys;
027import javax.xml.transform.Transformer;
028import javax.xml.transform.TransformerException;
029import javax.xml.transform.TransformerFactory;
030import javax.xml.transform.dom.DOMSource;
031import javax.xml.transform.stream.StreamResult;
032import javax.xml.xpath.XPathConstants;
033import javax.xml.xpath.XPathExpression;
034import javax.xml.xpath.XPathExpressionException;
035import javax.xml.xpath.XPathFactory;
036import javax.xml.xpath.XPathFactoryConfigurationException;
037import java.io.StringWriter;
038import java.util.ArrayDeque;
039import java.util.ArrayList;
040import java.util.HashMap;
041import java.util.List;
042import java.util.Map;
043import java.util.Properties;
044
045import static javax.xml.transform.OutputKeys.METHOD;
046import static org.jsoup.nodes.Document.OutputSettings.Syntax;
047
048/**
049 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
050 * for integration with toolsets that use the W3C DOM.
051 */
052public class W3CDom {
053    /** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */
054    public static final String SourceProperty = "jsoupSource";
055    private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc
056    private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context
057
058    /**
059     To get support for XPath versions > 1, set this property to the classname of an alternate XPathFactory
060     implementation. (For e.g. {@code net.sf.saxon.xpath.XPathFactoryImpl}).
061     */
062    public static final String XPathFactoryProperty = "javax.xml.xpath.XPathFactory:jsoup";
063
064    protected DocumentBuilderFactory factory;
065    private boolean namespaceAware = true; // false when using selectXpath, for user's query convenience
066
067    public W3CDom() {
068        factory = DocumentBuilderFactory.newInstance();
069        factory.setNamespaceAware(true);
070    }
071
072    /**
073     Returns if this W3C DOM is namespace aware. By default, this will be {@code true}, but is disabled for simplicity
074     when using XPath selectors in {@link org.jsoup.nodes.Element#selectXpath(String)}.
075     @return the current namespace aware setting.
076     */
077    public boolean namespaceAware() {
078        return namespaceAware;
079    }
080
081    /**
082     Update the namespace aware setting. This impacts the factory that is used to create W3C nodes from jsoup nodes.
083     <p>For HTML documents, controls if the document will be in the default {@code http://www.w3.org/1999/xhtml}
084     namespace if otherwise unset.</p>.
085     @param namespaceAware the updated setting
086     @return this W3CDom, for chaining.
087     */
088    public W3CDom namespaceAware(boolean namespaceAware) {
089        this.namespaceAware = namespaceAware;
090        factory.setNamespaceAware(namespaceAware);
091        return this;
092    }
093
094    /**
095     * Converts a jsoup DOM to a W3C DOM.
096     *
097     * @param in jsoup Document
098     * @return W3C Document
099     */
100    public static Document convert(org.jsoup.nodes.Document in) {
101        return (new W3CDom().fromJsoup(in));
102    }
103
104    /**
105     * Serialize a W3C document to a String. Provide Properties to define output settings including if HTML or XML. If
106     * you don't provide the properties ({@code null}), the output will be auto-detected based on the content of the
107     * document.
108     *
109     * @param doc Document
110     * @param properties (optional/nullable) the output properties to use. See {@link
111     *     Transformer#setOutputProperties(Properties)} and {@link OutputKeys}
112     * @return Document as string
113     * @see #OutputHtml
114     * @see #OutputXml
115     * @see OutputKeys#ENCODING
116     * @see OutputKeys#OMIT_XML_DECLARATION
117     * @see OutputKeys#STANDALONE
118     * @see OutputKeys#STANDALONE
119     * @see OutputKeys#DOCTYPE_PUBLIC
120     * @see OutputKeys#CDATA_SECTION_ELEMENTS
121     * @see OutputKeys#INDENT
122     * @see OutputKeys#MEDIA_TYPE
123     */
124    public static String asString(Document doc, @Nullable Map<String, String> properties) {
125        try {
126            DOMSource domSource = new DOMSource(doc);
127            StringWriter writer = new StringWriter();
128            StreamResult result = new StreamResult(writer);
129            TransformerFactory tf = TransformerFactory.newInstance();
130            Transformer transformer = tf.newTransformer();
131            if (properties != null)
132                transformer.setOutputProperties(propertiesFromMap(properties));
133
134            if (doc.getDoctype() != null) {
135                DocumentType doctype = doc.getDoctype();
136                if (!StringUtil.isBlank(doctype.getPublicId()))
137                    transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, doctype.getPublicId());
138                if (!StringUtil.isBlank(doctype.getSystemId()))
139                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, doctype.getSystemId());
140                    // handle <!doctype html> for legacy dom. TODO: nicer if <!doctype html>
141                else if (doctype.getName().equalsIgnoreCase("html")
142                    && StringUtil.isBlank(doctype.getPublicId())
143                    && StringUtil.isBlank(doctype.getSystemId()))
144                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "about:legacy-compat");
145            }
146
147            transformer.transform(domSource, result);
148            return writer.toString();
149
150        } catch (TransformerException e) {
151            throw new IllegalStateException(e);
152        }
153    }
154
155    static Properties propertiesFromMap(Map<String, String> map) {
156        Properties props = new Properties();
157        props.putAll(map);
158        return props;
159    }
160
161    /** Canned default for HTML output. */
162    public static HashMap<String, String> OutputHtml() {
163        return methodMap("html");
164    }
165
166    /** Canned default for XML output. */
167    public static HashMap<String, String> OutputXml() {
168        return methodMap("xml");
169    }
170
171    private static HashMap<String, String> methodMap(String method) {
172        HashMap<String, String> map = new HashMap<>();
173        map.put(METHOD, method);
174        return map;
175    }
176
177    /**
178     * Convert a jsoup Document to a W3C Document. The created nodes will link back to the original
179     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
180     * flow to the other).
181     *
182     * @param in jsoup doc
183     * @return a W3C DOM Document representing the jsoup Document or Element contents.
184     */
185    public Document fromJsoup(org.jsoup.nodes.Document in) {
186        // just method API backcompat
187        return fromJsoup((org.jsoup.nodes.Element) in);
188    }
189
190    /**
191     * Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original
192     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
193     * flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is
194     * converted. (If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.)
195     *
196     * @param in jsoup element or doc
197     * @return a W3C DOM Document representing the jsoup Document or Element contents.
198     * @see #sourceNodes(NodeList, Class)
199     * @see #contextNode(Document)
200     */
201    public Document fromJsoup(org.jsoup.nodes.Element in) {
202        Validate.notNull(in);
203        DocumentBuilder builder;
204        try {
205            builder = factory.newDocumentBuilder();
206            DOMImplementation impl = builder.getDOMImplementation();
207            Document out = builder.newDocument();
208            org.jsoup.nodes.Document inDoc = in.ownerDocument();
209            org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null;
210            if (doctype != null) {
211                try {
212                    org.w3c.dom.DocumentType documentType = impl.createDocumentType(doctype.name(), doctype.publicId(), doctype.systemId());
213                    out.appendChild(documentType);
214                } catch (DOMException ignored) {
215                    // invalid / empty doctype dropped
216                }
217            }
218            out.setXmlStandalone(true);
219            // if in is Document, use the root element, not the wrapping document, as the context:
220            org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.firstElementChild() : in;
221            out.setUserData(ContextProperty, context, null);
222            convert(inDoc != null ? inDoc : in, out);
223            return out;
224        } catch (ParserConfigurationException e) {
225            throw new IllegalStateException(e);
226        }
227    }
228
229    /**
230     * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output
231     * document before converting.
232     *
233     * @param in jsoup doc
234     * @param out w3c doc
235     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
236     */
237    public void convert(org.jsoup.nodes.Document in, Document out) {
238        // just provides method API backcompat
239        convert((org.jsoup.nodes.Element) in, out);
240    }
241
242    /**
243     * Converts a jsoup element into the provided W3C Document. If required, you can set options on the output
244     * document before converting.
245     *
246     * @param in jsoup element
247     * @param out w3c doc
248     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
249     */
250    public void convert(org.jsoup.nodes.Element in, Document out) {
251        W3CBuilder builder = new W3CBuilder(out);
252        builder.namespaceAware = namespaceAware;
253        org.jsoup.nodes.Document inDoc = in.ownerDocument();
254        if (inDoc != null) {
255            if (!StringUtil.isBlank(inDoc.location())) {
256                out.setDocumentURI(inDoc.location());
257            }
258            builder.syntax = inDoc.outputSettings().syntax();
259        }
260        org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.firstElementChild() : in; // skip the #root node if a Document
261        NodeTraversor.traverse(builder, rootEl);
262    }
263
264    /**
265     Evaluate an XPath query against the supplied document, and return the results.
266     @param xpath an XPath query
267     @param doc the document to evaluate against
268     @return the matches nodes
269     */
270    public NodeList selectXpath(String xpath, Document doc) {
271        return selectXpath(xpath, (Node) doc);
272    }
273
274    /**
275     Evaluate an XPath query against the supplied context node, and return the results.
276     @param xpath an XPath query
277     @param contextNode the context node to evaluate against
278     @return the matches nodes
279     */
280    public NodeList selectXpath(String xpath, Node contextNode) {
281        Validate.notEmptyParam(xpath, "xpath");
282        Validate.notNullParam(contextNode, "contextNode");
283
284        NodeList nodeList;
285        try {
286            // if there is a configured XPath factory, use that instead of the Java base impl:
287            String property = System.getProperty(XPathFactoryProperty);
288            final XPathFactory xPathFactory = property != null ?
289                XPathFactory.newInstance("jsoup") :
290                XPathFactory.newInstance();
291
292            XPathExpression expression = xPathFactory.newXPath().compile(xpath);
293            nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s
294            Validate.notNull(nodeList);
295        } catch (XPathExpressionException | XPathFactoryConfigurationException e) {
296            throw new Selector.SelectorParseException(
297                e, "Could not evaluate XPath query [%s]: %s", xpath, e.getMessage());
298        }
299        return nodeList;
300    }
301
302    /**
303     Retrieves the original jsoup DOM nodes from a nodelist created by this convertor.
304     @param nodeList the W3C nodes to get the original jsoup nodes from
305     @param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc)
306     @param <T> node type
307     @return a list of the original nodes
308     */
309    public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, Class<T> nodeType) {
310        Validate.notNull(nodeList);
311        Validate.notNull(nodeType);
312        List<T> nodes = new ArrayList<>(nodeList.getLength());
313
314        for (int i = 0; i < nodeList.getLength(); i++) {
315            org.w3c.dom.Node node = nodeList.item(i);
316            Object source = node.getUserData(W3CDom.SourceProperty);
317            if (nodeType.isInstance(source))
318                nodes.add(nodeType.cast(source));
319        }
320
321        return nodes;
322    }
323
324    /**
325     For a Document created by {@link #fromJsoup(org.jsoup.nodes.Element)}, retrieves the W3C context node.
326     @param wDoc Document created by this class
327     @return the corresponding W3C Node to the jsoup Element that was used as the creating context.
328     */
329    public Node contextNode(Document wDoc) {
330        return (Node) wDoc.getUserData(ContextNodeProperty);
331    }
332
333    /**
334     * Serialize a W3C document to a String. The output format will be XML or HTML depending on the content of the doc.
335     *
336     * @param doc Document
337     * @return Document as string
338     * @see W3CDom#asString(Document, Map)
339     */
340    public String asString(Document doc) {
341        return asString(doc, null);
342    }
343
344    /**
345     * Implements the conversion by walking the input.
346     */
347    protected static class W3CBuilder implements NodeVisitor {
348        // TODO: move the namespace handling stuff into XmlTreeBuilder / HtmlTreeBuilder, now that Tags have namespaces
349        private static final String xmlnsKey = "xmlns";
350        private static final String xmlnsPrefix = "xmlns:";
351
352        private final Document doc;
353        private boolean namespaceAware = true;
354        private final ArrayDeque<HashMap<String, String>> namespacesStack = new ArrayDeque<>(); // stack of namespaces, prefix => urn
355        private Node dest;
356        private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available.
357        /*@Nullable*/ private final org.jsoup.nodes.Element contextElement; // todo - unsure why this can't be marked nullable?
358
359        public W3CBuilder(Document doc) {
360            this.doc = doc;
361            namespacesStack.push(new HashMap<>());
362            dest = doc;
363            contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty); // Track the context jsoup Element, so we can save the corresponding w3c element
364            if (contextElement != null) {
365                final org.jsoup.nodes.Document inDoc = contextElement.ownerDocument();
366                if ( namespaceAware && inDoc != null && inDoc.parser().getTreeBuilder() instanceof HtmlTreeBuilder ) {
367                    // as per the WHATWG HTML5 spec ยง 2.1.3, elements are in the HTML namespace by default
368                    namespacesStack.peek().put("", Parser.NamespaceHtml);
369                }
370            }
371        }
372
373        @Override
374        public void head(org.jsoup.nodes.Node source, int depth) {
375            namespacesStack.push(new HashMap<>(namespacesStack.peek())); // inherit from above on the stack
376            if (source instanceof org.jsoup.nodes.Element) {
377                org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
378
379                String prefix = updateNamespaces(sourceEl);
380                String namespace = namespaceAware ? namespacesStack.peek().get(prefix) : null;
381                String tagName = Normalizer.xmlSafeTagName(sourceEl.tagName());
382                try {
383                    // use an empty namespace if none is present but the tag name has a prefix
384                    String imputedNamespace = namespace == null && tagName.contains(":") ? "" : namespace;
385                    Element el = doc.createElementNS(imputedNamespace, tagName);
386                    copyAttributes(sourceEl, el);
387                    append(el, sourceEl);
388                    if (sourceEl == contextElement)
389                        doc.setUserData(ContextNodeProperty, el, null);
390                    dest = el; // descend
391                } catch (DOMException e) {
392                    // If the Normalize didn't get it XML / W3C safe, inserts as plain text
393                    append(doc.createTextNode("<" + tagName + ">"), sourceEl);
394                }
395            } else if (source instanceof org.jsoup.nodes.TextNode) {
396                org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
397                Text text = doc.createTextNode(sourceText.getWholeText());
398                append(text, sourceText);
399            } else if (source instanceof org.jsoup.nodes.Comment) {
400                org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
401                Comment comment = doc.createComment(sourceComment.getData());
402                append(comment, sourceComment);
403            } else if (source instanceof org.jsoup.nodes.DataNode) {
404                org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
405                Text node = doc.createTextNode(sourceData.getWholeData());
406                append(node, sourceData);
407            } else {
408                // unhandled. note that doctype is not handled here - rather it is used in the initial doc creation
409            }
410        }
411
412        private void append(Node append, org.jsoup.nodes.Node source) {
413            append.setUserData(SourceProperty, source, null);
414            dest.appendChild(append);
415        }
416
417        @Override
418        public void tail(org.jsoup.nodes.Node source, int depth) {
419            if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) {
420                dest = dest.getParentNode(); // undescend
421            }
422            namespacesStack.pop();
423        }
424
425        private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
426            for (Attribute attribute : source.attributes()) {
427                // the W3C DOM has a different allowed set of characters than HTML5 (that Attribute.getValidKey return, partic does not allow ';'). So if we except when using HTML, go to more restricted XML
428                try {
429                    String key = Attribute.getValidKey(attribute.getKey(), syntax);
430                    if (key != null) // null if couldn't be coerced to validity
431                        el.setAttribute(key, attribute.getValue());
432                } catch (DOMException e) {
433                    if (syntax != Syntax.xml) {
434                        String key = Attribute.getValidKey(attribute.getKey(), Syntax.xml);
435                        if (key != null)
436                            el.setAttribute(key, attribute.getValue()); // otherwise, will skip attribute
437                    }
438                }
439            }
440        }
441
442        /**
443         * Finds any namespaces defined in this element. Returns any tag prefix.
444         */
445        private String updateNamespaces(org.jsoup.nodes.Element el) {
446            // scan the element for namespace declarations
447            // like: xmlns="blah" or xmlns:prefix="blah"
448            Attributes attributes = el.attributes();
449            for (Attribute attr : attributes) {
450                String key = attr.getKey();
451                String prefix;
452                if (key.equals(xmlnsKey)) {
453                    prefix = "";
454                } else if (key.startsWith(xmlnsPrefix)) {
455                    prefix = key.substring(xmlnsPrefix.length());
456                } else {
457                    continue;
458                }
459                namespacesStack.peek().put(prefix, attr.getValue());
460            }
461
462            // get the element prefix if any
463            int pos = el.tagName().indexOf(':');
464            return pos > 0 ? el.tagName().substring(0, pos) : "";
465        }
466
467    }
468}