001package org.jsoup.helper;
002
003import org.jsoup.internal.Normalizer;
004import org.jsoup.internal.StringUtil;
005import org.jsoup.nodes.Attribute;
006import org.jsoup.nodes.Attributes;
007import org.jsoup.parser.HtmlTreeBuilder;
008import org.jsoup.parser.XmlTreeBuilder;
009import org.jsoup.select.NodeTraversor;
010import org.jsoup.select.NodeVisitor;
011import org.jsoup.select.Selector;
012import org.w3c.dom.Comment;
013import org.w3c.dom.DOMException;
014import org.w3c.dom.DOMImplementation;
015import org.w3c.dom.Document;
016import org.w3c.dom.DocumentType;
017import org.w3c.dom.Element;
018import org.w3c.dom.Node;
019import org.w3c.dom.NodeList;
020import org.w3c.dom.Text;
021import org.jspecify.annotations.Nullable;
022
023import javax.xml.parsers.DocumentBuilder;
024import javax.xml.parsers.DocumentBuilderFactory;
025import javax.xml.parsers.ParserConfigurationException;
026import javax.xml.transform.OutputKeys;
027import javax.xml.transform.Transformer;
028import javax.xml.transform.TransformerException;
029import javax.xml.transform.TransformerFactory;
030import javax.xml.transform.dom.DOMSource;
031import javax.xml.transform.stream.StreamResult;
032import javax.xml.xpath.XPathConstants;
033import javax.xml.xpath.XPathExpression;
034import javax.xml.xpath.XPathExpressionException;
035import javax.xml.xpath.XPathFactory;
036import javax.xml.xpath.XPathFactoryConfigurationException;
037import java.io.StringWriter;
038import java.util.ArrayList;
039import java.util.HashMap;
040import java.util.List;
041import java.util.Map;
042import java.util.Properties;
043
044import static javax.xml.transform.OutputKeys.METHOD;
045import static org.jsoup.nodes.Document.OutputSettings.Syntax;
046
047/**
048 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
049 * for integration with toolsets that use the W3C DOM.
050 */
051public class W3CDom {
052    /** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */
053    public static final String SourceProperty = "jsoupSource";
054    private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc
055    private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context
056
057    /**
058     To get support for XPath versions > 1, set this property to the classname of an alternate XPathFactory
059     implementation. (For e.g. {@code net.sf.saxon.xpath.XPathFactoryImpl}).
060     */
061    public static final String XPathFactoryProperty = "javax.xml.xpath.XPathFactory:jsoup";
062
063    protected DocumentBuilderFactory factory;
064    private boolean namespaceAware = true; // false when using selectXpath, for user's query convenience
065
066    public W3CDom() {
067        factory = DocumentBuilderFactory.newInstance();
068        factory.setNamespaceAware(true);
069    }
070
071    /**
072     Returns if this W3C DOM is namespace aware. By default, this will be {@code true}, but is disabled for simplicity
073     when using XPath selectors in {@link org.jsoup.nodes.Element#selectXpath(String)}.
074     @return the current namespace aware setting.
075     */
076    public boolean namespaceAware() {
077        return namespaceAware;
078    }
079
080    /**
081     Update the namespace aware setting. This impacts the factory that is used to create W3C nodes from jsoup nodes.
082     <p>For HTML documents, controls if the document will be in the default {@code http://www.w3.org/1999/xhtml}
083     namespace if otherwise unset.</p>.
084     @param namespaceAware the updated setting
085     @return this W3CDom, for chaining.
086     */
087    public W3CDom namespaceAware(boolean namespaceAware) {
088        this.namespaceAware = namespaceAware;
089        factory.setNamespaceAware(namespaceAware);
090        return this;
091    }
092
093    /**
094     * Converts a jsoup DOM to a W3C DOM.
095     *
096     * @param in jsoup Document
097     * @return W3C Document
098     */
099    public static Document convert(org.jsoup.nodes.Document in) {
100        return (new W3CDom().fromJsoup(in));
101    }
102
103    /**
104     * Serialize a W3C document to a String. Provide Properties to define output settings including if HTML or XML. If
105     * you don't provide the properties ({@code null}), the output will be auto-detected based on the content of the
106     * document.
107     *
108     * @param doc Document
109     * @param properties (optional/nullable) the output properties to use. See {@link
110     *     Transformer#setOutputProperties(Properties)} and {@link OutputKeys}
111     * @return Document as string
112     * @see #OutputHtml
113     * @see #OutputXml
114     * @see OutputKeys#ENCODING
115     * @see OutputKeys#OMIT_XML_DECLARATION
116     * @see OutputKeys#STANDALONE
117     * @see OutputKeys#DOCTYPE_PUBLIC
118     * @see OutputKeys#CDATA_SECTION_ELEMENTS
119     * @see OutputKeys#INDENT
120     * @see OutputKeys#MEDIA_TYPE
121     */
122    public static String asString(Document doc, @Nullable Map<String, String> properties) {
123        try {
124            DOMSource domSource = new DOMSource(doc);
125            StringWriter writer = new StringWriter();
126            StreamResult result = new StreamResult(writer);
127            TransformerFactory tf = TransformerFactory.newInstance();
128            Transformer transformer = tf.newTransformer();
129            if (properties != null)
130                transformer.setOutputProperties(propertiesFromMap(properties));
131
132            if (doc.getDoctype() != null) {
133                DocumentType doctype = doc.getDoctype();
134                if (!StringUtil.isBlank(doctype.getPublicId()))
135                    transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, doctype.getPublicId());
136                if (!StringUtil.isBlank(doctype.getSystemId()))
137                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, doctype.getSystemId());
138                    // handle <!doctype html> for legacy dom. TODO: nicer if <!doctype html>
139                else if (doctype.getName().equalsIgnoreCase("html")
140                    && StringUtil.isBlank(doctype.getPublicId())
141                    && StringUtil.isBlank(doctype.getSystemId()))
142                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "about:legacy-compat");
143            }
144
145            transformer.transform(domSource, result);
146            return writer.toString();
147
148        } catch (TransformerException e) {
149            throw new IllegalStateException(e);
150        }
151    }
152
153    static Properties propertiesFromMap(Map<String, String> map) {
154        Properties props = new Properties();
155        props.putAll(map);
156        return props;
157    }
158
159    /** Canned default for HTML output. */
160    public static HashMap<String, String> OutputHtml() {
161        return methodMap("html");
162    }
163
164    /** Canned default for XML output. */
165    public static HashMap<String, String> OutputXml() {
166        return methodMap("xml");
167    }
168
169    private static HashMap<String, String> methodMap(String method) {
170        HashMap<String, String> map = new HashMap<>();
171        map.put(METHOD, method);
172        return map;
173    }
174
175    /**
176     * Convert a jsoup Document to a W3C Document. The created nodes will link back to the original
177     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
178     * flow to the other).
179     *
180     * @param in jsoup doc
181     * @return a W3C DOM Document representing the jsoup Document or Element contents.
182     */
183    public Document fromJsoup(org.jsoup.nodes.Document in) {
184        // just method API backcompat
185        return fromJsoup((org.jsoup.nodes.Element) in);
186    }
187
188    /**
189     * Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original
190     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
191     * flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is
192     * converted. (If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.)
193     *
194     * @param in jsoup element or doc
195     * @return a W3C DOM Document representing the jsoup Document or Element contents.
196     * @see #sourceNodes(NodeList, Class)
197     * @see #contextNode(Document)
198     */
199    public Document fromJsoup(org.jsoup.nodes.Element in) {
200        Validate.notNull(in);
201        DocumentBuilder builder;
202        try {
203            builder = factory.newDocumentBuilder();
204            DOMImplementation impl = builder.getDOMImplementation();
205            Document out = builder.newDocument();
206            org.jsoup.nodes.Document inDoc = in.ownerDocument();
207            org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null;
208            if (doctype != null) {
209                try {
210                    org.w3c.dom.DocumentType documentType = impl.createDocumentType(doctype.name(), doctype.publicId(), doctype.systemId());
211                    out.appendChild(documentType);
212                } catch (DOMException ignored) {
213                    // invalid / empty doctype dropped
214                }
215            }
216            out.setXmlStandalone(true);
217            // if in is Document, use the root element, not the wrapping document, as the context:
218            org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.firstElementChild() : in;
219            out.setUserData(ContextProperty, context, null);
220            convert(inDoc != null ? inDoc : in, out);
221            return out;
222        } catch (ParserConfigurationException e) {
223            throw new IllegalStateException(e);
224        }
225    }
226
227    /**
228     * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output
229     * document before converting.
230     *
231     * @param in jsoup doc
232     * @param out w3c doc
233     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
234     */
235    public void convert(org.jsoup.nodes.Document in, Document out) {
236        // just provides method API backcompat
237        convert((org.jsoup.nodes.Element) in, out);
238    }
239
240    /**
241     * Converts a jsoup element into the provided W3C Document. If required, you can set options on the output
242     * document before converting.
243     *
244     * @param in jsoup element
245     * @param out w3c doc
246     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
247     */
248    public void convert(org.jsoup.nodes.Element in, Document out) {
249        W3CBuilder builder = new W3CBuilder(out);
250        builder.namespaceAware = namespaceAware;
251        org.jsoup.nodes.Document inDoc = in.ownerDocument();
252        if (inDoc != null) {
253            if (!StringUtil.isBlank(inDoc.location())) {
254                out.setDocumentURI(inDoc.location());
255            }
256            builder.syntax = inDoc.outputSettings().syntax();
257        }
258        org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.firstElementChild() : in; // skip the #root node if a Document
259        NodeTraversor.traverse(builder, rootEl);
260    }
261
262    /**
263     Evaluate an XPath query against the supplied document, and return the results.
264     @param xpath an XPath query
265     @param doc the document to evaluate against
266     @return the matches nodes
267     */
268    public NodeList selectXpath(String xpath, Document doc) {
269        return selectXpath(xpath, (Node) doc);
270    }
271
272    /**
273     Evaluate an XPath query against the supplied context node, and return the results.
274     @param xpath an XPath query
275     @param contextNode the context node to evaluate against
276     @return the matches nodes
277     */
278    public NodeList selectXpath(String xpath, Node contextNode) {
279        Validate.notEmptyParam(xpath, "xpath");
280        Validate.notNullParam(contextNode, "contextNode");
281
282        NodeList nodeList;
283        try {
284            // if there is a configured XPath factory, use that instead of the Java base impl:
285            String property = System.getProperty(XPathFactoryProperty);
286            final XPathFactory xPathFactory = property != null ?
287                XPathFactory.newInstance("jsoup") :
288                XPathFactory.newInstance();
289
290            XPathExpression expression = xPathFactory.newXPath().compile(xpath);
291            nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s
292            Validate.notNull(nodeList);
293        } catch (XPathExpressionException | XPathFactoryConfigurationException e) {
294            throw new Selector.SelectorParseException(
295                e, "Could not evaluate XPath query [%s]: %s", xpath, e.getMessage());
296        }
297        return nodeList;
298    }
299
300    /**
301     Retrieves the original jsoup DOM nodes from a nodelist created by this convertor.
302     @param nodeList the W3C nodes to get the original jsoup nodes from
303     @param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc)
304     @param <T> node type
305     @return a list of the original nodes
306     */
307    public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, Class<T> nodeType) {
308        Validate.notNull(nodeList);
309        Validate.notNull(nodeType);
310        List<T> nodes = new ArrayList<>(nodeList.getLength());
311
312        for (int i = 0; i < nodeList.getLength(); i++) {
313            org.w3c.dom.Node node = nodeList.item(i);
314            Object source = node.getUserData(W3CDom.SourceProperty);
315            if (nodeType.isInstance(source))
316                nodes.add(nodeType.cast(source));
317        }
318
319        return nodes;
320    }
321
322    /**
323     For a Document created by {@link #fromJsoup(org.jsoup.nodes.Element)}, retrieves the W3C context node.
324     @param wDoc Document created by this class
325     @return the corresponding W3C Node to the jsoup Element that was used as the creating context.
326     */
327    public Node contextNode(Document wDoc) {
328        return (Node) wDoc.getUserData(ContextNodeProperty);
329    }
330
331    /**
332     * Serialize a W3C document that was created by {@link #fromJsoup(org.jsoup.nodes.Element)} to a String.
333     * The output format will be XML or HTML depending on the content of the doc.
334     *
335     * @param doc Document
336     * @return Document as string
337     * @see W3CDom#asString(Document, Map)
338     */
339    public String asString(Document doc) {
340        return asString(doc, null);
341    }
342
343    /**
344     * Implements the conversion by walking the input.
345     */
346    protected static class W3CBuilder implements NodeVisitor {
347        private final Document doc;
348        private boolean namespaceAware = true;
349        private Node dest;
350        private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available.
351        /*@Nullable*/ private final org.jsoup.nodes.Element contextElement; // todo - unsure why this can't be marked nullable?
352
353        public W3CBuilder(Document doc) {
354            this.doc = doc;
355            dest = doc;
356            contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty); // Track the context jsoup Element, so we can save the corresponding w3c element
357        }
358
359        @Override
360        public void head(org.jsoup.nodes.Node source, int depth) {
361            if (source instanceof org.jsoup.nodes.Element) {
362                org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
363                String namespace = namespaceAware ? sourceEl.tag().namespace() : null;
364                String tagName = Normalizer.xmlSafeTagName(sourceEl.tagName());
365                try {
366                    // use an empty namespace if none is present but the tag name has a prefix
367                    String imputedNamespace = namespace == null && tagName.contains(":") ? "" : namespace;
368                    Element el = doc.createElementNS(imputedNamespace, tagName);
369                    copyAttributes(sourceEl, el);
370                    append(el, sourceEl);
371                    if (sourceEl == contextElement)
372                        doc.setUserData(ContextNodeProperty, el, null);
373                    dest = el; // descend
374                } catch (DOMException e) {
375                    // If the Normalize didn't get it XML / W3C safe, inserts as plain text
376                    append(doc.createTextNode("<" + tagName + ">"), sourceEl);
377                }
378            } else if (source instanceof org.jsoup.nodes.TextNode) {
379                org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
380                Text text = doc.createTextNode(sourceText.getWholeText());
381                append(text, sourceText);
382            } else if (source instanceof org.jsoup.nodes.Comment) {
383                org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
384                Comment comment = doc.createComment(sourceComment.getData());
385                append(comment, sourceComment);
386            } else if (source instanceof org.jsoup.nodes.DataNode) {
387                org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
388                Text node = doc.createTextNode(sourceData.getWholeData());
389                append(node, sourceData);
390            } else {
391                // unhandled. note that doctype is not handled here - rather it is used in the initial doc creation
392            }
393        }
394
395        private void append(Node append, org.jsoup.nodes.Node source) {
396            append.setUserData(SourceProperty, source, null);
397            dest.appendChild(append);
398        }
399
400        @Override
401        public void tail(org.jsoup.nodes.Node source, int depth) {
402            if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) {
403                dest = dest.getParentNode(); // undescend
404            }
405        }
406
407        private void copyAttributes(org.jsoup.nodes.Element jEl, Element wEl) {
408            for (Attribute attribute : jEl.attributes()) {
409                try {
410                    setAttribute(jEl, wEl, attribute, syntax);
411                } catch (DOMException e) {
412                    if (syntax != Syntax.xml)
413                        setAttribute(jEl, wEl, attribute, Syntax.xml);
414                }
415            }
416        }
417
418        private void setAttribute(org.jsoup.nodes.Element jEl, Element wEl, Attribute attribute, Syntax syntax) throws DOMException {
419            String key = Attribute.getValidKey(attribute.getKey(), syntax);
420            if (key != null) {
421                String namespace = attribute.namespace();
422                if (namespaceAware && !namespace.isEmpty())
423                    wEl.setAttributeNS(namespace, key, attribute.getValue());
424                else
425                    wEl.setAttribute(key, attribute.getValue());
426                maybeAddUndeclaredNs(namespace, key, jEl, wEl);
427            }
428        }
429
430        /**
431         Add a namespace declaration for an attribute with a prefix if it is not already present. Ensures that attributes
432         with prefixes have the corresponding namespace declared, E.g. attribute "v-bind:foo" gets another attribute
433         "xmlns:v-bind='undefined'. So that the asString() transformation pass is valid.
434         If the parser was HTML we don't have a discovered namespace but we are trying to coerce it, so walk up the
435         element stack and find it.
436         */
437        private void maybeAddUndeclaredNs(String namespace, String attrKey, org.jsoup.nodes.Element jEl, Element wEl) {
438            if (!namespaceAware || !namespace.isEmpty()) return;
439            int pos = attrKey.indexOf(':');
440            if (pos != -1) { // prefixed but no namespace defined during parse, add a fake so that w3c serialization doesn't blow up
441                String prefix = attrKey.substring(0, pos);
442                if (prefix.equals("xmlns")) return;
443                org.jsoup.nodes.Document doc = jEl.ownerDocument();
444                if (doc != null && doc.parser().getTreeBuilder() instanceof HtmlTreeBuilder) {
445                    // try walking up the stack and seeing if there is a namespace declared for this prefix (and that we didn't parse because HTML)
446                    for (org.jsoup.nodes.Element el = jEl; el != null; el = el.parent()) {
447                        String ns = el.attr("xmlns:" + prefix);
448                        if (!ns.isEmpty()) {
449                            namespace = ns;
450                            // found it, set it
451                            wEl.setAttributeNS(namespace, attrKey, jEl.attr(attrKey));
452                            return;
453                        }
454                    }
455                }
456
457                // otherwise, put in a fake one
458                wEl.setAttribute("xmlns:" + prefix, undefinedNs);
459            }
460        }
461        private static final String undefinedNs = "undefined";
462    }
463
464}