001package org.jsoup.nodes;
002
003import org.jsoup.Connection;
004import org.jsoup.Jsoup;
005import org.jsoup.helper.DataUtil;
006import org.jsoup.helper.Validate;
007import org.jsoup.internal.StringUtil;
008import org.jsoup.parser.ParseSettings;
009import org.jsoup.parser.Parser;
010import org.jsoup.parser.Tag;
011import org.jsoup.select.Elements;
012import org.jsoup.select.Evaluator;
013import org.jsoup.select.Selector;
014import org.jspecify.annotations.Nullable;
015
016import java.nio.charset.Charset;
017import java.util.List;
018
019import static org.jsoup.parser.Parser.NamespaceHtml;
020
021/**
 An HTML Document.
023
024 @author Jonathan Hedley, jonathan@hedley.net */
025public class Document extends Element {
    private @Nullable Connection connection; // the connection this doc was fetched from, if any; null until set via connection(Connection)
    private OutputSettings outputSettings = new OutputSettings(); // serialization settings; starts with the OutputSettings defaults
    private Parser parser; // the parser used to parse this document
    private QuirksMode quirksMode = QuirksMode.noQuirks; // noQuirks until changed via quirksMode(QuirksMode)
    private final String location; // assigned from the base URI in the constructor; exposed by location()
031
    /**
     Create a new, empty Document, in the specified namespace. The document is associated with a default HTML parser;
     a different parser can be set afterwards with {@link #parser(Parser)}.
     @param namespace the namespace of this Document's root node.
     @param baseUri base URI of document
     @see org.jsoup.Jsoup#parse
     @see #createShell
     */
    public Document(String namespace, String baseUri) {
        this(namespace, baseUri, Parser.htmlParser()); // default HTML parser, but overridable
    }
042
043    private Document(String namespace, String baseUri, Parser parser) {
044        super(new Tag("#root", namespace), baseUri);
045        this.location = baseUri;
046        this.parser = parser;
047    }
048
    /**
     Create a new, empty Document, in the HTML namespace. Delegates to {@link #Document(String, String)} with
     {@code Parser.NamespaceHtml} as the namespace.
     @param baseUri base URI of document
     @see org.jsoup.Jsoup#parse
     @see #Document(String namespace, String baseUri)
     */
    public Document(String baseUri) {
        this(NamespaceHtml, baseUri);
    }
058
059    /**
060     Create a valid, empty shell of an HTML document, suitable for adding more elements to.
061     @param baseUri baseUri of document
062     @return document with html, head, and body elements.
063     */
064    public static Document createShell(String baseUri) {
065        Validate.notNull(baseUri);
066
067        Document doc = new Document(baseUri);
068        Element html = doc.appendElement("html");
069        html.appendElement("head");
070        html.appendElement("body");
071
072        return doc;
073    }
074
075    /**
076     * Get the URL this Document was parsed from. If the starting URL is a redirect,
077     * this will return the final URL from which the document was served from.
078     * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String).
079     * @return location
080     */
081    public String location() {
082        return location;
083    }
084
085    /**
086     Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new
087     default Connection object. This can be used to continue a session, preserving settings and cookies, etc.
088     @return the Connection (session) associated with this Document, or an empty one otherwise.
089     @see Connection#newRequest()
090     */
091    public Connection connection() {
092        if (connection == null)
093            return Jsoup.newSession();
094        else
095            return connection;
096    }
097
098    /**
099     * Returns this Document's doctype.
100     * @return document type, or null if not set
101     */
102    public @Nullable DocumentType documentType() {
103        for (Node node : childNodes) {
104            if (node instanceof DocumentType)
105                return (DocumentType) node;
106            else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc
107                break;
108        }
109        return null;
110    }
111
112    /**
113     Find the root HTML element, or create it if it doesn't exist.
114     @return the root HTML element.
115     */
116    private Element htmlEl() {
117        Element el = firstElementChild();
118        while (el != null) {
119            if (el.nameIs("html"))
120                return el;
121            el = el.nextElementSibling();
122        }
123        return appendElement("html");
124    }
125
126    /**
127     Get this document's {@code head} element.
128     <p>
129     As a side effect, if this Document does not already have an HTML structure, it will be created. If you do not want
130     that, use {@code #selectFirst("head")} instead.
131
132     @return {@code head} element.
133     */
134    public Element head() {
135        final Element html = htmlEl();
136        Element el = html.firstElementChild();
137        while (el != null) {
138            if (el.nameIs("head"))
139                return el;
140            el = el.nextElementSibling();
141        }
142        return html.prependElement("head");
143    }
144
145    /**
146     Get this document's {@code <body>} or {@code <frameset>} element.
147     <p>
148     As a <b>side-effect</b>, if this Document does not already have an HTML structure, it will be created with a {@code
149    <body>} element. If you do not want that, use {@code #selectFirst("body")} instead.
150
151     @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document
152     had no contents, or the outermost {@code <frameset> element} for frameset documents.
153     */
154    public Element body() {
155        final Element html = htmlEl();
156        Element el = html.firstElementChild();
157        while (el != null) {
158            if (el.nameIs("body") || el.nameIs("frameset"))
159                return el;
160            el = el.nextElementSibling();
161        }
162        return html.appendElement("body");
163    }
164
165    /**
166     Get each of the {@code <form>} elements contained in this document.
167     @return a List of FormElement objects, which will be empty if there are none.
168     @see Elements#forms()
169     @see FormElement#elements()
170     @since 1.15.4
171     */
172    public List<FormElement> forms() {
173        return select("form").forms();
174    }
175
176    /**
177     Selects the first {@link FormElement} in this document that matches the query. If none match, throws an
178     {@link IllegalArgumentException}.
179     @param cssQuery a {@link Selector} CSS query
180     @return the first matching {@code <form>} element
181     @throws IllegalArgumentException if no match is found
182     @since 1.15.4
183     */
184    public FormElement expectForm(String cssQuery) {
185        Elements els = select(cssQuery);
186        for (Element el : els) {
187            if (el instanceof FormElement) return (FormElement) el;
188        }
189        Validate.fail("No form elements matched the query '%s' in the document.", cssQuery);
190        return null; // (not really)
191    }
192
193    /**
194     Get the string contents of the document's {@code title} element.
195     @return Trimmed title, or empty string if none set.
196     */
197    public String title() {
198        // title is a preserve whitespace tag (for document output), but normalised here
199        Element titleEl = head().selectFirst(titleEval);
200        return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : "";
201    }
202    private static final Evaluator titleEval = new Evaluator.Tag("title");
203
204    /**
205     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
206     not present
207     @param title string to set as title
208     */
209    public void title(String title) {
210        Validate.notNull(title);
211        Element titleEl = head().selectFirst(titleEval);
212        if (titleEl == null) // add to head
213            titleEl = head().appendElement("title");
214        titleEl.text(title);
215    }
216
217    /**
218     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
219     @param tagName element tag name (e.g. {@code a})
220     @return new element
221     */
222    public Element createElement(String tagName) {
223        return new Element(parser.tagSet().valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri());
224    }
225
226    @Override
227    public String outerHtml() {
228        return super.html(); // no outer wrapper tag
229    }
230
231    /**
232     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
233     @param text un-encoded text
234     @return this document
235     */
236    @Override
237    public Element text(String text) {
238        body().text(text); // overridden to not nuke doc structure
239        return this;
240    }
241
    /**
     Get the node name of this document, which is always the fixed string {@code #document}.
     @return {@code "#document"}
     */
    @Override
    public String nodeName() {
        return "#document";
    }
246
247    /**
248     Set the output character set of this Document. This method is equivalent to
249     {@link OutputSettings#charset(java.nio.charset.Charset) OutputSettings.charset(Charset)}, but additionally adds or
250     updates the charset / encoding element within the Document.
251
252     <p>If there's no existing element with charset / encoding information yet, one will
253     be created. Obsolete charset / encoding definitions are removed.</p>
254
255     <p><b>Elements used:</b></p>
256
257     <ul>
258     <li><b>HTML:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
259     <li><b>XML:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
260     </ul>
261
262     @param charset Charset
263     @see OutputSettings#charset(java.nio.charset.Charset)
264     */
265    public void charset(Charset charset) {
266        outputSettings.charset(charset);
267        ensureMetaCharsetElement();
268    }
269
270    /**
271     Get the output character set of this Document. This method is equivalent to {@link OutputSettings#charset()}.
272
273     @return the current Charset
274     @see OutputSettings#charset()
275     */
276    public Charset charset() {
277        return outputSettings.charset();
278    }
279
280    @Override
281    public Document clone() {
282        Document clone = (Document) super.clone();
283        if (attributes != null) clone.attributes = attributes.clone();
284        clone.outputSettings = this.outputSettings.clone();
285        // parser is pointer copy
286        return clone;
287    }
288
289    @Override
290    public Document shallowClone() {
291        Document clone = new Document(this.tag().namespace(), baseUri(), parser); // preserves parser pointer
292        if (attributes != null) clone.attributes = attributes.clone();
293        clone.outputSettings = this.outputSettings.clone();
294        return clone;
295    }
296    
297
298    private void ensureMetaCharsetElement() {
299        OutputSettings.Syntax syntax = outputSettings().syntax();
300
301        if (syntax == OutputSettings.Syntax.html) {
302            Element metaCharset = selectFirst("meta[charset]");
303            if (metaCharset != null) {
304                metaCharset.attr("charset", charset().displayName());
305            } else {
306                head().appendElement("meta").attr("charset", charset().displayName());
307            }
308            select("meta[name=charset]").remove(); // Remove obsolete elements
309        } else if (syntax == OutputSettings.Syntax.xml) {
310            XmlDeclaration decl = ensureXmlDecl();
311            decl.attr("version", "1.0");
312            decl.attr("encoding", charset().displayName());
313        }
314    }
315
316    private XmlDeclaration ensureXmlDecl() {
317        Node node = firstChild();
318        if (node instanceof XmlDeclaration) {
319            XmlDeclaration decl = (XmlDeclaration) node;
320            if (decl.name().equals("xml")) return decl;
321        }
322        XmlDeclaration decl = new XmlDeclaration("xml", false);
323        prependChild(decl);
324        return decl;
325    }
326
327
328    /**
329     * A Document's output settings control the form of the text() and html() methods.
330     */
331    public static class OutputSettings implements Cloneable {
332        /**
333         * The output serialization syntax.
334         */
335        public enum Syntax {html, xml}
336        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
337        private Charset charset = DataUtil.UTF_8;
338        private boolean prettyPrint = true;
339        private boolean outline = false;
340        private int indentAmount = 1;
341        private int maxPaddingWidth = 30;
342        private Syntax syntax = Syntax.html;
343
344        /**
345         Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing,
346         indent amount of 1).
347         */
348        public OutputSettings() {
349        }
350
351        /**
352         Get the document's current entity escape mode:
353         <ul>
354         <li><code>xhtml</code>, the minimal named entities in XHTML / XML</li>
355         <li><code>base</code>, which provides a limited set of named HTML
356         entities and escapes other characters as numbered entities for maximum compatibility</li>
357         <li><code>extended</code>,
358         which uses the complete set of HTML named entities.</li>
359         </ul>
360         <p>The default escape mode is <code>base</code>.
361         @return the document's current escape mode
362         */
363        public Entities.EscapeMode escapeMode() {
364            return escapeMode;
365        }
366
367        /**
368         * Set the document's escape mode, which determines how characters are escaped when the output character set
369         * does not support a given character:- using either a named or a numbered escape.
370         * @param escapeMode the new escape mode to use
371         * @return the document's output settings, for chaining
372         */
373        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
374            this.escapeMode = escapeMode;
375            return this;
376        }
377
378        /**
379         * Get the document's current output charset, which is used to control which characters are escaped when
380         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
381         * <p>
382         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
383         * input charset. Otherwise, it defaults to UTF-8.
384         * @return the document's current charset.
385         */
386        public Charset charset() {
387            return charset;
388        }
389
390        /**
391         * Update the document's output charset.
392         * @param charset the new charset to use.
393         * @return the document's output settings, for chaining
394         */
395        public OutputSettings charset(Charset charset) {
396            this.charset = charset;
397            return this;
398        }
399
400        /**
401         * Update the document's output charset.
402         * @param charset the new charset (by name) to use.
403         * @return the document's output settings, for chaining
404         */
405        public OutputSettings charset(String charset) {
406            charset(Charset.forName(charset));
407            return this;
408        }
409
410        /**
411         * Get the document's current output syntax.
412         * @return current syntax
413         */
414        public Syntax syntax() {
415            return syntax;
416        }
417
418        /**
419         * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
420         * {@code xml}, with self-closing tags.
421         * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is
422         * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p>
423         * @param syntax serialization syntax
424         * @return the document's output settings, for chaining
425         */
426        public OutputSettings syntax(Syntax syntax) {
427            this.syntax = syntax;
428            if (syntax == Syntax.xml)
429                this.escapeMode(Entities.EscapeMode.xhtml);
430            return this;
431        }
432
433        /**
434         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
435         * the output, and the output will generally look like the input.
436         * @return if pretty printing is enabled.
437         */
438        public boolean prettyPrint() {
439            return prettyPrint;
440        }
441
442        /**
443         * Enable or disable pretty printing.
444         * @param pretty new pretty print setting
445         * @return this, for chaining
446         */
447        public OutputSettings prettyPrint(boolean pretty) {
448            prettyPrint = pretty;
449            return this;
450        }
451        
452        /**
453         * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
454         * all tags as block.
455         * @return if outline mode is enabled.
456         */
457        public boolean outline() {
458            return outline;
459        }
460        
461        /**
462         * Enable or disable HTML outline mode.
463         * @param outlineMode new outline setting
464         * @return this, for chaining
465         */
466        public OutputSettings outline(boolean outlineMode) {
467            outline = outlineMode;
468            return this;
469        }
470
471        /**
472         * Get the current tag indent amount, used when pretty printing.
473         * @return the current indent amount
474         */
475        public int indentAmount() {
476            return indentAmount;
477        }
478
479        /**
480         * Set the indent amount for pretty printing
481         * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0.
482         * @return this, for chaining
483         */
484        public OutputSettings indentAmount(int indentAmount) {
485            Validate.isTrue(indentAmount >= 0);
486            this.indentAmount = indentAmount;
487            return this;
488        }
489
490        /**
491         * Get the current max padding amount, used when pretty printing
492         * so very deeply nested nodes don't get insane padding amounts.
493         * @return the current indent amount
494         */
495        public int maxPaddingWidth() {
496            return maxPaddingWidth;
497        }
498
499        /**
500         * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts.
501         * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1.
502         *        Default is 30 and -1 means unlimited.
503         * @return this, for chaining
504         */
505        public OutputSettings maxPaddingWidth(int maxPaddingWidth) {
506            Validate.isTrue(maxPaddingWidth >= -1);
507            this.maxPaddingWidth = maxPaddingWidth;
508            return this;
509        }
510
511        @Override
512        public OutputSettings clone() {
513            OutputSettings clone;
514            try {
515                clone = (OutputSettings) super.clone();
516            } catch (CloneNotSupportedException e) {
517                throw new RuntimeException(e);
518            }
519            clone.charset(charset.name()); // new charset, coreCharset, and charset encoder
520            clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
521            // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle
522            return clone;
523        }
524    }
525
526    /**
527     * Get the document's current output settings.
528     * @return the document's current output settings.
529     */
530    public OutputSettings outputSettings() {
531        return outputSettings;
532    }
533
    /**
     * Set the document's output settings. The supplied object is stored as-is (it is not copied).
     * @param outputSettings new output settings; must not be null.
     * @return this document, for chaining.
     */
    public Document outputSettings(OutputSettings outputSettings) {
        Validate.notNull(outputSettings);
        this.outputSettings = outputSettings;
        return this;
    }
544
    /**
     * The quirks mode recorded for a Document; see {@link #quirksMode()}. New documents start as {@code noQuirks}.
     * NOTE(review): presumably these correspond to the HTML specification's document modes (no-quirks, quirks,
     * limited-quirks) -- confirm against the parser that sets them.
     */
    public enum QuirksMode {
        noQuirks, quirks, limitedQuirks
    }
548
549    public QuirksMode quirksMode() {
550        return quirksMode;
551    }
552
    /**
     * Set the quirks mode recorded for this document.
     * @param quirksMode the new quirks mode
     * @return this document, for chaining
     */
    public Document quirksMode(QuirksMode quirksMode) {
        this.quirksMode = quirksMode;
        return this;
    }
557
558    /**
559     * Get the parser that was used to parse this document.
560     * @return the parser
561     */
562    public Parser parser() {
563        return parser;
564    }
565
566    /**
567     * Set the parser used to create this document. This parser is then used when further parsing within this document
568     * is required.
569     * @param parser the configured parser to use when further parsing is required for this document.
570     * @return this document, for chaining.
571     */
572    public Document parser(Parser parser) {
573        this.parser = parser;
574        return this;
575    }
576
    /**
     Set the Connection used to fetch this document. This Connection is used as a session object when further requests are
     made (e.g. when a form is submitted), and is what {@link #connection()} returns from then on.

     @param connection to set; must not be null
     @return this document, for chaining
     @see Connection#newRequest()
     @since 1.14.1
     */
    public Document connection(Connection connection) {
        Validate.notNull(connection);
        this.connection = connection;
        return this;
    }
591}