001package org.jsoup.nodes;
002
003import org.jsoup.Connection;
004import org.jsoup.Jsoup;
005import org.jsoup.helper.DataUtil;
006import org.jsoup.helper.Validate;
007import org.jsoup.internal.StringUtil;
008import org.jsoup.parser.ParseSettings;
009import org.jsoup.parser.Parser;
010import org.jsoup.parser.Tag;
011import org.jsoup.select.Elements;
012import org.jsoup.select.Evaluator;
013import org.jsoup.select.Selector;
014import org.jspecify.annotations.Nullable;
015
016import java.nio.charset.Charset;
017import java.util.List;
018
019import static org.jsoup.parser.Parser.NamespaceHtml;
020
021/**
022 A HTML Document.
023
024 @author Jonathan Hedley, jonathan@hedley.net */
025public class Document extends Element {
    /** The connection this doc was fetched from, if any; when unset, {@link #connection()} hands out a new session. */
    private @Nullable Connection connection;
    /** The output settings for this document; swapped wholesale by {@link #outputSettings(OutputSettings)}. */
    private OutputSettings outputSettings = new OutputSettings();
    /** The parser used to parse this document; also consulted by {@link #createElement(String)}. */
    private Parser parser;
    /** The quirks mode of this document; starts out as {@link QuirksMode#noQuirks}. */
    private QuirksMode quirksMode = QuirksMode.noQuirks;
    /** The base URI given at construction time; returned as-is by {@link #location()}. */
    private final String location;
031
032    /**
033     Create a new, empty Document, in the specified namespace.
034     @param namespace the namespace of this Document's root node.
035     @param baseUri base URI of document
036     @see org.jsoup.Jsoup#parse
037     @see #createShell
038     */
039    public Document(String namespace, String baseUri) {
040        this(namespace, baseUri, Parser.htmlParser()); // default HTML parser, but overridable
041    }
042
043    private Document(String namespace, String baseUri, Parser parser) {
044        super(new Tag("#root", namespace), baseUri);
045        this.location = baseUri;
046        this.parser = parser;
047    }
048
049    /**
050     Create a new, empty Document, in the HTML namespace.
051     @param baseUri base URI of document
052     @see org.jsoup.Jsoup#parse
053     @see #Document(String namespace, String baseUri)
054     */
055    public Document(String baseUri) {
056        this(NamespaceHtml, baseUri);
057    }
058
059    /**
060     Create a valid, empty shell of an HTML document, suitable for adding more elements to.
061     @param baseUri baseUri of document
062     @return document with html, head, and body elements.
063     */
064    public static Document createShell(String baseUri) {
065        Validate.notNull(baseUri);
066
067        Document doc = new Document(baseUri);
068        Element html = doc.appendElement("html");
069        html.appendElement("head");
070        html.appendElement("body");
071
072        return doc;
073    }
074
075    /**
076     * Get the URL this Document was parsed from. If the starting URL is a redirect,
077     * this will return the final URL from which the document was served from.
078     * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String).
079     * @return location
080     */
081    public String location() {
082        return location;
083    }
084
085    /**
086     Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new
087     default Connection object. This can be used to continue a session, preserving settings and cookies, etc.
088     @return the Connection (session) associated with this Document, or an empty one otherwise.
089     @see Connection#newRequest()
090     */
091    public Connection connection() {
092        if (connection == null)
093            return Jsoup.newSession();
094        else
095            return connection;
096    }
097
098    /**
099     * Returns this Document's doctype.
100     * @return document type, or null if not set
101     */
102    public @Nullable DocumentType documentType() {
103        for (Node node : childNodes) {
104            if (node instanceof DocumentType)
105                return (DocumentType) node;
106            else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc
107                break;
108        }
109        return null;
110        // todo - add a set document type?
111    }
112
113    /**
114     Find the root HTML element, or create it if it doesn't exist.
115     @return the root HTML element.
116     */
117    private Element htmlEl() {
118        Element el = firstElementChild();
119        while (el != null) {
120            if (el.nameIs("html"))
121                return el;
122            el = el.nextElementSibling();
123        }
124        return appendElement("html");
125    }
126
127    /**
128     Get this document's {@code head} element.
129     <p>
130     As a side effect, if this Document does not already have an HTML structure, it will be created. If you do not want
131     that, use {@code #selectFirst("head")} instead.
132
133     @return {@code head} element.
134     */
135    public Element head() {
136        final Element html = htmlEl();
137        Element el = html.firstElementChild();
138        while (el != null) {
139            if (el.nameIs("head"))
140                return el;
141            el = el.nextElementSibling();
142        }
143        return html.prependElement("head");
144    }
145
146    /**
147     Get this document's {@code <body>} or {@code <frameset>} element.
148     <p>
149     As a <b>side-effect</b>, if this Document does not already have an HTML structure, it will be created with a {@code
150    <body>} element. If you do not want that, use {@code #selectFirst("body")} instead.
151
152     @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document
153     had no contents, or the outermost {@code <frameset> element} for frameset documents.
154     */
155    public Element body() {
156        final Element html = htmlEl();
157        Element el = html.firstElementChild();
158        while (el != null) {
159            if (el.nameIs("body") || el.nameIs("frameset"))
160                return el;
161            el = el.nextElementSibling();
162        }
163        return html.appendElement("body");
164    }
165
166    /**
167     Get each of the {@code <form>} elements contained in this document.
168     @return a List of FormElement objects, which will be empty if there are none.
169     @see Elements#forms()
170     @see FormElement#elements()
171     @since 1.15.4
172     */
173    public List<FormElement> forms() {
174        return select("form").forms();
175    }
176
177    /**
178     Selects the first {@link FormElement} in this document that matches the query. If none match, throws an
179     {@link IllegalArgumentException}.
180     @param cssQuery a {@link Selector} CSS query
181     @return the first matching {@code <form>} element
182     @throws IllegalArgumentException if no match is found
183     @since 1.15.4
184     */
185    public FormElement expectForm(String cssQuery) {
186        Elements els = select(cssQuery);
187        for (Element el : els) {
188            if (el instanceof FormElement) return (FormElement) el;
189        }
190        Validate.fail("No form elements matched the query '%s' in the document.", cssQuery);
191        return null; // (not really)
192    }
193
194    /**
195     Get the string contents of the document's {@code title} element.
196     @return Trimmed title, or empty string if none set.
197     */
198    public String title() {
199        // title is a preserve whitespace tag (for document output), but normalised here
200        Element titleEl = head().selectFirst(titleEval);
201        return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : "";
202    }
203    private static final Evaluator titleEval = new Evaluator.Tag("title");
204
205    /**
206     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
207     not present
208     @param title string to set as title
209     */
210    public void title(String title) {
211        Validate.notNull(title);
212        Element titleEl = head().selectFirst(titleEval);
213        if (titleEl == null) // add to head
214            titleEl = head().appendElement("title");
215        titleEl.text(title);
216    }
217
218    /**
219     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
220     @param tagName element tag name (e.g. {@code a})
221     @return new element
222     */
223    public Element createElement(String tagName) {
224        return new Element(parser.tagSet().valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri());
225    }
226
227    @Override
228    public String outerHtml() {
229        return super.html(); // no outer wrapper tag
230    }
231
232    /**
233     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
234     @param text un-encoded text
235     @return this document
236     */
237    @Override
238    public Element text(String text) {
239        body().text(text); // overridden to not nuke doc structure
240        return this;
241    }
242
243    @Override
244    public String nodeName() {
245        return "#document";
246    }
247
248    /**
249     Set the output character set of this Document. This method is equivalent to
250     {@link OutputSettings#charset(java.nio.charset.Charset) OutputSettings.charset(Charset)}, but additionally adds or
251     updates the charset / encoding element within the Document.
252
253     <p>If there's no existing element with charset / encoding information yet, one will
254     be created. Obsolete charset / encoding definitions are removed.</p>
255
256     <p><b>Elements used:</b></p>
257
258     <ul>
259     <li><b>HTML:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
260     <li><b>XML:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
261     </ul>
262
263     @param charset Charset
264     @see OutputSettings#charset(java.nio.charset.Charset)
265     */
266    public void charset(Charset charset) {
267        outputSettings.charset(charset);
268        ensureMetaCharsetElement();
269    }
270
271    /**
272     Get the output character set of this Document. This method is equivalent to {@link OutputSettings#charset()}.
273
274     @return the current Charset
275     @see OutputSettings#charset()
276     */
277    public Charset charset() {
278        return outputSettings.charset();
279    }
280
281    /**
282     @deprecated this setting has no effect; the meta charset element is always updated when
283     {@link Document#charset(Charset)} is called. This method will be removed in jsoup 1.20.1.
284     */
285    @Deprecated
286    public void updateMetaCharsetElement(boolean noop) {}
287
288    /**
289     @deprecated this setting has no effect; the meta charset element is always updated when
290     {@link Document#charset(Charset)} is called. This method will be removed in jsoup 1.20.1.
291     */
292    @Deprecated
293    public boolean updateMetaCharsetElement() {
294        return true;
295    }
296
297    @Override
298    public Document clone() {
299        Document clone = (Document) super.clone();
300        if (attributes != null) clone.attributes = attributes.clone();
301        clone.outputSettings = this.outputSettings.clone();
302        // parser is pointer copy
303        return clone;
304    }
305
306    @Override
307    public Document shallowClone() {
308        Document clone = new Document(this.tag().namespace(), baseUri(), parser); // preserves parser pointer
309        if (attributes != null) clone.attributes = attributes.clone();
310        clone.outputSettings = this.outputSettings.clone();
311        return clone;
312    }
313    
314
315    private void ensureMetaCharsetElement() {
316        OutputSettings.Syntax syntax = outputSettings().syntax();
317
318        if (syntax == OutputSettings.Syntax.html) {
319            Element metaCharset = selectFirst("meta[charset]");
320            if (metaCharset != null) {
321                metaCharset.attr("charset", charset().displayName());
322            } else {
323                head().appendElement("meta").attr("charset", charset().displayName());
324            }
325            select("meta[name=charset]").remove(); // Remove obsolete elements
326        } else if (syntax == OutputSettings.Syntax.xml) {
327            XmlDeclaration decl = ensureXmlDecl();
328            decl.attr("version", "1.0");
329            decl.attr("encoding", charset().displayName());
330        }
331    }
332
333    private XmlDeclaration ensureXmlDecl() {
334        Node node = firstChild();
335        if (node instanceof XmlDeclaration) {
336            XmlDeclaration decl = (XmlDeclaration) node;
337            if (decl.name().equals("xml")) return decl;
338        }
339        XmlDeclaration decl = new XmlDeclaration("xml", false);
340        prependChild(decl);
341        return decl;
342    }
343
344
345    /**
346     * A Document's output settings control the form of the text() and html() methods.
347     */
348    public static class OutputSettings implements Cloneable {
349        /**
350         * The output serialization syntax.
351         */
352        public enum Syntax {html, xml}
353        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
354        private Charset charset = DataUtil.UTF_8;
355        private boolean prettyPrint = true;
356        private boolean outline = false;
357        private int indentAmount = 1;
358        private int maxPaddingWidth = 30;
359        private Syntax syntax = Syntax.html;
360
361        /**
362         Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing,
363         indent amount of 1).
364         */
365        public OutputSettings() {
366        }
367
368        /**
369         Get the document's current entity escape mode:
370         <ul>
371         <li><code>xhtml</code>, the minimal named entities in XHTML / XML</li>
372         <li><code>base</code>, which provides a limited set of named HTML
373         entities and escapes other characters as numbered entities for maximum compatibility</li>
374         <li><code>extended</code>,
375         which uses the complete set of HTML named entities.</li>
376         </ul>
377         <p>The default escape mode is <code>base</code>.
378         @return the document's current escape mode
379         */
380        public Entities.EscapeMode escapeMode() {
381            return escapeMode;
382        }
383
384        /**
385         * Set the document's escape mode, which determines how characters are escaped when the output character set
386         * does not support a given character:- using either a named or a numbered escape.
387         * @param escapeMode the new escape mode to use
388         * @return the document's output settings, for chaining
389         */
390        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
391            this.escapeMode = escapeMode;
392            return this;
393        }
394
395        /**
396         * Get the document's current output charset, which is used to control which characters are escaped when
397         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
398         * <p>
399         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
400         * input charset. Otherwise, it defaults to UTF-8.
401         * @return the document's current charset.
402         */
403        public Charset charset() {
404            return charset;
405        }
406
407        /**
408         * Update the document's output charset.
409         * @param charset the new charset to use.
410         * @return the document's output settings, for chaining
411         */
412        public OutputSettings charset(Charset charset) {
413            this.charset = charset;
414            return this;
415        }
416
417        /**
418         * Update the document's output charset.
419         * @param charset the new charset (by name) to use.
420         * @return the document's output settings, for chaining
421         */
422        public OutputSettings charset(String charset) {
423            charset(Charset.forName(charset));
424            return this;
425        }
426
427        /**
428         * Get the document's current output syntax.
429         * @return current syntax
430         */
431        public Syntax syntax() {
432            return syntax;
433        }
434
435        /**
436         * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
437         * {@code xml}, with self-closing tags.
438         * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is
439         * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p>
440         * @param syntax serialization syntax
441         * @return the document's output settings, for chaining
442         */
443        public OutputSettings syntax(Syntax syntax) {
444            this.syntax = syntax;
445            if (syntax == Syntax.xml)
446                this.escapeMode(Entities.EscapeMode.xhtml);
447            return this;
448        }
449
450        /**
451         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
452         * the output, and the output will generally look like the input.
453         * @return if pretty printing is enabled.
454         */
455        public boolean prettyPrint() {
456            return prettyPrint;
457        }
458
459        /**
460         * Enable or disable pretty printing.
461         * @param pretty new pretty print setting
462         * @return this, for chaining
463         */
464        public OutputSettings prettyPrint(boolean pretty) {
465            prettyPrint = pretty;
466            return this;
467        }
468        
469        /**
470         * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
471         * all tags as block.
472         * @return if outline mode is enabled.
473         */
474        public boolean outline() {
475            return outline;
476        }
477        
478        /**
479         * Enable or disable HTML outline mode.
480         * @param outlineMode new outline setting
481         * @return this, for chaining
482         */
483        public OutputSettings outline(boolean outlineMode) {
484            outline = outlineMode;
485            return this;
486        }
487
488        /**
489         * Get the current tag indent amount, used when pretty printing.
490         * @return the current indent amount
491         */
492        public int indentAmount() {
493            return indentAmount;
494        }
495
496        /**
497         * Set the indent amount for pretty printing
498         * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0.
499         * @return this, for chaining
500         */
501        public OutputSettings indentAmount(int indentAmount) {
502            Validate.isTrue(indentAmount >= 0);
503            this.indentAmount = indentAmount;
504            return this;
505        }
506
507        /**
508         * Get the current max padding amount, used when pretty printing
509         * so very deeply nested nodes don't get insane padding amounts.
510         * @return the current indent amount
511         */
512        public int maxPaddingWidth() {
513            return maxPaddingWidth;
514        }
515
516        /**
517         * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts.
518         * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1.
519         *        Default is 30 and -1 means unlimited.
520         * @return this, for chaining
521         */
522        public OutputSettings maxPaddingWidth(int maxPaddingWidth) {
523            Validate.isTrue(maxPaddingWidth >= -1);
524            this.maxPaddingWidth = maxPaddingWidth;
525            return this;
526        }
527
528        @Override
529        public OutputSettings clone() {
530            OutputSettings clone;
531            try {
532                clone = (OutputSettings) super.clone();
533            } catch (CloneNotSupportedException e) {
534                throw new RuntimeException(e);
535            }
536            clone.charset(charset.name()); // new charset, coreCharset, and charset encoder
537            clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
538            // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle
539            return clone;
540        }
541    }
542
543    /**
544     * Get the document's current output settings.
545     * @return the document's current output settings.
546     */
547    public OutputSettings outputSettings() {
548        return outputSettings;
549    }
550
551    /**
552     * Set the document's output settings.
553     * @param outputSettings new output settings.
554     * @return this document, for chaining.
555     */
556    public Document outputSettings(OutputSettings outputSettings) {
557        Validate.notNull(outputSettings);
558        this.outputSettings = outputSettings;
559        return this;
560    }
561
562    public enum QuirksMode {
563        noQuirks, quirks, limitedQuirks
564    }
565
566    public QuirksMode quirksMode() {
567        return quirksMode;
568    }
569
570    public Document quirksMode(QuirksMode quirksMode) {
571        this.quirksMode = quirksMode;
572        return this;
573    }
574
575    /**
576     * Get the parser that was used to parse this document.
577     * @return the parser
578     */
579    public Parser parser() {
580        return parser;
581    }
582
583    /**
584     * Set the parser used to create this document. This parser is then used when further parsing within this document
585     * is required.
586     * @param parser the configured parser to use when further parsing is required for this document.
587     * @return this document, for chaining.
588     */
589    public Document parser(Parser parser) {
590        this.parser = parser;
591        return this;
592    }
593
594    /**
595     Set the Connection used to fetch this document. This Connection is used as a session object when further requests are
596     made (e.g. when a form is submitted).
597
598     @param connection to set
599     @return this document, for chaining
600     @see Connection#newRequest()
601     @since 1.14.1
602     */
603    public Document connection(Connection connection) {
604        Validate.notNull(connection);
605        this.connection = connection;
606        return this;
607    }
608}