001package org.jsoup.nodes;
002
003import org.jsoup.Connection;
004import org.jsoup.Jsoup;
005import org.jsoup.helper.DataUtil;
006import org.jsoup.helper.Validate;
007import org.jsoup.internal.StringUtil;
008import org.jsoup.parser.ParseSettings;
009import org.jsoup.parser.Parser;
010import org.jsoup.parser.Tag;
011import org.jsoup.select.Elements;
012import org.jsoup.select.Evaluator;
013import org.jsoup.select.Selector;
014import org.jspecify.annotations.Nullable;
015
016import java.nio.charset.Charset;
017import java.util.List;
018
019import static org.jsoup.parser.Parser.NamespaceHtml;
020
021/**
 An HTML Document.
023
024 @author Jonathan Hedley, jonathan@hedley.net */
025public class Document extends Element {
    private @Nullable Connection connection; // the connection this doc was fetched from, if any; null until connection(Connection) is called
    private OutputSettings outputSettings = new OutputSettings(); // serialization settings; replaced wholesale by outputSettings(OutputSettings)
    private Parser parser; // the parser used to parse this document; initialised to the HTML parser in the constructor
    private QuirksMode quirksMode = QuirksMode.noQuirks; // stays noQuirks unless changed via quirksMode(QuirksMode)
    private final String location; // assigned once from the constructor's baseUri and returned by location()
031
032    /**
033     Create a new, empty Document, in the specified namespace.
034     @param namespace the namespace of this Document's root node.
035     @param baseUri base URI of document
036     @see org.jsoup.Jsoup#parse
037     @see #createShell
038     */
039    public Document(String namespace, String baseUri) {
040        super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri);
041        this.location = baseUri;
042        this.parser = Parser.htmlParser(); // default, but overridable
043    }
044
045    /**
046     Create a new, empty Document, in the HTML namespace.
047     @param baseUri base URI of document
048     @see org.jsoup.Jsoup#parse
049     @see #Document(String namespace, String baseUri)
050     */
051    public Document(String baseUri) {
052        this(NamespaceHtml, baseUri);
053    }
054
055    /**
056     Create a valid, empty shell of an HTML document, suitable for adding more elements to.
057     @param baseUri baseUri of document
058     @return document with html, head, and body elements.
059     */
060    public static Document createShell(String baseUri) {
061        Validate.notNull(baseUri);
062
063        Document doc = new Document(baseUri);
064        Element html = doc.appendElement("html");
065        html.appendElement("head");
066        html.appendElement("body");
067
068        return doc;
069    }
070
071    /**
072     * Get the URL this Document was parsed from. If the starting URL is a redirect,
073     * this will return the final URL from which the document was served from.
074     * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String).
075     * @return location
076     */
077    public String location() {
078        return location;
079    }
080
081    /**
082     Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new
083     default Connection object. This can be used to continue a session, preserving settings and cookies, etc.
084     @return the Connection (session) associated with this Document, or an empty one otherwise.
085     @see Connection#newRequest()
086     */
087    public Connection connection() {
088        if (connection == null)
089            return Jsoup.newSession();
090        else
091            return connection;
092    }
093
094    /**
095     * Returns this Document's doctype.
096     * @return document type, or null if not set
097     */
098    public @Nullable DocumentType documentType() {
099        for (Node node : childNodes) {
100            if (node instanceof DocumentType)
101                return (DocumentType) node;
102            else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc
103                break;
104        }
105        return null;
106        // todo - add a set document type?
107    }
108
109    /**
110     Find the root HTML element, or create it if it doesn't exist.
111     @return the root HTML element.
112     */
113    private Element htmlEl() {
114        Element el = firstElementChild();
115        while (el != null) {
116            if (el.nameIs("html"))
117                return el;
118            el = el.nextElementSibling();
119        }
120        return appendElement("html");
121    }
122
123    /**
124     Get this document's {@code head} element.
125     <p>
126     As a side effect, if this Document does not already have an HTML structure, it will be created. If you do not want
127     that, use {@code #selectFirst("head")} instead.
128
129     @return {@code head} element.
130     */
131    public Element head() {
132        final Element html = htmlEl();
133        Element el = html.firstElementChild();
134        while (el != null) {
135            if (el.nameIs("head"))
136                return el;
137            el = el.nextElementSibling();
138        }
139        return html.prependElement("head");
140    }
141
142    /**
143     Get this document's {@code <body>} or {@code <frameset>} element.
144     <p>
145     As a <b>side-effect</b>, if this Document does not already have an HTML structure, it will be created with a {@code
146    <body>} element. If you do not want that, use {@code #selectFirst("body")} instead.
147
148     @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document
149     had no contents, or the outermost {@code <frameset> element} for frameset documents.
150     */
151    public Element body() {
152        final Element html = htmlEl();
153        Element el = html.firstElementChild();
154        while (el != null) {
155            if (el.nameIs("body") || el.nameIs("frameset"))
156                return el;
157            el = el.nextElementSibling();
158        }
159        return html.appendElement("body");
160    }
161
162    /**
163     Get each of the {@code <form>} elements contained in this document.
164     @return a List of FormElement objects, which will be empty if there are none.
165     @see Elements#forms()
166     @see FormElement#elements()
167     @since 1.15.4
168     */
169    public List<FormElement> forms() {
170        return select("form").forms();
171    }
172
173    /**
174     Selects the first {@link FormElement} in this document that matches the query. If none match, throws an
175     {@link IllegalArgumentException}.
176     @param cssQuery a {@link Selector} CSS query
177     @return the first matching {@code <form>} element
178     @throws IllegalArgumentException if no match is found
179     @since 1.15.4
180     */
181    public FormElement expectForm(String cssQuery) {
182        Elements els = select(cssQuery);
183        for (Element el : els) {
184            if (el instanceof FormElement) return (FormElement) el;
185        }
186        Validate.fail("No form elements matched the query '%s' in the document.", cssQuery);
187        return null; // (not really)
188    }
189
190    /**
191     Get the string contents of the document's {@code title} element.
192     @return Trimmed title, or empty string if none set.
193     */
194    public String title() {
195        // title is a preserve whitespace tag (for document output), but normalised here
196        Element titleEl = head().selectFirst(titleEval);
197        return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : "";
198    }
199    private static final Evaluator titleEval = new Evaluator.Tag("title");
200
201    /**
202     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
203     not present
204     @param title string to set as title
205     */
206    public void title(String title) {
207        Validate.notNull(title);
208        Element titleEl = head().selectFirst(titleEval);
209        if (titleEl == null) // add to head
210            titleEl = head().appendElement("title");
211        titleEl.text(title);
212    }
213
214    /**
215     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
216     @param tagName element tag name (e.g. {@code a})
217     @return new element
218     */
219    public Element createElement(String tagName) {
220        return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri());
221    }
222
223    @Override
224    public String outerHtml() {
225        return super.html(); // no outer wrapper tag
226    }
227
228    /**
229     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
230     @param text un-encoded text
231     @return this document
232     */
233    @Override
234    public Element text(String text) {
235        body().text(text); // overridden to not nuke doc structure
236        return this;
237    }
238
239    @Override
240    public String nodeName() {
241        return "#document";
242    }
243
244    /**
245     Set the output character set of this Document. This method is equivalent to
246     {@link OutputSettings#charset(java.nio.charset.Charset) OutputSettings.charset(Charset)}, but additionally adds or
247     updates the charset / encoding element within the Document.
248
249     <p>If there's no existing element with charset / encoding information yet, one will
250     be created. Obsolete charset / encoding definitions are removed.</p>
251
252     <p><b>Elements used:</b></p>
253
254     <ul>
255     <li><b>HTML:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
256     <li><b>XML:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
257     </ul>
258
259     @param charset Charset
260     @see OutputSettings#charset(java.nio.charset.Charset)
261     */
262    public void charset(Charset charset) {
263        outputSettings.charset(charset);
264        ensureMetaCharsetElement();
265    }
266
267    /**
268     Get the output character set of this Document. This method is equivalent to {@link OutputSettings#charset()}.
269
270     @return the current Charset
271     @see OutputSettings#charset()
272     */
273    public Charset charset() {
274        return outputSettings.charset();
275    }
276
277    /**
278     @deprecated this setting has no effect; the meta charset element is always updated when
279     {@link Document#charset(Charset)} is called. This method will be removed in jsoup 1.20.1.
280     */
281    @Deprecated
282    public void updateMetaCharsetElement(boolean noop) {}
283
284    /**
285     @deprecated this setting has no effect; the meta charset element is always updated when
286     {@link Document#charset(Charset)} is called. This method will be removed in jsoup 1.20.1.
287     */
288    @Deprecated
289    public boolean updateMetaCharsetElement() {
290        return true;
291    }
292
293    @Override
294    public Document clone() {
295        Document clone = (Document) super.clone();
296        clone.outputSettings = this.outputSettings.clone();
297        clone.parser = this.parser.clone();
298        return clone;
299    }
300
301    @Override
302    public Document shallowClone() {
303        Document clone = new Document(this.tag().namespace(), baseUri());
304        if (attributes != null)
305            clone.attributes = attributes.clone();
306        clone.outputSettings = this.outputSettings.clone();
307        return clone;
308    }
309    
310
311    private void ensureMetaCharsetElement() {
312        OutputSettings.Syntax syntax = outputSettings().syntax();
313
314        if (syntax == OutputSettings.Syntax.html) {
315            Element metaCharset = selectFirst("meta[charset]");
316            if (metaCharset != null) {
317                metaCharset.attr("charset", charset().displayName());
318            } else {
319                head().appendElement("meta").attr("charset", charset().displayName());
320            }
321            select("meta[name=charset]").remove(); // Remove obsolete elements
322        } else if (syntax == OutputSettings.Syntax.xml) {
323            XmlDeclaration decl = ensureXmlDecl();
324            decl.attr("version", "1.0");
325            decl.attr("encoding", charset().displayName());
326        }
327    }
328
329    private XmlDeclaration ensureXmlDecl() {
330        Node node = firstChild();
331        if (node instanceof XmlDeclaration) {
332            XmlDeclaration decl = (XmlDeclaration) node;
333            if (decl.name().equals("xml")) return decl;
334        }
335        XmlDeclaration decl = new XmlDeclaration("xml", false);
336        prependChild(decl);
337        return decl;
338    }
339
340
341    /**
342     * A Document's output settings control the form of the text() and html() methods.
343     */
344    public static class OutputSettings implements Cloneable {
345        /**
346         * The output serialization syntax.
347         */
348        public enum Syntax {html, xml}
349        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
350        private Charset charset = DataUtil.UTF_8;
351        private boolean prettyPrint = true;
352        private boolean outline = false;
353        private int indentAmount = 1;
354        private int maxPaddingWidth = 30;
355        private Syntax syntax = Syntax.html;
356
357        /**
358         Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing,
359         indent amount of 1).
360         */
361        public OutputSettings() {
362        }
363
364        /**
365         Get the document's current entity escape mode:
366         <ul>
367         <li><code>xhtml</code>, the minimal named entities in XHTML / XML</li>
368         <li><code>base</code>, which provides a limited set of named HTML
369         entities and escapes other characters as numbered entities for maximum compatibility</li>
370         <li><code>extended</code>,
371         which uses the complete set of HTML named entities.</li>
372         </ul>
373         <p>The default escape mode is <code>base</code>.
374         @return the document's current escape mode
375         */
376        public Entities.EscapeMode escapeMode() {
377            return escapeMode;
378        }
379
380        /**
381         * Set the document's escape mode, which determines how characters are escaped when the output character set
382         * does not support a given character:- using either a named or a numbered escape.
383         * @param escapeMode the new escape mode to use
384         * @return the document's output settings, for chaining
385         */
386        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
387            this.escapeMode = escapeMode;
388            return this;
389        }
390
391        /**
392         * Get the document's current output charset, which is used to control which characters are escaped when
393         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
394         * <p>
395         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
396         * input charset. Otherwise, it defaults to UTF-8.
397         * @return the document's current charset.
398         */
399        public Charset charset() {
400            return charset;
401        }
402
403        /**
404         * Update the document's output charset.
405         * @param charset the new charset to use.
406         * @return the document's output settings, for chaining
407         */
408        public OutputSettings charset(Charset charset) {
409            this.charset = charset;
410            return this;
411        }
412
413        /**
414         * Update the document's output charset.
415         * @param charset the new charset (by name) to use.
416         * @return the document's output settings, for chaining
417         */
418        public OutputSettings charset(String charset) {
419            charset(Charset.forName(charset));
420            return this;
421        }
422
423        /**
424         * Get the document's current output syntax.
425         * @return current syntax
426         */
427        public Syntax syntax() {
428            return syntax;
429        }
430
431        /**
432         * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
433         * {@code xml}, with self-closing tags.
434         * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is
435         * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p>
436         * @param syntax serialization syntax
437         * @return the document's output settings, for chaining
438         */
439        public OutputSettings syntax(Syntax syntax) {
440            this.syntax = syntax;
441            if (syntax == Syntax.xml)
442                this.escapeMode(Entities.EscapeMode.xhtml);
443            return this;
444        }
445
446        /**
447         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
448         * the output, and the output will generally look like the input.
449         * @return if pretty printing is enabled.
450         */
451        public boolean prettyPrint() {
452            return prettyPrint;
453        }
454
455        /**
456         * Enable or disable pretty printing.
457         * @param pretty new pretty print setting
458         * @return this, for chaining
459         */
460        public OutputSettings prettyPrint(boolean pretty) {
461            prettyPrint = pretty;
462            return this;
463        }
464        
465        /**
466         * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
467         * all tags as block.
468         * @return if outline mode is enabled.
469         */
470        public boolean outline() {
471            return outline;
472        }
473        
474        /**
475         * Enable or disable HTML outline mode.
476         * @param outlineMode new outline setting
477         * @return this, for chaining
478         */
479        public OutputSettings outline(boolean outlineMode) {
480            outline = outlineMode;
481            return this;
482        }
483
484        /**
485         * Get the current tag indent amount, used when pretty printing.
486         * @return the current indent amount
487         */
488        public int indentAmount() {
489            return indentAmount;
490        }
491
492        /**
493         * Set the indent amount for pretty printing
494         * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0.
495         * @return this, for chaining
496         */
497        public OutputSettings indentAmount(int indentAmount) {
498            Validate.isTrue(indentAmount >= 0);
499            this.indentAmount = indentAmount;
500            return this;
501        }
502
503        /**
504         * Get the current max padding amount, used when pretty printing
505         * so very deeply nested nodes don't get insane padding amounts.
506         * @return the current indent amount
507         */
508        public int maxPaddingWidth() {
509            return maxPaddingWidth;
510        }
511
512        /**
513         * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts.
514         * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1.
515         *        Default is 30 and -1 means unlimited.
516         * @return this, for chaining
517         */
518        public OutputSettings maxPaddingWidth(int maxPaddingWidth) {
519            Validate.isTrue(maxPaddingWidth >= -1);
520            this.maxPaddingWidth = maxPaddingWidth;
521            return this;
522        }
523
524        @Override
525        public OutputSettings clone() {
526            OutputSettings clone;
527            try {
528                clone = (OutputSettings) super.clone();
529            } catch (CloneNotSupportedException e) {
530                throw new RuntimeException(e);
531            }
532            clone.charset(charset.name()); // new charset, coreCharset, and charset encoder
533            clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
534            // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle
535            return clone;
536        }
537    }
538
539    /**
540     * Get the document's current output settings.
541     * @return the document's current output settings.
542     */
543    public OutputSettings outputSettings() {
544        return outputSettings;
545    }
546
547    /**
548     * Set the document's output settings.
549     * @param outputSettings new output settings.
550     * @return this document, for chaining.
551     */
552    public Document outputSettings(OutputSettings outputSettings) {
553        Validate.notNull(outputSettings);
554        this.outputSettings = outputSettings;
555        return this;
556    }
557
558    public enum QuirksMode {
559        noQuirks, quirks, limitedQuirks
560    }
561
562    public QuirksMode quirksMode() {
563        return quirksMode;
564    }
565
566    public Document quirksMode(QuirksMode quirksMode) {
567        this.quirksMode = quirksMode;
568        return this;
569    }
570
571    /**
572     * Get the parser that was used to parse this document.
573     * @return the parser
574     */
575    public Parser parser() {
576        return parser;
577    }
578
579    /**
580     * Set the parser used to create this document. This parser is then used when further parsing within this document
581     * is required.
582     * @param parser the configured parser to use when further parsing is required for this document.
583     * @return this document, for chaining.
584     */
585    public Document parser(Parser parser) {
586        this.parser = parser;
587        return this;
588    }
589
590    /**
591     Set the Connection used to fetch this document. This Connection is used as a session object when further requests are
592     made (e.g. when a form is submitted).
593
594     @param connection to set
595     @return this document, for chaining
596     @see Connection#newRequest()
597     @since 1.14.1
598     */
599    public Document connection(Connection connection) {
600        Validate.notNull(connection);
601        this.connection = connection;
602        return this;
603    }
604}