001package org.jsoup.nodes;
002
003import org.jsoup.Connection;
004import org.jsoup.Jsoup;
005import org.jsoup.helper.DataUtil;
006import org.jsoup.helper.Validate;
007import org.jsoup.internal.StringUtil;
008import org.jsoup.parser.ParseSettings;
009import org.jsoup.parser.Parser;
010import org.jsoup.parser.Tag;
011import org.jsoup.select.Elements;
012import org.jsoup.select.Evaluator;
013import org.jsoup.select.Selector;
014import org.jspecify.annotations.Nullable;
015
016import java.nio.charset.Charset;
017import java.util.List;
018
019import static org.jsoup.parser.Parser.NamespaceHtml;
020
021/**
022 A HTML Document.
023
024 @author Jonathan Hedley, jonathan@hedley.net */
025public class Document extends Element {
026    private @Nullable Connection connection; // the connection this doc was fetched from, if any
027    private OutputSettings outputSettings = new OutputSettings();
028    private Parser parser; // the parser used to parse this document
029    private QuirksMode quirksMode = QuirksMode.noQuirks;
030    private final String location;
031    private boolean updateMetaCharset = false;
032
033    /**
034     Create a new, empty Document, in the specified namespace.
035     @param namespace the namespace of this Document's root node.
036     @param baseUri base URI of document
037     @see org.jsoup.Jsoup#parse
038     @see #createShell
039     */
040    public Document(String namespace, String baseUri) {
041        super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri);
042        this.location = baseUri;
043        this.parser = Parser.htmlParser(); // default, but overridable
044    }
045
046    /**
047     Create a new, empty Document, in the HTML namespace.
048     @param baseUri base URI of document
049     @see org.jsoup.Jsoup#parse
050     @see #Document(String namespace, String baseUri)
051     */
052    public Document(String baseUri) {
053        this(NamespaceHtml, baseUri);
054    }
055
056    /**
057     Create a valid, empty shell of a document, suitable for adding more elements to.
058     @param baseUri baseUri of document
059     @return document with html, head, and body elements.
060     */
061    public static Document createShell(String baseUri) {
062        Validate.notNull(baseUri);
063
064        Document doc = new Document(baseUri);
065        Element html = doc.appendElement("html");
066        html.appendElement("head");
067        html.appendElement("body");
068
069        return doc;
070    }
071
072    /**
073     * Get the URL this Document was parsed from. If the starting URL is a redirect,
074     * this will return the final URL from which the document was served from.
075     * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String).
076     * @return location
077     */
078    public String location() {
079        return location;
080    }
081
082    /**
083     Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new
084     default Connection object. This can be used to continue a session, preserving settings and cookies, etc.
085     @return the Connection (session) associated with this Document, or an empty one otherwise.
086     @see Connection#newRequest()
087     */
088    public Connection connection() {
089        if (connection == null)
090            return Jsoup.newSession();
091        else
092            return connection;
093    }
094
095    /**
096     * Returns this Document's doctype.
097     * @return document type, or null if not set
098     */
099    public @Nullable DocumentType documentType() {
100        for (Node node : childNodes) {
101            if (node instanceof DocumentType)
102                return (DocumentType) node;
103            else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc
104                break;
105        }
106        return null;
107        // todo - add a set document type?
108    }
109
110    /**
111     Find the root HTML element, or create it if it doesn't exist.
112     @return the root HTML element.
113     */
114    private Element htmlEl() {
115        Element el = firstElementChild();
116        while (el != null) {
117            if (el.nameIs("html"))
118                return el;
119            el = el.nextElementSibling();
120        }
121        return appendElement("html");
122    }
123
124    /**
125     Get this document's {@code head} element.
126     <p>
127     As a side effect, if this Document does not already have an HTML structure, it will be created. If you do not want
128     that, use {@code #selectFirst("head")} instead.
129
130     @return {@code head} element.
131     */
132    public Element head() {
133        final Element html = htmlEl();
134        Element el = html.firstElementChild();
135        while (el != null) {
136            if (el.nameIs("head"))
137                return el;
138            el = el.nextElementSibling();
139        }
140        return html.prependElement("head");
141    }
142
143    /**
144     Get this document's {@code <body>} or {@code <frameset>} element.
145     <p>
146     As a <b>side-effect</b>, if this Document does not already have an HTML structure, it will be created with a {@code
147    <body>} element. If you do not want that, use {@code #selectFirst("body")} instead.
148
149     @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document
150     had no contents, or the outermost {@code <frameset> element} for frameset documents.
151     */
152    public Element body() {
153        final Element html = htmlEl();
154        Element el = html.firstElementChild();
155        while (el != null) {
156            if (el.nameIs("body") || el.nameIs("frameset"))
157                return el;
158            el = el.nextElementSibling();
159        }
160        return html.appendElement("body");
161    }
162
163    /**
164     Get each of the {@code <form>} elements contained in this document.
165     @return a List of FormElement objects, which will be empty if there are none.
166     @see Elements#forms()
167     @see FormElement#elements()
168     @since 1.15.4
169     */
170    public List<FormElement> forms() {
171        return select("form").forms();
172    }
173
174    /**
175     Selects the first {@link FormElement} in this document that matches the query. If none match, throws an
176     {@link IllegalArgumentException}.
177     @param cssQuery a {@link Selector} CSS query
178     @return the first matching {@code <form>} element
179     @throws IllegalArgumentException if no match is found
180     @since 1.15.4
181     */
182    public FormElement expectForm(String cssQuery) {
183        Elements els = select(cssQuery);
184        for (Element el : els) {
185            if (el instanceof FormElement) return (FormElement) el;
186        }
187        Validate.fail("No form elements matched the query '%s' in the document.", cssQuery);
188        return null; // (not really)
189    }
190
191    /**
192     Get the string contents of the document's {@code title} element.
193     @return Trimmed title, or empty string if none set.
194     */
195    public String title() {
196        // title is a preserve whitespace tag (for document output), but normalised here
197        Element titleEl = head().selectFirst(titleEval);
198        return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : "";
199    }
200    private static final Evaluator titleEval = new Evaluator.Tag("title");
201
202    /**
203     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
204     not present
205     @param title string to set as title
206     */
207    public void title(String title) {
208        Validate.notNull(title);
209        Element titleEl = head().selectFirst(titleEval);
210        if (titleEl == null) // add to head
211            titleEl = head().appendElement("title");
212        titleEl.text(title);
213    }
214
215    /**
216     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
217     @param tagName element tag name (e.g. {@code a})
218     @return new element
219     */
220    public Element createElement(String tagName) {
221        return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri());
222    }
223
224    @Override
225    public String outerHtml() {
226        return super.html(); // no outer wrapper tag
227    }
228
229    /**
230     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
231     @param text un-encoded text
232     @return this document
233     */
234    @Override
235    public Element text(String text) {
236        body().text(text); // overridden to not nuke doc structure
237        return this;
238    }
239
240    @Override
241    public String nodeName() {
242        return "#document";
243    }
244    
245    /**
246     * Sets the charset used in this document. This method is equivalent
247     * to {@link OutputSettings#charset(java.nio.charset.Charset)
248     * OutputSettings.charset(Charset)} but in addition it updates the
249     * charset / encoding element within the document.
250     * 
251     * <p>This enables
252     * {@link #updateMetaCharsetElement(boolean) meta charset update}.</p>
253     * 
254     * <p>If there's no element with charset / encoding information yet it will
255     * be created. Obsolete charset / encoding definitions are removed!</p>
256     * 
257     * <p><b>Elements used:</b></p>
258     * 
259     * <ul>
260     * <li><b>Html:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
261     * <li><b>Xml:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
262     * </ul>
263     * 
264     * @param charset Charset
265     * 
266     * @see #updateMetaCharsetElement(boolean) 
267     * @see OutputSettings#charset(java.nio.charset.Charset) 
268     */
269    public void charset(Charset charset) {
270        updateMetaCharsetElement(true);
271        outputSettings.charset(charset);
272        ensureMetaCharsetElement();
273    }
274    
275    /**
276     * Returns the charset used in this document. This method is equivalent
277     * to {@link OutputSettings#charset()}.
278     * 
279     * @return Current Charset
280     * 
281     * @see OutputSettings#charset() 
282     */
283    public Charset charset() {
284        return outputSettings.charset();
285    }
286    
287    /**
288     * Sets whether the element with charset information in this document is
289     * updated on changes through {@link #charset(java.nio.charset.Charset)
290     * Document.charset(Charset)} or not.
291     * 
292     * <p>If set to <tt>false</tt> <i>(default)</i> there are no elements
293     * modified.</p>
294     * 
295     * @param update If <tt>true</tt> the element updated on charset
296     * changes, <tt>false</tt> if not
297     * 
298     * @see #charset(java.nio.charset.Charset) 
299     */
300    public void updateMetaCharsetElement(boolean update) {
301        this.updateMetaCharset = update;
302    }
303    
304    /**
305     * Returns whether the element with charset information in this document is
306     * updated on changes through {@link #charset(java.nio.charset.Charset)
307     * Document.charset(Charset)} or not.
308     * 
309     * @return Returns <tt>true</tt> if the element is updated on charset
310     * changes, <tt>false</tt> if not
311     */
312    public boolean updateMetaCharsetElement() {
313        return updateMetaCharset;
314    }
315
316    @Override
317    public Document clone() {
318        Document clone = (Document) super.clone();
319        clone.outputSettings = this.outputSettings.clone();
320        return clone;
321    }
322
323    @Override
324    public Document shallowClone() {
325        Document clone = new Document(this.tag().namespace(), baseUri());
326        if (attributes != null)
327            clone.attributes = attributes.clone();
328        clone.outputSettings = this.outputSettings.clone();
329        return clone;
330    }
331    
332    /**
333     * Ensures a meta charset (html) or xml declaration (xml) with the current
334     * encoding used. This only applies with
335     * {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to
336     * <tt>true</tt>, otherwise this method does nothing.
337     * 
338     * <ul>
339     * <li>An existing element gets updated with the current charset</li>
340     * <li>If there's no element yet it will be inserted</li>
341     * <li>Obsolete elements are removed</li>
342     * </ul>
343     * 
344     * <p><b>Elements used:</b></p>
345     * 
346     * <ul>
347     * <li><b>Html:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
348     * <li><b>Xml:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
349     * </ul>
350     */
351    private void ensureMetaCharsetElement() {
352        if (updateMetaCharset) {
353            OutputSettings.Syntax syntax = outputSettings().syntax();
354
355            if (syntax == OutputSettings.Syntax.html) {
356                Element metaCharset = selectFirst("meta[charset]");
357                if (metaCharset != null) {
358                    metaCharset.attr("charset", charset().displayName());
359                } else {
360                    head().appendElement("meta").attr("charset", charset().displayName());
361                }
362                select("meta[name=charset]").remove(); // Remove obsolete elements
363            } else if (syntax == OutputSettings.Syntax.xml) {
364                Node node = ensureChildNodes().get(0);
365                if (node instanceof XmlDeclaration) {
366                    XmlDeclaration decl = (XmlDeclaration) node;
367                    if (decl.name().equals("xml")) {
368                        decl.attr("encoding", charset().displayName());
369                        if (decl.hasAttr("version"))
370                            decl.attr("version", "1.0");
371                    } else {
372                        decl = new XmlDeclaration("xml", false);
373                        decl.attr("version", "1.0");
374                        decl.attr("encoding", charset().displayName());
375                        prependChild(decl);
376                    }
377                } else {
378                    XmlDeclaration decl = new XmlDeclaration("xml", false);
379                    decl.attr("version", "1.0");
380                    decl.attr("encoding", charset().displayName());
381                    prependChild(decl);
382                }
383            }
384        }
385    }
386    
387
388    /**
389     * A Document's output settings control the form of the text() and html() methods.
390     */
391    public static class OutputSettings implements Cloneable {
392        /**
393         * The output serialization syntax.
394         */
395        public enum Syntax {html, xml}
396        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
397        private Charset charset = DataUtil.UTF_8;
398        private boolean prettyPrint = true;
399        private boolean outline = false;
400        private int indentAmount = 1;
401        private int maxPaddingWidth = 30;
402        private Syntax syntax = Syntax.html;
403
404        /**
405         Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing,
406         indent amount of 1).
407         */
408        public OutputSettings() {
409        }
410
411        /**
412         Get the document's current entity escape mode:
413         <ul>
414         <li><code>xhtml</code>, the minimal named entities in XHTML / XML</li>
415         <li><code>base</code>, which provides a limited set of named HTML
416         entities and escapes other characters as numbered entities for maximum compatibility</li>
417         <li><code>extended</code>,
418         which uses the complete set of HTML named entities.</li>
419         </ul>
420         <p>The default escape mode is <code>base</code>.
421         @return the document's current escape mode
422         */
423        public Entities.EscapeMode escapeMode() {
424            return escapeMode;
425        }
426
427        /**
428         * Set the document's escape mode, which determines how characters are escaped when the output character set
429         * does not support a given character:- using either a named or a numbered escape.
430         * @param escapeMode the new escape mode to use
431         * @return the document's output settings, for chaining
432         */
433        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
434            this.escapeMode = escapeMode;
435            return this;
436        }
437
438        /**
439         * Get the document's current output charset, which is used to control which characters are escaped when
440         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
441         * <p>
442         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
443         * input charset. Otherwise, it defaults to UTF-8.
444         * @return the document's current charset.
445         */
446        public Charset charset() {
447            return charset;
448        }
449
450        /**
451         * Update the document's output charset.
452         * @param charset the new charset to use.
453         * @return the document's output settings, for chaining
454         */
455        public OutputSettings charset(Charset charset) {
456            this.charset = charset;
457            return this;
458        }
459
460        /**
461         * Update the document's output charset.
462         * @param charset the new charset (by name) to use.
463         * @return the document's output settings, for chaining
464         */
465        public OutputSettings charset(String charset) {
466            charset(Charset.forName(charset));
467            return this;
468        }
469
470        /**
471         * Get the document's current output syntax.
472         * @return current syntax
473         */
474        public Syntax syntax() {
475            return syntax;
476        }
477
478        /**
479         * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
480         * {@code xml}, with self-closing tags.
481         * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is
482         * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p>
483         * @param syntax serialization syntax
484         * @return the document's output settings, for chaining
485         */
486        public OutputSettings syntax(Syntax syntax) {
487            this.syntax = syntax;
488            if (syntax == Syntax.xml)
489                this.escapeMode(Entities.EscapeMode.xhtml);
490            return this;
491        }
492
493        /**
494         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
495         * the output, and the output will generally look like the input.
496         * @return if pretty printing is enabled.
497         */
498        public boolean prettyPrint() {
499            return prettyPrint;
500        }
501
502        /**
503         * Enable or disable pretty printing.
504         * @param pretty new pretty print setting
505         * @return this, for chaining
506         */
507        public OutputSettings prettyPrint(boolean pretty) {
508            prettyPrint = pretty;
509            return this;
510        }
511        
512        /**
513         * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
514         * all tags as block.
515         * @return if outline mode is enabled.
516         */
517        public boolean outline() {
518            return outline;
519        }
520        
521        /**
522         * Enable or disable HTML outline mode.
523         * @param outlineMode new outline setting
524         * @return this, for chaining
525         */
526        public OutputSettings outline(boolean outlineMode) {
527            outline = outlineMode;
528            return this;
529        }
530
531        /**
532         * Get the current tag indent amount, used when pretty printing.
533         * @return the current indent amount
534         */
535        public int indentAmount() {
536            return indentAmount;
537        }
538
539        /**
540         * Set the indent amount for pretty printing
541         * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0.
542         * @return this, for chaining
543         */
544        public OutputSettings indentAmount(int indentAmount) {
545            Validate.isTrue(indentAmount >= 0);
546            this.indentAmount = indentAmount;
547            return this;
548        }
549
550        /**
551         * Get the current max padding amount, used when pretty printing
552         * so very deeply nested nodes don't get insane padding amounts.
553         * @return the current indent amount
554         */
555        public int maxPaddingWidth() {
556            return maxPaddingWidth;
557        }
558
559        /**
560         * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts.
561         * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1.
562         *        Default is 30 and -1 means unlimited.
563         * @return this, for chaining
564         */
565        public OutputSettings maxPaddingWidth(int maxPaddingWidth) {
566            Validate.isTrue(maxPaddingWidth >= -1);
567            this.maxPaddingWidth = maxPaddingWidth;
568            return this;
569        }
570
571        @Override
572        public OutputSettings clone() {
573            OutputSettings clone;
574            try {
575                clone = (OutputSettings) super.clone();
576            } catch (CloneNotSupportedException e) {
577                throw new RuntimeException(e);
578            }
579            clone.charset(charset.name()); // new charset, coreCharset, and charset encoder
580            clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
581            // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle
582            return clone;
583        }
584    }
585
586    /**
587     * Get the document's current output settings.
588     * @return the document's current output settings.
589     */
590    public OutputSettings outputSettings() {
591        return outputSettings;
592    }
593
594    /**
595     * Set the document's output settings.
596     * @param outputSettings new output settings.
597     * @return this document, for chaining.
598     */
599    public Document outputSettings(OutputSettings outputSettings) {
600        Validate.notNull(outputSettings);
601        this.outputSettings = outputSettings;
602        return this;
603    }
604
605    public enum QuirksMode {
606        noQuirks, quirks, limitedQuirks
607    }
608
609    public QuirksMode quirksMode() {
610        return quirksMode;
611    }
612
613    public Document quirksMode(QuirksMode quirksMode) {
614        this.quirksMode = quirksMode;
615        return this;
616    }
617
618    /**
619     * Get the parser that was used to parse this document.
620     * @return the parser
621     */
622    public Parser parser() {
623        return parser;
624    }
625
626    /**
627     * Set the parser used to create this document. This parser is then used when further parsing within this document
628     * is required.
629     * @param parser the configured parser to use when further parsing is required for this document.
630     * @return this document, for chaining.
631     */
632    public Document parser(Parser parser) {
633        this.parser = parser;
634        return this;
635    }
636
637    /**
638     Set the Connection used to fetch this document. This Connection is used as a session object when further requests are
639     made (e.g. when a form is submitted).
640
641     @param connection to set
642     @return this document, for chaining
643     @see Connection#newRequest()
644     @since 1.14.1
645     */
646    public Document connection(Connection connection) {
647        Validate.notNull(connection);
648        this.connection = connection;
649        return this;
650    }
651}