001package org.jsoup.nodes; 002 003import org.jsoup.Connection; 004import org.jsoup.Jsoup; 005import org.jsoup.helper.DataUtil; 006import org.jsoup.helper.Validate; 007import org.jsoup.internal.StringUtil; 008import org.jsoup.parser.ParseSettings; 009import org.jsoup.parser.Parser; 010import org.jsoup.parser.Tag; 011import org.jsoup.select.Elements; 012import org.jsoup.select.Evaluator; 013import org.jsoup.select.Selector; 014import org.jspecify.annotations.Nullable; 015 016import java.nio.charset.Charset; 017import java.util.List; 018 019import static org.jsoup.parser.Parser.NamespaceHtml; 020 021/** 022 A HTML Document. 023 024 @author Jonathan Hedley, jonathan@hedley.net */ 025public class Document extends Element { 026 private @Nullable Connection connection; // the connection this doc was fetched from, if any 027 private OutputSettings outputSettings = new OutputSettings(); 028 private Parser parser; // the parser used to parse this document 029 private QuirksMode quirksMode = QuirksMode.noQuirks; 030 private final String location; 031 private boolean updateMetaCharset = false; 032 033 /** 034 Create a new, empty Document, in the specified namespace. 035 @param namespace the namespace of this Document's root node. 036 @param baseUri base URI of document 037 @see org.jsoup.Jsoup#parse 038 @see #createShell 039 */ 040 public Document(String namespace, String baseUri) { 041 super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri); 042 this.location = baseUri; 043 this.parser = Parser.htmlParser(); // default, but overridable 044 } 045 046 /** 047 Create a new, empty Document, in the HTML namespace. 048 @param baseUri base URI of document 049 @see org.jsoup.Jsoup#parse 050 @see #Document(String namespace, String baseUri) 051 */ 052 public Document(String baseUri) { 053 this(NamespaceHtml, baseUri); 054 } 055 056 /** 057 Create a valid, empty shell of a document, suitable for adding more elements to. 058 @param baseUri baseUri of document 059 @return document with html, head, and body elements. 060 */ 061 public static Document createShell(String baseUri) { 062 Validate.notNull(baseUri); 063 064 Document doc = new Document(baseUri); 065 Element html = doc.appendElement("html"); 066 html.appendElement("head"); 067 html.appendElement("body"); 068 069 return doc; 070 } 071 072 /** 073 * Get the URL this Document was parsed from. If the starting URL is a redirect, 074 * this will return the final URL from which the document was served from. 075 * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String). 076 * @return location 077 */ 078 public String location() { 079 return location; 080 } 081 082 /** 083 Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new 084 default Connection object. This can be used to continue a session, preserving settings and cookies, etc. 085 @return the Connection (session) associated with this Document, or an empty one otherwise. 086 @see Connection#newRequest() 087 */ 088 public Connection connection() { 089 if (connection == null) 090 return Jsoup.newSession(); 091 else 092 return connection; 093 } 094 095 /** 096 * Returns this Document's doctype. 097 * @return document type, or null if not set 098 */ 099 public @Nullable DocumentType documentType() { 100 for (Node node : childNodes) { 101 if (node instanceof DocumentType) 102 return (DocumentType) node; 103 else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc 104 break; 105 } 106 return null; 107 // todo - add a set document type? 108 } 109 110 /** 111 Find the root HTML element, or create it if it doesn't exist. 112 @return the root HTML element. 113 */ 114 private Element htmlEl() { 115 Element el = firstElementChild(); 116 while (el != null) { 117 if (el.nameIs("html")) 118 return el; 119 el = el.nextElementSibling(); 120 } 121 return appendElement("html"); 122 } 123 124 /** 125 Get this document's {@code head} element. 126 <p> 127 As a side effect, if this Document does not already have an HTML structure, it will be created. If you do not want 128 that, use {@code #selectFirst("head")} instead. 129 130 @return {@code head} element. 131 */ 132 public Element head() { 133 final Element html = htmlEl(); 134 Element el = html.firstElementChild(); 135 while (el != null) { 136 if (el.nameIs("head")) 137 return el; 138 el = el.nextElementSibling(); 139 } 140 return html.prependElement("head"); 141 } 142 143 /** 144 Get this document's {@code <body>} or {@code <frameset>} element. 145 <p> 146 As a <b>side-effect</b>, if this Document does not already have an HTML structure, it will be created with a {@code 147 <body>} element. If you do not want that, use {@code #selectFirst("body")} instead. 148 149 @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document 150 had no contents, or the outermost {@code <frameset> element} for frameset documents. 151 */ 152 public Element body() { 153 final Element html = htmlEl(); 154 Element el = html.firstElementChild(); 155 while (el != null) { 156 if (el.nameIs("body") || el.nameIs("frameset")) 157 return el; 158 el = el.nextElementSibling(); 159 } 160 return html.appendElement("body"); 161 } 162 163 /** 164 Get each of the {@code <form>} elements contained in this document. 165 @return a List of FormElement objects, which will be empty if there are none. 166 @see Elements#forms() 167 @see FormElement#elements() 168 @since 1.15.4 169 */ 170 public List<FormElement> forms() { 171 return select("form").forms(); 172 } 173 174 /** 175 Selects the first {@link FormElement} in this document that matches the query. If none match, throws an 176 {@link IllegalArgumentException}. 177 @param cssQuery a {@link Selector} CSS query 178 @return the first matching {@code <form>} element 179 @throws IllegalArgumentException if no match is found 180 @since 1.15.4 181 */ 182 public FormElement expectForm(String cssQuery) { 183 Elements els = select(cssQuery); 184 for (Element el : els) { 185 if (el instanceof FormElement) return (FormElement) el; 186 } 187 Validate.fail("No form elements matched the query '%s' in the document.", cssQuery); 188 return null; // (not really) 189 } 190 191 /** 192 Get the string contents of the document's {@code title} element. 193 @return Trimmed title, or empty string if none set. 194 */ 195 public String title() { 196 // title is a preserve whitespace tag (for document output), but normalised here 197 Element titleEl = head().selectFirst(titleEval); 198 return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : ""; 199 } 200 private static final Evaluator titleEval = new Evaluator.Tag("title"); 201 202 /** 203 Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if 204 not present 205 @param title string to set as title 206 */ 207 public void title(String title) { 208 Validate.notNull(title); 209 Element titleEl = head().selectFirst(titleEval); 210 if (titleEl == null) // add to head 211 titleEl = head().appendElement("title"); 212 titleEl.text(title); 213 } 214 215 /** 216 Create a new Element, with this document's base uri. Does not make the new element a child of this document. 217 @param tagName element tag name (e.g. {@code a}) 218 @return new element 219 */ 220 public Element createElement(String tagName) { 221 return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri()); 222 } 223 224 @Override 225 public String outerHtml() { 226 return super.html(); // no outer wrapper tag 227 } 228 229 /** 230 Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared. 231 @param text un-encoded text 232 @return this document 233 */ 234 @Override 235 public Element text(String text) { 236 body().text(text); // overridden to not nuke doc structure 237 return this; 238 } 239 240 @Override 241 public String nodeName() { 242 return "#document"; 243 } 244 245 /** 246 * Sets the charset used in this document. This method is equivalent 247 * to {@link OutputSettings#charset(java.nio.charset.Charset) 248 * OutputSettings.charset(Charset)} but in addition it updates the 249 * charset / encoding element within the document. 250 * 251 * <p>This enables 252 * {@link #updateMetaCharsetElement(boolean) meta charset update}.</p> 253 * 254 * <p>If there's no element with charset / encoding information yet it will 255 * be created. Obsolete charset / encoding definitions are removed!</p> 256 * 257 * <p><b>Elements used:</b></p> 258 * 259 * <ul> 260 * <li><b>Html:</b> <i><meta charset="CHARSET"></i></li> 261 * <li><b>Xml:</b> <i><?xml version="1.0" encoding="CHARSET"></i></li> 262 * </ul> 263 * 264 * @param charset Charset 265 * 266 * @see #updateMetaCharsetElement(boolean) 267 * @see OutputSettings#charset(java.nio.charset.Charset) 268 */ 269 public void charset(Charset charset) { 270 updateMetaCharsetElement(true); 271 outputSettings.charset(charset); 272 ensureMetaCharsetElement(); 273 } 274 275 /** 276 * Returns the charset used in this document. This method is equivalent 277 * to {@link OutputSettings#charset()}. 278 * 279 * @return Current Charset 280 * 281 * @see OutputSettings#charset() 282 */ 283 public Charset charset() { 284 return outputSettings.charset(); 285 } 286 287 /** 288 * Sets whether the element with charset information in this document is 289 * updated on changes through {@link #charset(java.nio.charset.Charset) 290 * Document.charset(Charset)} or not. 291 * 292 * <p>If set to <tt>false</tt> <i>(default)</i> there are no elements 293 * modified.</p> 294 * 295 * @param update If <tt>true</tt> the element updated on charset 296 * changes, <tt>false</tt> if not 297 * 298 * @see #charset(java.nio.charset.Charset) 299 */ 300 public void updateMetaCharsetElement(boolean update) { 301 this.updateMetaCharset = update; 302 } 303 304 /** 305 * Returns whether the element with charset information in this document is 306 * updated on changes through {@link #charset(java.nio.charset.Charset) 307 * Document.charset(Charset)} or not. 308 * 309 * @return Returns <tt>true</tt> if the element is updated on charset 310 * changes, <tt>false</tt> if not 311 */ 312 public boolean updateMetaCharsetElement() { 313 return updateMetaCharset; 314 } 315 316 @Override 317 public Document clone() { 318 Document clone = (Document) super.clone(); 319 clone.outputSettings = this.outputSettings.clone(); 320 return clone; 321 } 322 323 @Override 324 public Document shallowClone() { 325 Document clone = new Document(this.tag().namespace(), baseUri()); 326 if (attributes != null) 327 clone.attributes = attributes.clone(); 328 clone.outputSettings = this.outputSettings.clone(); 329 return clone; 330 } 331 332 /** 333 * Ensures a meta charset (html) or xml declaration (xml) with the current 334 * encoding used. This only applies with 335 * {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to 336 * <tt>true</tt>, otherwise this method does nothing. 337 * 338 * <ul> 339 * <li>An existing element gets updated with the current charset</li> 340 * <li>If there's no element yet it will be inserted</li> 341 * <li>Obsolete elements are removed</li> 342 * </ul> 343 * 344 * <p><b>Elements used:</b></p> 345 * 346 * <ul> 347 * <li><b>Html:</b> <i><meta charset="CHARSET"></i></li> 348 * <li><b>Xml:</b> <i><?xml version="1.0" encoding="CHARSET"></i></li> 349 * </ul> 350 */ 351 private void ensureMetaCharsetElement() { 352 if (updateMetaCharset) { 353 OutputSettings.Syntax syntax = outputSettings().syntax(); 354 355 if (syntax == OutputSettings.Syntax.html) { 356 Element metaCharset = selectFirst("meta[charset]"); 357 if (metaCharset != null) { 358 metaCharset.attr("charset", charset().displayName()); 359 } else { 360 head().appendElement("meta").attr("charset", charset().displayName()); 361 } 362 select("meta[name=charset]").remove(); // Remove obsolete elements 363 } else if (syntax == OutputSettings.Syntax.xml) { 364 Node node = ensureChildNodes().get(0); 365 if (node instanceof XmlDeclaration) { 366 XmlDeclaration decl = (XmlDeclaration) node; 367 if (decl.name().equals("xml")) { 368 decl.attr("encoding", charset().displayName()); 369 if (decl.hasAttr("version")) 370 decl.attr("version", "1.0"); 371 } else { 372 decl = new XmlDeclaration("xml", false); 373 decl.attr("version", "1.0"); 374 decl.attr("encoding", charset().displayName()); 375 prependChild(decl); 376 } 377 } else { 378 XmlDeclaration decl = new XmlDeclaration("xml", false); 379 decl.attr("version", "1.0"); 380 decl.attr("encoding", charset().displayName()); 381 prependChild(decl); 382 } 383 } 384 } 385 } 386 387 388 /** 389 * A Document's output settings control the form of the text() and html() methods. 390 */ 391 public static class OutputSettings implements Cloneable { 392 /** 393 * The output serialization syntax. 394 */ 395 public enum Syntax {html, xml} 396 private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; 397 private Charset charset = DataUtil.UTF_8; 398 private boolean prettyPrint = true; 399 private boolean outline = false; 400 private int indentAmount = 1; 401 private int maxPaddingWidth = 30; 402 private Syntax syntax = Syntax.html; 403 404 /** 405 Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing, 406 indent amount of 1). 407 */ 408 public OutputSettings() { 409 } 410 411 /** 412 Get the document's current entity escape mode: 413 <ul> 414 <li><code>xhtml</code>, the minimal named entities in XHTML / XML</li> 415 <li><code>base</code>, which provides a limited set of named HTML 416 entities and escapes other characters as numbered entities for maximum compatibility</li> 417 <li><code>extended</code>, 418 which uses the complete set of HTML named entities.</li> 419 </ul> 420 <p>The default escape mode is <code>base</code>. 421 @return the document's current escape mode 422 */ 423 public Entities.EscapeMode escapeMode() { 424 return escapeMode; 425 } 426 427 /** 428 * Set the document's escape mode, which determines how characters are escaped when the output character set 429 * does not support a given character:- using either a named or a numbered escape. 430 * @param escapeMode the new escape mode to use 431 * @return the document's output settings, for chaining 432 */ 433 public OutputSettings escapeMode(Entities.EscapeMode escapeMode) { 434 this.escapeMode = escapeMode; 435 return this; 436 } 437 438 /** 439 * Get the document's current output charset, which is used to control which characters are escaped when 440 * generating HTML (via the <code>html()</code> methods), and which are kept intact. 441 * <p> 442 * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the 443 * input charset. Otherwise, it defaults to UTF-8. 444 * @return the document's current charset. 445 */ 446 public Charset charset() { 447 return charset; 448 } 449 450 /** 451 * Update the document's output charset. 452 * @param charset the new charset to use. 453 * @return the document's output settings, for chaining 454 */ 455 public OutputSettings charset(Charset charset) { 456 this.charset = charset; 457 return this; 458 } 459 460 /** 461 * Update the document's output charset. 462 * @param charset the new charset (by name) to use. 463 * @return the document's output settings, for chaining 464 */ 465 public OutputSettings charset(String charset) { 466 charset(Charset.forName(charset)); 467 return this; 468 } 469 470 /** 471 * Get the document's current output syntax. 472 * @return current syntax 473 */ 474 public Syntax syntax() { 475 return syntax; 476 } 477 478 /** 479 * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or 480 * {@code xml}, with self-closing tags. 481 * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is 482 * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p> 483 * @param syntax serialization syntax 484 * @return the document's output settings, for chaining 485 */ 486 public OutputSettings syntax(Syntax syntax) { 487 this.syntax = syntax; 488 if (syntax == Syntax.xml) 489 this.escapeMode(Entities.EscapeMode.xhtml); 490 return this; 491 } 492 493 /** 494 * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format 495 * the output, and the output will generally look like the input. 496 * @return if pretty printing is enabled. 497 */ 498 public boolean prettyPrint() { 499 return prettyPrint; 500 } 501 502 /** 503 * Enable or disable pretty printing. 504 * @param pretty new pretty print setting 505 * @return this, for chaining 506 */ 507 public OutputSettings prettyPrint(boolean pretty) { 508 prettyPrint = pretty; 509 return this; 510 } 511 512 /** 513 * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider 514 * all tags as block. 515 * @return if outline mode is enabled. 516 */ 517 public boolean outline() { 518 return outline; 519 } 520 521 /** 522 * Enable or disable HTML outline mode. 523 * @param outlineMode new outline setting 524 * @return this, for chaining 525 */ 526 public OutputSettings outline(boolean outlineMode) { 527 outline = outlineMode; 528 return this; 529 } 530 531 /** 532 * Get the current tag indent amount, used when pretty printing. 533 * @return the current indent amount 534 */ 535 public int indentAmount() { 536 return indentAmount; 537 } 538 539 /** 540 * Set the indent amount for pretty printing 541 * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0. 542 * @return this, for chaining 543 */ 544 public OutputSettings indentAmount(int indentAmount) { 545 Validate.isTrue(indentAmount >= 0); 546 this.indentAmount = indentAmount; 547 return this; 548 } 549 550 /** 551 * Get the current max padding amount, used when pretty printing 552 * so very deeply nested nodes don't get insane padding amounts. 553 * @return the current indent amount 554 */ 555 public int maxPaddingWidth() { 556 return maxPaddingWidth; 557 } 558 559 /** 560 * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts. 561 * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1. 562 * Default is 30 and -1 means unlimited. 563 * @return this, for chaining 564 */ 565 public OutputSettings maxPaddingWidth(int maxPaddingWidth) { 566 Validate.isTrue(maxPaddingWidth >= -1); 567 this.maxPaddingWidth = maxPaddingWidth; 568 return this; 569 } 570 571 @Override 572 public OutputSettings clone() { 573 OutputSettings clone; 574 try { 575 clone = (OutputSettings) super.clone(); 576 } catch (CloneNotSupportedException e) { 577 throw new RuntimeException(e); 578 } 579 clone.charset(charset.name()); // new charset, coreCharset, and charset encoder 580 clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name()); 581 // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle 582 return clone; 583 } 584 } 585 586 /** 587 * Get the document's current output settings. 588 * @return the document's current output settings. 589 */ 590 public OutputSettings outputSettings() { 591 return outputSettings; 592 } 593 594 /** 595 * Set the document's output settings. 596 * @param outputSettings new output settings. 597 * @return this document, for chaining. 598 */ 599 public Document outputSettings(OutputSettings outputSettings) { 600 Validate.notNull(outputSettings); 601 this.outputSettings = outputSettings; 602 return this; 603 } 604 605 public enum QuirksMode { 606 noQuirks, quirks, limitedQuirks 607 } 608 609 public QuirksMode quirksMode() { 610 return quirksMode; 611 } 612 613 public Document quirksMode(QuirksMode quirksMode) { 614 this.quirksMode = quirksMode; 615 return this; 616 } 617 618 /** 619 * Get the parser that was used to parse this document. 620 * @return the parser 621 */ 622 public Parser parser() { 623 return parser; 624 } 625 626 /** 627 * Set the parser used to create this document. This parser is then used when further parsing within this document 628 * is required. 629 * @param parser the configured parser to use when further parsing is required for this document. 630 * @return this document, for chaining. 631 */ 632 public Document parser(Parser parser) { 633 this.parser = parser; 634 return this; 635 } 636 637 /** 638 Set the Connection used to fetch this document. This Connection is used as a session object when further requests are 639 made (e.g. when a form is submitted). 640 641 @param connection to set 642 @return this document, for chaining 643 @see Connection#newRequest() 644 @since 1.14.1 645 */ 646 public Document connection(Connection connection) { 647 Validate.notNull(connection); 648 this.connection = connection; 649 return this; 650 } 651}