package org.jsoup.nodes;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.helper.DataUtil;
import org.jsoup.helper.Validate;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
import org.jsoup.select.Evaluator;
import org.jsoup.select.Selector;
import org.jspecify.annotations.Nullable;

import java.nio.charset.Charset;
import java.util.List;

import static org.jsoup.parser.Parser.NamespaceHtml;

/**
 A HTML Document.

 @author Jonathan Hedley, jonathan@hedley.net */
public class Document extends Element {
    private @Nullable Connection connection; // the connection this doc was fetched from, if any
    private OutputSettings outputSettings = new OutputSettings();
    private Parser parser; // the parser used to parse this document
    private QuirksMode quirksMode = QuirksMode.noQuirks;
    private final String location;

    /**
     Create a new, empty Document, in the specified namespace.
     @param namespace the namespace of this Document's root node.
     @param baseUri base URI of document
     @see org.jsoup.Jsoup#parse
     @see #createShell
     */
    public Document(String namespace, String baseUri) {
        this(namespace, baseUri, Parser.htmlParser()); // default HTML parser, but overridable
    }

    private Document(String namespace, String baseUri, Parser parser) {
        super(new Tag("#root", namespace), baseUri);
        this.location = baseUri;
        this.parser = parser;
    }

    /**
     Create a new, empty Document, in the HTML namespace.
     @param baseUri base URI of document
     @see org.jsoup.Jsoup#parse
     @see #Document(String namespace, String baseUri)
     */
    public Document(String baseUri) {
        this(NamespaceHtml, baseUri);
    }

    /**
     Create a valid, empty shell of an HTML document, suitable for adding more elements to.
     @param baseUri baseUri of document
     @return document with html, head, and body elements.
     */
    public static Document createShell(String baseUri) {
        Validate.notNull(baseUri);

        Document doc = new Document(baseUri);
        Element html = doc.appendElement("html");
        html.appendElement("head");
        html.appendElement("body");

        return doc;
    }

    /**
     * Get the URL this Document was parsed from. If the starting URL is a redirect,
     * this will return the final URL from which the document was served from.
     * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String).
     * @return location
     */
    public String location() {
        return location;
    }

    /**
     Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new
     default Connection object. This can be used to continue a session, preserving settings and cookies, etc.
     @return the Connection (session) associated with this Document, or an empty one otherwise.
     @see Connection#newRequest()
     */
    public Connection connection() {
        if (connection == null)
            return Jsoup.newSession();
        else
            return connection;
    }

    /**
     * Returns this Document's doctype.
     * @return document type, or null if not set
     */
    public @Nullable DocumentType documentType() {
        for (Node node : childNodes) {
            if (node instanceof DocumentType)
                return (DocumentType) node;
            else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc
                break;
        }
        return null;
        // todo - add a set document type?
    }

    /**
     Find the root HTML element, or create it if it doesn't exist.
     @return the root HTML element.
     */
    private Element htmlEl() {
        Element el = firstElementChild();
        while (el != null) {
            if (el.nameIs("html"))
                return el;
            el = el.nextElementSibling();
        }
        return appendElement("html");
    }

    /**
     Get this document's {@code head} element.
     <p>
     As a side effect, if this Document does not already have an HTML structure, it will be created. If you do not want
     that, use {@code #selectFirst("head")} instead.

     @return {@code head} element.
     */
    public Element head() {
        final Element html = htmlEl();
        Element el = html.firstElementChild();
        while (el != null) {
            if (el.nameIs("head"))
                return el;
            el = el.nextElementSibling();
        }
        return html.prependElement("head");
    }

    /**
     Get this document's {@code <body>} or {@code <frameset>} element.
     <p>
     As a <b>side-effect</b>, if this Document does not already have an HTML structure, it will be created with a {@code
     <body>} element. If you do not want that, use {@code #selectFirst("body")} instead.

     @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document
     had no contents, or the outermost {@code <frameset>} element for frameset documents.
     */
    public Element body() {
        final Element html = htmlEl();
        Element el = html.firstElementChild();
        while (el != null) {
            if (el.nameIs("body") || el.nameIs("frameset"))
                return el;
            el = el.nextElementSibling();
        }
        return html.appendElement("body");
    }

    /**
     Get each of the {@code <form>} elements contained in this document.
     @return a List of FormElement objects, which will be empty if there are none.
     @see Elements#forms()
     @see FormElement#elements()
     @since 1.15.4
     */
    public List<FormElement> forms() {
        return select("form").forms();
    }

    /**
     Selects the first {@link FormElement} in this document that matches the query. If none match, throws an
     {@link IllegalArgumentException}.
     @param cssQuery a {@link Selector} CSS query
     @return the first matching {@code <form>} element
     @throws IllegalArgumentException if no match is found
     @since 1.15.4
     */
    public FormElement expectForm(String cssQuery) {
        Elements els = select(cssQuery);
        for (Element el : els) {
            if (el instanceof FormElement) return (FormElement) el;
        }
        Validate.fail("No form elements matched the query '%s' in the document.", cssQuery);
        return null; // (not really)
    }

    /**
     Get the string contents of the document's {@code title} element.
     @return Trimmed title, or empty string if none set.
     */
    public String title() {
        // title is a preserve whitespace tag (for document output), but normalised here
        Element titleEl = head().selectFirst(titleEval);
        return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : "";
    }
    private static final Evaluator titleEval = new Evaluator.Tag("title");

    /**
     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
     not present
     @param title string to set as title
     */
    public void title(String title) {
        Validate.notNull(title);
        final Element head = head(); // resolve (or create) the head once, rather than walking the tree twice
        Element titleEl = head.selectFirst(titleEval);
        if (titleEl == null) // add to head
            titleEl = head.appendElement("title");
        titleEl.text(title);
    }

    /**
     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
     @param tagName element tag name (e.g. {@code a})
     @return new element
     */
    public Element createElement(String tagName) {
        return new Element(parser.tagSet().valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri());
    }

    @Override
    public String outerHtml() {
        return super.html(); // no outer wrapper tag
    }

    /**
     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
     @param text un-encoded text
     @return this document
     */
    @Override
    public Element text(String text) {
        body().text(text); // overridden to not nuke doc structure
        return this;
    }

    @Override
    public String nodeName() {
        return "#document";
    }

    /**
     Set the output character set of this Document. This method is equivalent to
     {@link OutputSettings#charset(java.nio.charset.Charset) OutputSettings.charset(Charset)}, but additionally adds or
     updates the charset / encoding element within the Document.

     <p>If there's no existing element with charset / encoding information yet, one will
     be created. Obsolete charset / encoding definitions are removed.</p>

     <p><b>Elements used:</b></p>

     <ul>
     <li><b>HTML:</b> {@code <meta charset="CHARSET">}</li>
     <li><b>XML:</b> {@code <?xml version="1.0" encoding="CHARSET">}</li>
     </ul>

     @param charset Charset; must not be null
     @see OutputSettings#charset(java.nio.charset.Charset)
     */
    public void charset(Charset charset) {
        outputSettings.charset(charset);
        ensureMetaCharsetElement();
    }

    /**
     Get the output character set of this Document. This method is equivalent to {@link OutputSettings#charset()}.

     @return the current Charset
     @see OutputSettings#charset()
     */
    public Charset charset() {
        return outputSettings.charset();
    }

    /**
     @deprecated this setting has no effect; the meta charset element is always updated when
     {@link Document#charset(Charset)} is called. This method will be removed in jsoup 1.20.1.
     */
    @Deprecated
    public void updateMetaCharsetElement(boolean noop) {}

    /**
     @deprecated this setting has no effect; the meta charset element is always updated when
     {@link Document#charset(Charset)} is called. This method will be removed in jsoup 1.20.1.
     */
    @Deprecated
    public boolean updateMetaCharsetElement() {
        return true;
    }

    @Override
    public Document clone() {
        Document clone = (Document) super.clone();
        if (attributes != null) clone.attributes = attributes.clone();
        clone.outputSettings = this.outputSettings.clone();
        // parser is pointer copy
        return clone;
    }

    @Override
    public Document shallowClone() {
        Document clone = new Document(this.tag().namespace(), baseUri(), parser); // preserves parser pointer
        if (attributes != null) clone.attributes = attributes.clone();
        clone.outputSettings = this.outputSettings.clone();
        return clone;
    }

    /**
     Writes the current output charset into the document: a {@code meta[charset]} element when the output syntax is
     HTML, or the leading XML declaration when the output syntax is XML.
     */
    private void ensureMetaCharsetElement() {
        OutputSettings.Syntax syntax = outputSettings().syntax();

        if (syntax == OutputSettings.Syntax.html) {
            Element metaCharset = selectFirst("meta[charset]");
            if (metaCharset != null) {
                metaCharset.attr("charset", charset().displayName());
            } else {
                head().appendElement("meta").attr("charset", charset().displayName());
            }
            select("meta[name=charset]").remove(); // Remove obsolete elements
        } else if (syntax == OutputSettings.Syntax.xml) {
            XmlDeclaration decl = ensureXmlDecl();
            decl.attr("version", "1.0");
            decl.attr("encoding", charset().displayName());
        }
    }

    /**
     Returns the {@code xml} declaration if it is this document's first child node; otherwise creates one and prepends
     it to the document.
     */
    private XmlDeclaration ensureXmlDecl() {
        Node node = firstChild();
        if (node instanceof XmlDeclaration) {
            XmlDeclaration decl = (XmlDeclaration) node;
            if (decl.name().equals("xml")) return decl;
        }
        XmlDeclaration decl = new XmlDeclaration("xml", false);
        prependChild(decl);
        return decl;
    }

    /**
     * A Document's output settings control the form of the text() and html() methods.
     */
    public static class OutputSettings implements Cloneable {
        /**
         * The output serialization syntax.
         */
        public enum Syntax {html, xml}
        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
        private Charset charset = DataUtil.UTF_8;
        private boolean prettyPrint = true;
        private boolean outline = false;
        private int indentAmount = 1;
        private int maxPaddingWidth = 30;
        private Syntax syntax = Syntax.html;

        /**
         Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing,
         indent amount of 1).
         */
        public OutputSettings() {
        }

        /**
         Get the document's current entity escape mode:
         <ul>
         <li><code>xhtml</code>, the minimal named entities in XHTML / XML</li>
         <li><code>base</code>, which provides a limited set of named HTML
         entities and escapes other characters as numbered entities for maximum compatibility</li>
         <li><code>extended</code>,
         which uses the complete set of HTML named entities.</li>
         </ul>
         <p>The default escape mode is <code>base</code>.
         @return the document's current escape mode
         */
        public Entities.EscapeMode escapeMode() {
            return escapeMode;
        }

        /**
         * Set the document's escape mode, which determines how characters are escaped when the output character set
         * does not support a given character:- using either a named or a numbered escape.
         * @param escapeMode the new escape mode to use
         * @return the document's output settings, for chaining
         */
        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
            this.escapeMode = escapeMode;
            return this;
        }

        /**
         * Get the document's current output charset, which is used to control which characters are escaped when
         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
         * <p>
         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
         * input charset. Otherwise, it defaults to UTF-8.
         * @return the document's current charset.
         */
        public Charset charset() {
            return charset;
        }

        /**
         * Update the document's output charset.
         * @param charset the new charset to use; must not be null.
         * @return the document's output settings, for chaining
         */
        public OutputSettings charset(Charset charset) {
            Validate.notNull(charset); // fail here rather than with a later NullPointerException when the charset is read
            this.charset = charset;
            return this;
        }

        /**
         * Update the document's output charset.
         * @param charset the new charset (by name) to use.
         * @return the document's output settings, for chaining
         */
        public OutputSettings charset(String charset) {
            charset(Charset.forName(charset));
            return this;
        }

        /**
         * Get the document's current output syntax.
         * @return current syntax
         */
        public Syntax syntax() {
            return syntax;
        }

        /**
         * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
         * {@code xml}, with self-closing tags.
         * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is
         * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p>
         * @param syntax serialization syntax
         * @return the document's output settings, for chaining
         */
        public OutputSettings syntax(Syntax syntax) {
            this.syntax = syntax;
            if (syntax == Syntax.xml)
                this.escapeMode(Entities.EscapeMode.xhtml);
            return this;
        }

        /**
         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
         * the output, and the output will generally look like the input.
         * @return if pretty printing is enabled.
         */
        public boolean prettyPrint() {
            return prettyPrint;
        }

        /**
         * Enable or disable pretty printing.
         * @param pretty new pretty print setting
         * @return this, for chaining
         */
        public OutputSettings prettyPrint(boolean pretty) {
            prettyPrint = pretty;
            return this;
        }

        /**
         * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
         * all tags as block.
         * @return if outline mode is enabled.
         */
        public boolean outline() {
            return outline;
        }

        /**
         * Enable or disable HTML outline mode.
         * @param outlineMode new outline setting
         * @return this, for chaining
         */
        public OutputSettings outline(boolean outlineMode) {
            outline = outlineMode;
            return this;
        }

        /**
         * Get the current tag indent amount, used when pretty printing.
         * @return the current indent amount
         */
        public int indentAmount() {
            return indentAmount;
        }

        /**
         * Set the indent amount for pretty printing
         * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0.
         * @return this, for chaining
         */
        public OutputSettings indentAmount(int indentAmount) {
            Validate.isTrue(indentAmount >= 0);
            this.indentAmount = indentAmount;
            return this;
        }

        /**
         * Get the current max padding amount, used when pretty printing
         * so very deeply nested nodes don't get insane padding amounts.
         * @return the current max padding width; the default is 30
         */
        public int maxPaddingWidth() {
            return maxPaddingWidth;
        }

        /**
         * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts.
         * @param maxPaddingWidth the maximum padding width to apply to nested nodes. Must be {@literal >=} -1.
         *        Default is 30 and -1 means unlimited.
         * @return this, for chaining
         */
        public OutputSettings maxPaddingWidth(int maxPaddingWidth) {
            Validate.isTrue(maxPaddingWidth >= -1);
            this.maxPaddingWidth = maxPaddingWidth;
            return this;
        }

        /**
         * Create a copy of these settings. Changes made to the copy through its setters do not affect this object.
         * @return a new OutputSettings holding the same values
         */
        @Override
        public OutputSettings clone() {
            try {
                // Every field is a primitive, an enum constant, or a Charset (immutable), so the field-by-field copy
                // made by Object.clone() is already fully independent. Re-resolving the charset by name is not needed,
                // and would throw for a Charset instance that Charset.forName cannot look up.
                return (OutputSettings) super.clone();
            } catch (CloneNotSupportedException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Get the document's current output settings.
     * @return the document's current output settings.
     */
    public OutputSettings outputSettings() {
        return outputSettings;
    }

    /**
     * Set the document's output settings.
     * @param outputSettings new output settings.
     * @return this document, for chaining.
     */
    public Document outputSettings(OutputSettings outputSettings) {
        Validate.notNull(outputSettings);
        this.outputSettings = outputSettings;
        return this;
    }

    /**
     * The quirks mode recorded for a Document. A new Document starts as {@code noQuirks}.
     */
    public enum QuirksMode {
        noQuirks, quirks, limitedQuirks
    }

    /**
     * Get the quirks mode recorded for this document.
     * @return the current quirks mode
     */
    public QuirksMode quirksMode() {
        return quirksMode;
    }

    /**
     * Set the quirks mode recorded for this document.
     * @param quirksMode the new quirks mode
     * @return this document, for chaining
     */
    public Document quirksMode(QuirksMode quirksMode) {
        this.quirksMode = quirksMode;
        return this;
    }

    /**
     * Get the parser that was used to parse this document.
     * @return the parser
     */
    public Parser parser() {
        return parser;
    }

    /**
     * Set the parser used to create this document. This parser is then used when further parsing within this document
     * is required.
     * @param parser the configured parser to use when further parsing is required for this document; must not be null.
     * @return this document, for chaining.
     */
    public Document parser(Parser parser) {
        Validate.notNull(parser); // createElement dereferences the parser, so reject null up front
        this.parser = parser;
        return this;
    }

    /**
     Set the Connection used to fetch this document. This Connection is used as a session object when further requests are
     made (e.g. when a form is submitted).

     @param connection to set
     @return this document, for chaining
     @see Connection#newRequest()
     @since 1.14.1
     */
    public Document connection(Connection connection) {
        Validate.notNull(connection);
        this.connection = connection;
        return this;
    }
}