001package org.jsoup.nodes; 002 003import org.jsoup.Connection; 004import org.jsoup.Jsoup; 005import org.jsoup.helper.DataUtil; 006import org.jsoup.helper.Validate; 007import org.jsoup.internal.StringUtil; 008import org.jsoup.parser.ParseSettings; 009import org.jsoup.parser.Parser; 010import org.jsoup.parser.Tag; 011import org.jsoup.select.Elements; 012import org.jsoup.select.Evaluator; 013import org.jsoup.select.Selector; 014import org.jspecify.annotations.Nullable; 015 016import java.nio.charset.Charset; 017import java.util.List; 018 019import static org.jsoup.parser.Parser.NamespaceHtml; 020 021/** 022 A HTML Document. 023 024 @author Jonathan Hedley, jonathan@hedley.net */ 025public class Document extends Element { 026 private @Nullable Connection connection; // the connection this doc was fetched from, if any 027 private OutputSettings outputSettings = new OutputSettings(); 028 private Parser parser; // the parser used to parse this document 029 private QuirksMode quirksMode = QuirksMode.noQuirks; 030 private final String location; 031 032 /** 033 Create a new, empty Document, in the specified namespace. 034 @param namespace the namespace of this Document's root node. 035 @param baseUri base URI of document 036 @see org.jsoup.Jsoup#parse 037 @see #createShell 038 */ 039 public Document(String namespace, String baseUri) { 040 super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri); 041 this.location = baseUri; 042 this.parser = Parser.htmlParser(); // default, but overridable 043 } 044 045 /** 046 Create a new, empty Document, in the HTML namespace. 047 @param baseUri base URI of document 048 @see org.jsoup.Jsoup#parse 049 @see #Document(String namespace, String baseUri) 050 */ 051 public Document(String baseUri) { 052 this(NamespaceHtml, baseUri); 053 } 054 055 /** 056 Create a valid, empty shell of an HTML document, suitable for adding more elements to. 057 @param baseUri baseUri of document 058 @return document with html, head, and body elements. 059 */ 060 public static Document createShell(String baseUri) { 061 Validate.notNull(baseUri); 062 063 Document doc = new Document(baseUri); 064 Element html = doc.appendElement("html"); 065 html.appendElement("head"); 066 html.appendElement("body"); 067 068 return doc; 069 } 070 071 /** 072 * Get the URL this Document was parsed from. If the starting URL is a redirect, 073 * this will return the final URL from which the document was served from. 074 * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String). 075 * @return location 076 */ 077 public String location() { 078 return location; 079 } 080 081 /** 082 Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new 083 default Connection object. This can be used to continue a session, preserving settings and cookies, etc. 084 @return the Connection (session) associated with this Document, or an empty one otherwise. 085 @see Connection#newRequest() 086 */ 087 public Connection connection() { 088 if (connection == null) 089 return Jsoup.newSession(); 090 else 091 return connection; 092 } 093 094 /** 095 * Returns this Document's doctype. 096 * @return document type, or null if not set 097 */ 098 public @Nullable DocumentType documentType() { 099 for (Node node : childNodes) { 100 if (node instanceof DocumentType) 101 return (DocumentType) node; 102 else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc 103 break; 104 } 105 return null; 106 // todo - add a set document type? 107 } 108 109 /** 110 Find the root HTML element, or create it if it doesn't exist. 111 @return the root HTML element. 112 */ 113 private Element htmlEl() { 114 Element el = firstElementChild(); 115 while (el != null) { 116 if (el.nameIs("html")) 117 return el; 118 el = el.nextElementSibling(); 119 } 120 return appendElement("html"); 121 } 122 123 /** 124 Get this document's {@code head} element. 125 <p> 126 As a side effect, if this Document does not already have an HTML structure, it will be created. If you do not want 127 that, use {@code #selectFirst("head")} instead. 128 129 @return {@code head} element. 130 */ 131 public Element head() { 132 final Element html = htmlEl(); 133 Element el = html.firstElementChild(); 134 while (el != null) { 135 if (el.nameIs("head")) 136 return el; 137 el = el.nextElementSibling(); 138 } 139 return html.prependElement("head"); 140 } 141 142 /** 143 Get this document's {@code <body>} or {@code <frameset>} element. 144 <p> 145 As a <b>side-effect</b>, if this Document does not already have an HTML structure, it will be created with a {@code 146 <body>} element. If you do not want that, use {@code #selectFirst("body")} instead. 147 148 @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document 149 had no contents, or the outermost {@code <frameset> element} for frameset documents. 150 */ 151 public Element body() { 152 final Element html = htmlEl(); 153 Element el = html.firstElementChild(); 154 while (el != null) { 155 if (el.nameIs("body") || el.nameIs("frameset")) 156 return el; 157 el = el.nextElementSibling(); 158 } 159 return html.appendElement("body"); 160 } 161 162 /** 163 Get each of the {@code <form>} elements contained in this document. 164 @return a List of FormElement objects, which will be empty if there are none. 165 @see Elements#forms() 166 @see FormElement#elements() 167 @since 1.15.4 168 */ 169 public List<FormElement> forms() { 170 return select("form").forms(); 171 } 172 173 /** 174 Selects the first {@link FormElement} in this document that matches the query. If none match, throws an 175 {@link IllegalArgumentException}. 176 @param cssQuery a {@link Selector} CSS query 177 @return the first matching {@code <form>} element 178 @throws IllegalArgumentException if no match is found 179 @since 1.15.4 180 */ 181 public FormElement expectForm(String cssQuery) { 182 Elements els = select(cssQuery); 183 for (Element el : els) { 184 if (el instanceof FormElement) return (FormElement) el; 185 } 186 Validate.fail("No form elements matched the query '%s' in the document.", cssQuery); 187 return null; // (not really) 188 } 189 190 /** 191 Get the string contents of the document's {@code title} element. 192 @return Trimmed title, or empty string if none set. 193 */ 194 public String title() { 195 // title is a preserve whitespace tag (for document output), but normalised here 196 Element titleEl = head().selectFirst(titleEval); 197 return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : ""; 198 } 199 private static final Evaluator titleEval = new Evaluator.Tag("title"); 200 201 /** 202 Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if 203 not present 204 @param title string to set as title 205 */ 206 public void title(String title) { 207 Validate.notNull(title); 208 Element titleEl = head().selectFirst(titleEval); 209 if (titleEl == null) // add to head 210 titleEl = head().appendElement("title"); 211 titleEl.text(title); 212 } 213 214 /** 215 Create a new Element, with this document's base uri. Does not make the new element a child of this document. 216 @param tagName element tag name (e.g. {@code a}) 217 @return new element 218 */ 219 public Element createElement(String tagName) { 220 return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri()); 221 } 222 223 @Override 224 public String outerHtml() { 225 return super.html(); // no outer wrapper tag 226 } 227 228 /** 229 Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared. 230 @param text un-encoded text 231 @return this document 232 */ 233 @Override 234 public Element text(String text) { 235 body().text(text); // overridden to not nuke doc structure 236 return this; 237 } 238 239 @Override 240 public String nodeName() { 241 return "#document"; 242 } 243 244 /** 245 Set the output character set of this Document. This method is equivalent to 246 {@link OutputSettings#charset(java.nio.charset.Charset) OutputSettings.charset(Charset)}, but additionally adds or 247 updates the charset / encoding element within the Document. 248 249 <p>If there's no existing element with charset / encoding information yet, one will 250 be created. Obsolete charset / encoding definitions are removed.</p> 251 252 <p><b>Elements used:</b></p> 253 254 <ul> 255 <li><b>HTML:</b> <i><meta charset="CHARSET"></i></li> 256 <li><b>XML:</b> <i><?xml version="1.0" encoding="CHARSET"></i></li> 257 </ul> 258 259 @param charset Charset 260 @see OutputSettings#charset(java.nio.charset.Charset) 261 */ 262 public void charset(Charset charset) { 263 outputSettings.charset(charset); 264 ensureMetaCharsetElement(); 265 } 266 267 /** 268 Get the output character set of this Document. This method is equivalent to {@link OutputSettings#charset()}. 269 270 @return the current Charset 271 @see OutputSettings#charset() 272 */ 273 public Charset charset() { 274 return outputSettings.charset(); 275 } 276 277 /** 278 @deprecated this setting has no effect; the meta charset element is always updated when 279 {@link Document#charset(Charset)} is called. This method will be removed in jsoup 1.20.1. 280 */ 281 @Deprecated 282 public void updateMetaCharsetElement(boolean noop) {} 283 284 /** 285 @deprecated this setting has no effect; the meta charset element is always updated when 286 {@link Document#charset(Charset)} is called. This method will be removed in jsoup 1.20.1. 287 */ 288 @Deprecated 289 public boolean updateMetaCharsetElement() { 290 return true; 291 } 292 293 @Override 294 public Document clone() { 295 Document clone = (Document) super.clone(); 296 clone.outputSettings = this.outputSettings.clone(); 297 clone.parser = this.parser.clone(); 298 return clone; 299 } 300 301 @Override 302 public Document shallowClone() { 303 Document clone = new Document(this.tag().namespace(), baseUri()); 304 if (attributes != null) 305 clone.attributes = attributes.clone(); 306 clone.outputSettings = this.outputSettings.clone(); 307 return clone; 308 } 309 310 311 private void ensureMetaCharsetElement() { 312 OutputSettings.Syntax syntax = outputSettings().syntax(); 313 314 if (syntax == OutputSettings.Syntax.html) { 315 Element metaCharset = selectFirst("meta[charset]"); 316 if (metaCharset != null) { 317 metaCharset.attr("charset", charset().displayName()); 318 } else { 319 head().appendElement("meta").attr("charset", charset().displayName()); 320 } 321 select("meta[name=charset]").remove(); // Remove obsolete elements 322 } else if (syntax == OutputSettings.Syntax.xml) { 323 XmlDeclaration decl = ensureXmlDecl(); 324 decl.attr("version", "1.0"); 325 decl.attr("encoding", charset().displayName()); 326 } 327 } 328 329 private XmlDeclaration ensureXmlDecl() { 330 Node node = firstChild(); 331 if (node instanceof XmlDeclaration) { 332 XmlDeclaration decl = (XmlDeclaration) node; 333 if (decl.name().equals("xml")) return decl; 334 } 335 XmlDeclaration decl = new XmlDeclaration("xml", false); 336 prependChild(decl); 337 return decl; 338 } 339 340 341 /** 342 * A Document's output settings control the form of the text() and html() methods. 343 */ 344 public static class OutputSettings implements Cloneable { 345 /** 346 * The output serialization syntax. 347 */ 348 public enum Syntax {html, xml} 349 private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; 350 private Charset charset = DataUtil.UTF_8; 351 private boolean prettyPrint = true; 352 private boolean outline = false; 353 private int indentAmount = 1; 354 private int maxPaddingWidth = 30; 355 private Syntax syntax = Syntax.html; 356 357 /** 358 Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing, 359 indent amount of 1). 360 */ 361 public OutputSettings() { 362 } 363 364 /** 365 Get the document's current entity escape mode: 366 <ul> 367 <li><code>xhtml</code>, the minimal named entities in XHTML / XML</li> 368 <li><code>base</code>, which provides a limited set of named HTML 369 entities and escapes other characters as numbered entities for maximum compatibility</li> 370 <li><code>extended</code>, 371 which uses the complete set of HTML named entities.</li> 372 </ul> 373 <p>The default escape mode is <code>base</code>. 374 @return the document's current escape mode 375 */ 376 public Entities.EscapeMode escapeMode() { 377 return escapeMode; 378 } 379 380 /** 381 * Set the document's escape mode, which determines how characters are escaped when the output character set 382 * does not support a given character:- using either a named or a numbered escape. 383 * @param escapeMode the new escape mode to use 384 * @return the document's output settings, for chaining 385 */ 386 public OutputSettings escapeMode(Entities.EscapeMode escapeMode) { 387 this.escapeMode = escapeMode; 388 return this; 389 } 390 391 /** 392 * Get the document's current output charset, which is used to control which characters are escaped when 393 * generating HTML (via the <code>html()</code> methods), and which are kept intact. 394 * <p> 395 * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the 396 * input charset. Otherwise, it defaults to UTF-8. 397 * @return the document's current charset. 398 */ 399 public Charset charset() { 400 return charset; 401 } 402 403 /** 404 * Update the document's output charset. 405 * @param charset the new charset to use. 406 * @return the document's output settings, for chaining 407 */ 408 public OutputSettings charset(Charset charset) { 409 this.charset = charset; 410 return this; 411 } 412 413 /** 414 * Update the document's output charset. 415 * @param charset the new charset (by name) to use. 416 * @return the document's output settings, for chaining 417 */ 418 public OutputSettings charset(String charset) { 419 charset(Charset.forName(charset)); 420 return this; 421 } 422 423 /** 424 * Get the document's current output syntax. 425 * @return current syntax 426 */ 427 public Syntax syntax() { 428 return syntax; 429 } 430 431 /** 432 * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or 433 * {@code xml}, with self-closing tags. 434 * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is 435 * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p> 436 * @param syntax serialization syntax 437 * @return the document's output settings, for chaining 438 */ 439 public OutputSettings syntax(Syntax syntax) { 440 this.syntax = syntax; 441 if (syntax == Syntax.xml) 442 this.escapeMode(Entities.EscapeMode.xhtml); 443 return this; 444 } 445 446 /** 447 * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format 448 * the output, and the output will generally look like the input. 449 * @return if pretty printing is enabled. 450 */ 451 public boolean prettyPrint() { 452 return prettyPrint; 453 } 454 455 /** 456 * Enable or disable pretty printing. 457 * @param pretty new pretty print setting 458 * @return this, for chaining 459 */ 460 public OutputSettings prettyPrint(boolean pretty) { 461 prettyPrint = pretty; 462 return this; 463 } 464 465 /** 466 * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider 467 * all tags as block. 468 * @return if outline mode is enabled. 469 */ 470 public boolean outline() { 471 return outline; 472 } 473 474 /** 475 * Enable or disable HTML outline mode. 476 * @param outlineMode new outline setting 477 * @return this, for chaining 478 */ 479 public OutputSettings outline(boolean outlineMode) { 480 outline = outlineMode; 481 return this; 482 } 483 484 /** 485 * Get the current tag indent amount, used when pretty printing. 486 * @return the current indent amount 487 */ 488 public int indentAmount() { 489 return indentAmount; 490 } 491 492 /** 493 * Set the indent amount for pretty printing 494 * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0. 495 * @return this, for chaining 496 */ 497 public OutputSettings indentAmount(int indentAmount) { 498 Validate.isTrue(indentAmount >= 0); 499 this.indentAmount = indentAmount; 500 return this; 501 } 502 503 /** 504 * Get the current max padding amount, used when pretty printing 505 * so very deeply nested nodes don't get insane padding amounts. 506 * @return the current indent amount 507 */ 508 public int maxPaddingWidth() { 509 return maxPaddingWidth; 510 } 511 512 /** 513 * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts. 514 * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1. 515 * Default is 30 and -1 means unlimited. 516 * @return this, for chaining 517 */ 518 public OutputSettings maxPaddingWidth(int maxPaddingWidth) { 519 Validate.isTrue(maxPaddingWidth >= -1); 520 this.maxPaddingWidth = maxPaddingWidth; 521 return this; 522 } 523 524 @Override 525 public OutputSettings clone() { 526 OutputSettings clone; 527 try { 528 clone = (OutputSettings) super.clone(); 529 } catch (CloneNotSupportedException e) { 530 throw new RuntimeException(e); 531 } 532 clone.charset(charset.name()); // new charset, coreCharset, and charset encoder 533 clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name()); 534 // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle 535 return clone; 536 } 537 } 538 539 /** 540 * Get the document's current output settings. 541 * @return the document's current output settings. 542 */ 543 public OutputSettings outputSettings() { 544 return outputSettings; 545 } 546 547 /** 548 * Set the document's output settings. 549 * @param outputSettings new output settings. 550 * @return this document, for chaining. 551 */ 552 public Document outputSettings(OutputSettings outputSettings) { 553 Validate.notNull(outputSettings); 554 this.outputSettings = outputSettings; 555 return this; 556 } 557 558 public enum QuirksMode { 559 noQuirks, quirks, limitedQuirks 560 } 561 562 public QuirksMode quirksMode() { 563 return quirksMode; 564 } 565 566 public Document quirksMode(QuirksMode quirksMode) { 567 this.quirksMode = quirksMode; 568 return this; 569 } 570 571 /** 572 * Get the parser that was used to parse this document. 573 * @return the parser 574 */ 575 public Parser parser() { 576 return parser; 577 } 578 579 /** 580 * Set the parser used to create this document. This parser is then used when further parsing within this document 581 * is required. 582 * @param parser the configured parser to use when further parsing is required for this document. 583 * @return this document, for chaining. 584 */ 585 public Document parser(Parser parser) { 586 this.parser = parser; 587 return this; 588 } 589 590 /** 591 Set the Connection used to fetch this document. This Connection is used as a session object when further requests are 592 made (e.g. when a form is submitted). 593 594 @param connection to set 595 @return this document, for chaining 596 @see Connection#newRequest() 597 @since 1.14.1 598 */ 599 public Document connection(Connection connection) { 600 Validate.notNull(connection); 601 this.connection = connection; 602 return this; 603 } 604}