001package org.jsoup.nodes; 002 003import org.jsoup.Connection; 004import org.jsoup.Jsoup; 005import org.jsoup.helper.DataUtil; 006import org.jsoup.helper.Validate; 007import org.jsoup.internal.StringUtil; 008import org.jsoup.parser.ParseSettings; 009import org.jsoup.parser.Parser; 010import org.jsoup.parser.Tag; 011import org.jsoup.select.Elements; 012import org.jsoup.select.Evaluator; 013import org.jsoup.select.Selector; 014import org.jspecify.annotations.Nullable; 015 016import java.nio.charset.Charset; 017import java.util.List; 018 019import static org.jsoup.parser.Parser.NamespaceHtml; 020 021/** 022 A HTML Document. 023 024 @author Jonathan Hedley, jonathan@hedley.net */ 025public class Document extends Element { 026 private @Nullable Connection connection; // the connection this doc was fetched from, if any 027 private OutputSettings outputSettings = new OutputSettings(); 028 private Parser parser; // the parser used to parse this document 029 private QuirksMode quirksMode = QuirksMode.noQuirks; 030 private final String location; 031 032 /** 033 Create a new, empty Document, in the specified namespace. 034 @param namespace the namespace of this Document's root node. 035 @param baseUri base URI of document 036 @see org.jsoup.Jsoup#parse 037 @see #createShell 038 */ 039 public Document(String namespace, String baseUri) { 040 this(namespace, baseUri, Parser.htmlParser()); // default HTML parser, but overridable 041 } 042 043 private Document(String namespace, String baseUri, Parser parser) { 044 super(new Tag("#root", namespace), baseUri); 045 this.location = baseUri; 046 this.parser = parser; 047 } 048 049 /** 050 Create a new, empty Document, in the HTML namespace. 051 @param baseUri base URI of document 052 @see org.jsoup.Jsoup#parse 053 @see #Document(String namespace, String baseUri) 054 */ 055 public Document(String baseUri) { 056 this(NamespaceHtml, baseUri); 057 } 058 059 /** 060 Create a valid, empty shell of an HTML document, suitable for adding more elements to. 061 @param baseUri baseUri of document 062 @return document with html, head, and body elements. 063 */ 064 public static Document createShell(String baseUri) { 065 Validate.notNull(baseUri); 066 067 Document doc = new Document(baseUri); 068 Element html = doc.appendElement("html"); 069 html.appendElement("head"); 070 html.appendElement("body"); 071 072 return doc; 073 } 074 075 /** 076 * Get the URL this Document was parsed from. If the starting URL is a redirect, 077 * this will return the final URL from which the document was served from. 078 * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String). 079 * @return location 080 */ 081 public String location() { 082 return location; 083 } 084 085 /** 086 Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new 087 default Connection object. This can be used to continue a session, preserving settings and cookies, etc. 088 @return the Connection (session) associated with this Document, or an empty one otherwise. 089 @see Connection#newRequest() 090 */ 091 public Connection connection() { 092 if (connection == null) 093 return Jsoup.newSession(); 094 else 095 return connection; 096 } 097 098 /** 099 * Returns this Document's doctype. 100 * @return document type, or null if not set 101 */ 102 public @Nullable DocumentType documentType() { 103 for (Node node : childNodes) { 104 if (node instanceof DocumentType) 105 return (DocumentType) node; 106 else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc 107 break; 108 } 109 return null; 110 } 111 112 /** 113 Find the root HTML element, or create it if it doesn't exist. 114 @return the root HTML element. 115 */ 116 private Element htmlEl() { 117 Element el = firstElementChild(); 118 while (el != null) { 119 if (el.nameIs("html")) 120 return el; 121 el = el.nextElementSibling(); 122 } 123 return appendElement("html"); 124 } 125 126 /** 127 Get this document's {@code head} element. 128 <p> 129 As a side effect, if this Document does not already have an HTML structure, it will be created. If you do not want 130 that, use {@code #selectFirst("head")} instead. 131 132 @return {@code head} element. 133 */ 134 public Element head() { 135 final Element html = htmlEl(); 136 Element el = html.firstElementChild(); 137 while (el != null) { 138 if (el.nameIs("head")) 139 return el; 140 el = el.nextElementSibling(); 141 } 142 return html.prependElement("head"); 143 } 144 145 /** 146 Get this document's {@code <body>} or {@code <frameset>} element. 147 <p> 148 As a <b>side-effect</b>, if this Document does not already have an HTML structure, it will be created with a {@code 149 <body>} element. If you do not want that, use {@code #selectFirst("body")} instead. 150 151 @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document 152 had no contents, or the outermost {@code <frameset> element} for frameset documents. 153 */ 154 public Element body() { 155 final Element html = htmlEl(); 156 Element el = html.firstElementChild(); 157 while (el != null) { 158 if (el.nameIs("body") || el.nameIs("frameset")) 159 return el; 160 el = el.nextElementSibling(); 161 } 162 return html.appendElement("body"); 163 } 164 165 /** 166 Get each of the {@code <form>} elements contained in this document. 167 @return a List of FormElement objects, which will be empty if there are none. 168 @see Elements#forms() 169 @see FormElement#elements() 170 @since 1.15.4 171 */ 172 public List<FormElement> forms() { 173 return select("form").forms(); 174 } 175 176 /** 177 Selects the first {@link FormElement} in this document that matches the query. If none match, throws an 178 {@link IllegalArgumentException}. 179 @param cssQuery a {@link Selector} CSS query 180 @return the first matching {@code <form>} element 181 @throws IllegalArgumentException if no match is found 182 @since 1.15.4 183 */ 184 public FormElement expectForm(String cssQuery) { 185 Elements els = select(cssQuery); 186 for (Element el : els) { 187 if (el instanceof FormElement) return (FormElement) el; 188 } 189 Validate.fail("No form elements matched the query '%s' in the document.", cssQuery); 190 return null; // (not really) 191 } 192 193 /** 194 Get the string contents of the document's {@code title} element. 195 @return Trimmed title, or empty string if none set. 196 */ 197 public String title() { 198 // title is a preserve whitespace tag (for document output), but normalised here 199 Element titleEl = head().selectFirst(titleEval); 200 return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : ""; 201 } 202 private static final Evaluator titleEval = new Evaluator.Tag("title"); 203 204 /** 205 Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if 206 not present 207 @param title string to set as title 208 */ 209 public void title(String title) { 210 Validate.notNull(title); 211 Element titleEl = head().selectFirst(titleEval); 212 if (titleEl == null) // add to head 213 titleEl = head().appendElement("title"); 214 titleEl.text(title); 215 } 216 217 /** 218 Create a new Element, with this document's base uri. Does not make the new element a child of this document. 219 @param tagName element tag name (e.g. {@code a}) 220 @return new element 221 */ 222 public Element createElement(String tagName) { 223 return new Element(parser.tagSet().valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri()); 224 } 225 226 @Override 227 public String outerHtml() { 228 return super.html(); // no outer wrapper tag 229 } 230 231 /** 232 Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared. 233 @param text un-encoded text 234 @return this document 235 */ 236 @Override 237 public Element text(String text) { 238 body().text(text); // overridden to not nuke doc structure 239 return this; 240 } 241 242 @Override 243 public String nodeName() { 244 return "#document"; 245 } 246 247 /** 248 Set the output character set of this Document. This method is equivalent to 249 {@link OutputSettings#charset(java.nio.charset.Charset) OutputSettings.charset(Charset)}, but additionally adds or 250 updates the charset / encoding element within the Document. 251 252 <p>If there's no existing element with charset / encoding information yet, one will 253 be created. Obsolete charset / encoding definitions are removed.</p> 254 255 <p><b>Elements used:</b></p> 256 257 <ul> 258 <li><b>HTML:</b> <i><meta charset="CHARSET"></i></li> 259 <li><b>XML:</b> <i><?xml version="1.0" encoding="CHARSET"></i></li> 260 </ul> 261 262 @param charset Charset 263 @see OutputSettings#charset(java.nio.charset.Charset) 264 */ 265 public void charset(Charset charset) { 266 outputSettings.charset(charset); 267 ensureMetaCharsetElement(); 268 } 269 270 /** 271 Get the output character set of this Document. This method is equivalent to {@link OutputSettings#charset()}. 272 273 @return the current Charset 274 @see OutputSettings#charset() 275 */ 276 public Charset charset() { 277 return outputSettings.charset(); 278 } 279 280 @Override 281 public Document clone() { 282 Document clone = (Document) super.clone(); 283 if (attributes != null) clone.attributes = attributes.clone(); 284 clone.outputSettings = this.outputSettings.clone(); 285 // parser is pointer copy 286 return clone; 287 } 288 289 @Override 290 public Document shallowClone() { 291 Document clone = new Document(this.tag().namespace(), baseUri(), parser); // preserves parser pointer 292 if (attributes != null) clone.attributes = attributes.clone(); 293 clone.outputSettings = this.outputSettings.clone(); 294 return clone; 295 } 296 297 298 private void ensureMetaCharsetElement() { 299 OutputSettings.Syntax syntax = outputSettings().syntax(); 300 301 if (syntax == OutputSettings.Syntax.html) { 302 Element metaCharset = selectFirst("meta[charset]"); 303 if (metaCharset != null) { 304 metaCharset.attr("charset", charset().displayName()); 305 } else { 306 head().appendElement("meta").attr("charset", charset().displayName()); 307 } 308 select("meta[name=charset]").remove(); // Remove obsolete elements 309 } else if (syntax == OutputSettings.Syntax.xml) { 310 XmlDeclaration decl = ensureXmlDecl(); 311 decl.attr("version", "1.0"); 312 decl.attr("encoding", charset().displayName()); 313 } 314 } 315 316 private XmlDeclaration ensureXmlDecl() { 317 Node node = firstChild(); 318 if (node instanceof XmlDeclaration) { 319 XmlDeclaration decl = (XmlDeclaration) node; 320 if (decl.name().equals("xml")) return decl; 321 } 322 XmlDeclaration decl = new XmlDeclaration("xml", false); 323 prependChild(decl); 324 return decl; 325 } 326 327 328 /** 329 * A Document's output settings control the form of the text() and html() methods. 330 */ 331 public static class OutputSettings implements Cloneable { 332 /** 333 * The output serialization syntax. 334 */ 335 public enum Syntax {html, xml} 336 private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; 337 private Charset charset = DataUtil.UTF_8; 338 private boolean prettyPrint = true; 339 private boolean outline = false; 340 private int indentAmount = 1; 341 private int maxPaddingWidth = 30; 342 private Syntax syntax = Syntax.html; 343 344 /** 345 Create a new OutputSettings object, with the default settings (UTF-8, HTML, EscapeMode.base, pretty-printing, 346 indent amount of 1). 347 */ 348 public OutputSettings() { 349 } 350 351 /** 352 Get the document's current entity escape mode: 353 <ul> 354 <li><code>xhtml</code>, the minimal named entities in XHTML / XML</li> 355 <li><code>base</code>, which provides a limited set of named HTML 356 entities and escapes other characters as numbered entities for maximum compatibility</li> 357 <li><code>extended</code>, 358 which uses the complete set of HTML named entities.</li> 359 </ul> 360 <p>The default escape mode is <code>base</code>. 361 @return the document's current escape mode 362 */ 363 public Entities.EscapeMode escapeMode() { 364 return escapeMode; 365 } 366 367 /** 368 * Set the document's escape mode, which determines how characters are escaped when the output character set 369 * does not support a given character:- using either a named or a numbered escape. 370 * @param escapeMode the new escape mode to use 371 * @return the document's output settings, for chaining 372 */ 373 public OutputSettings escapeMode(Entities.EscapeMode escapeMode) { 374 this.escapeMode = escapeMode; 375 return this; 376 } 377 378 /** 379 * Get the document's current output charset, which is used to control which characters are escaped when 380 * generating HTML (via the <code>html()</code> methods), and which are kept intact. 381 * <p> 382 * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the 383 * input charset. Otherwise, it defaults to UTF-8. 384 * @return the document's current charset. 385 */ 386 public Charset charset() { 387 return charset; 388 } 389 390 /** 391 * Update the document's output charset. 392 * @param charset the new charset to use. 393 * @return the document's output settings, for chaining 394 */ 395 public OutputSettings charset(Charset charset) { 396 this.charset = charset; 397 return this; 398 } 399 400 /** 401 * Update the document's output charset. 402 * @param charset the new charset (by name) to use. 403 * @return the document's output settings, for chaining 404 */ 405 public OutputSettings charset(String charset) { 406 charset(Charset.forName(charset)); 407 return this; 408 } 409 410 /** 411 * Get the document's current output syntax. 412 * @return current syntax 413 */ 414 public Syntax syntax() { 415 return syntax; 416 } 417 418 /** 419 * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or 420 * {@code xml}, with self-closing tags. 421 * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is 422 * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p> 423 * @param syntax serialization syntax 424 * @return the document's output settings, for chaining 425 */ 426 public OutputSettings syntax(Syntax syntax) { 427 this.syntax = syntax; 428 if (syntax == Syntax.xml) 429 this.escapeMode(Entities.EscapeMode.xhtml); 430 return this; 431 } 432 433 /** 434 * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format 435 * the output, and the output will generally look like the input. 436 * @return if pretty printing is enabled. 437 */ 438 public boolean prettyPrint() { 439 return prettyPrint; 440 } 441 442 /** 443 * Enable or disable pretty printing. 444 * @param pretty new pretty print setting 445 * @return this, for chaining 446 */ 447 public OutputSettings prettyPrint(boolean pretty) { 448 prettyPrint = pretty; 449 return this; 450 } 451 452 /** 453 * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider 454 * all tags as block. 455 * @return if outline mode is enabled. 456 */ 457 public boolean outline() { 458 return outline; 459 } 460 461 /** 462 * Enable or disable HTML outline mode. 463 * @param outlineMode new outline setting 464 * @return this, for chaining 465 */ 466 public OutputSettings outline(boolean outlineMode) { 467 outline = outlineMode; 468 return this; 469 } 470 471 /** 472 * Get the current tag indent amount, used when pretty printing. 473 * @return the current indent amount 474 */ 475 public int indentAmount() { 476 return indentAmount; 477 } 478 479 /** 480 * Set the indent amount for pretty printing 481 * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0. 482 * @return this, for chaining 483 */ 484 public OutputSettings indentAmount(int indentAmount) { 485 Validate.isTrue(indentAmount >= 0); 486 this.indentAmount = indentAmount; 487 return this; 488 } 489 490 /** 491 * Get the current max padding amount, used when pretty printing 492 * so very deeply nested nodes don't get insane padding amounts. 493 * @return the current indent amount 494 */ 495 public int maxPaddingWidth() { 496 return maxPaddingWidth; 497 } 498 499 /** 500 * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts. 501 * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1. 502 * Default is 30 and -1 means unlimited. 503 * @return this, for chaining 504 */ 505 public OutputSettings maxPaddingWidth(int maxPaddingWidth) { 506 Validate.isTrue(maxPaddingWidth >= -1); 507 this.maxPaddingWidth = maxPaddingWidth; 508 return this; 509 } 510 511 @Override 512 public OutputSettings clone() { 513 OutputSettings clone; 514 try { 515 clone = (OutputSettings) super.clone(); 516 } catch (CloneNotSupportedException e) { 517 throw new RuntimeException(e); 518 } 519 clone.charset(charset.name()); // new charset, coreCharset, and charset encoder 520 clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name()); 521 // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle 522 return clone; 523 } 524 } 525 526 /** 527 * Get the document's current output settings. 528 * @return the document's current output settings. 529 */ 530 public OutputSettings outputSettings() { 531 return outputSettings; 532 } 533 534 /** 535 * Set the document's output settings. 536 * @param outputSettings new output settings. 537 * @return this document, for chaining. 538 */ 539 public Document outputSettings(OutputSettings outputSettings) { 540 Validate.notNull(outputSettings); 541 this.outputSettings = outputSettings; 542 return this; 543 } 544 545 public enum QuirksMode { 546 noQuirks, quirks, limitedQuirks 547 } 548 549 public QuirksMode quirksMode() { 550 return quirksMode; 551 } 552 553 public Document quirksMode(QuirksMode quirksMode) { 554 this.quirksMode = quirksMode; 555 return this; 556 } 557 558 /** 559 * Get the parser that was used to parse this document. 560 * @return the parser 561 */ 562 public Parser parser() { 563 return parser; 564 } 565 566 /** 567 * Set the parser used to create this document. This parser is then used when further parsing within this document 568 * is required. 569 * @param parser the configured parser to use when further parsing is required for this document. 570 * @return this document, for chaining. 571 */ 572 public Document parser(Parser parser) { 573 this.parser = parser; 574 return this; 575 } 576 577 /** 578 Set the Connection used to fetch this document. This Connection is used as a session object when further requests are 579 made (e.g. when a form is submitted). 580 581 @param connection to set 582 @return this document, for chaining 583 @see Connection#newRequest() 584 @since 1.14.1 585 */ 586 public Document connection(Connection connection) { 587 Validate.notNull(connection); 588 this.connection = connection; 589 return this; 590 } 591}