001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.Normalizer; 005import org.jsoup.internal.StringUtil; 006import org.jsoup.nodes.Attributes; 007import org.jsoup.nodes.CDataNode; 008import org.jsoup.nodes.Comment; 009import org.jsoup.nodes.DataNode; 010import org.jsoup.nodes.Document; 011import org.jsoup.nodes.Element; 012import org.jsoup.nodes.FormElement; 013import org.jsoup.nodes.Node; 014import org.jsoup.nodes.TextNode; 015import org.jspecify.annotations.Nullable; 016 017import java.io.Reader; 018import java.util.ArrayList; 019import java.util.List; 020 021import static org.jsoup.internal.StringUtil.inSorted; 022import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster; 023import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent; 024import static org.jsoup.parser.Parser.*; 025 026/** 027 * HTML Tree Builder; creates a DOM from Tokens. 028 */ 029public class HtmlTreeBuilder extends TreeBuilder { 030 // tag searches. must be sorted, used in inSorted. HtmlTreeBuilderTest validates they're sorted. 031 static final String[] TagsSearchInScope = new String[]{ // a particular element in scope 032 "applet", "caption", "html", "marquee", "object", "table", "td", "template", "th" 033 }; 034 // math and svg namespaces for particular element in scope 035 static final String[]TagSearchInScopeMath = new String[] { 036 "annotation-xml", "mi", "mn", "mo", "ms", "mtext" 037 }; 038 static final String[]TagSearchInScopeSvg = new String[] { 039 "desc", "foreignObject", "title" 040 }; 041 042 static final String[] TagSearchList = new String[]{"ol", "ul"}; 043 static final String[] TagSearchButton = new String[]{"button"}; 044 static final String[] TagSearchTableScope = new String[]{"html", "table"}; 045 static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"}; 046 static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc"}; 047 static final String[] TagThoroughSearchEndTags = new String[]{"caption", "colgroup", "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"}; 048 static final String[] TagSearchSpecial = new String[]{ 049 "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", "blockquote", "body", "br", 050 "button", "caption", "center", "col", "colgroup", "dd", "details", "dir", "div", "dl", "dt", "embed", 051 "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", 052 "head", "header", "hgroup", "hr", "html", "iframe", "img", "input", "keygen", "li", "link", "listing", "main", 053 "marquee", "menu", "meta", "nav", "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", 054 "pre", "script", "search", "section", "select", "source", "style", "summary", "table", "tbody", "td", 055 "template", "textarea", "tfoot", "th", "thead", "title", "tr", "track", "ul", "wbr", "xmp"}; 056 static String[] TagSearchSpecialMath = {"annotation-xml", "mi", "mn", "mo", "ms", "mtext"}; // differs to MathML text integration point; adds annotation-xml 057 static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"}; 058 static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"}; 059 static final String[] TagFormListed = { 060 "button", "fieldset", "input", "keygen", "object", "output", "select", "textarea" 061 }; 062 063 public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages 064 065 private HtmlTreeBuilderState state; // the current state 066 private HtmlTreeBuilderState originalState; // original / marked state 067 068 private boolean baseUriSetFromDoc; 069 private @Nullable Element headElement; // the current head element 070 private @Nullable FormElement formElement; // the current form element 071 private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing 072 ArrayList<Element> formattingElements; // active (open) formatting elements 073 private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes 074 private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out 075 private Token.EndTag emptyEnd; // reused empty end tag 076 077 private boolean framesetOk; // if ok to go into frameset 078 private boolean fosterInserts; // if next inserts should be fostered 079 private boolean fragmentParsing; // if parsing a fragment of html 080 081 @Override ParseSettings defaultSettings() { 082 return ParseSettings.htmlDefault; 083 } 084 085 @Override 086 HtmlTreeBuilder newInstance() { 087 return new HtmlTreeBuilder(); 088 } 089 090 @Override 091 protected void initialiseParse(Reader input, String baseUri, Parser parser) { 092 super.initialiseParse(input, baseUri, parser); 093 094 // this is a bit mucky. todo - probably just create new parser objects to ensure all reset. 095 state = HtmlTreeBuilderState.Initial; 096 originalState = null; 097 baseUriSetFromDoc = false; 098 headElement = null; 099 formElement = null; 100 contextElement = null; 101 formattingElements = new ArrayList<>(); 102 tmplInsertMode = new ArrayList<>(); 103 pendingTableCharacters = new ArrayList<>(); 104 emptyEnd = new Token.EndTag(this); 105 framesetOk = true; 106 fosterInserts = false; 107 fragmentParsing = false; 108 } 109 110 @Override void initialiseParseFragment(@Nullable Element context) { 111 // context may be null 112 state = HtmlTreeBuilderState.Initial; 113 fragmentParsing = true; 114 115 if (context != null) { 116 final String contextName = context.normalName(); 117 contextElement = new Element(tagFor(contextName, contextName, defaultNamespace(), settings), baseUri); 118 if (context.ownerDocument() != null) // quirks setup: 119 doc.quirksMode(context.ownerDocument().quirksMode()); 120 121 // initialise the tokeniser state: 122 switch (contextName) { 123 case "script": 124 tokeniser.transition(TokeniserState.ScriptData); 125 break; 126 case "plaintext": 127 tokeniser.transition(TokeniserState.PLAINTEXT); 128 break; 129 case "template": 130 tokeniser.transition(TokeniserState.Data); 131 pushTemplateMode(HtmlTreeBuilderState.InTemplate); 132 break; 133 default: 134 Tag tag = contextElement.tag(); 135 TokeniserState textState = tag.textState(); 136 if (textState != null) 137 tokeniser.transition(textState); // style, xmp, title, textarea, etc; or custom 138 else 139 tokeniser.transition(TokeniserState.Data); 140 } 141 doc.appendChild(contextElement); 142 push(contextElement); 143 resetInsertionMode(); 144 145 // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated 146 // with form correctly 147 Element formSearch = context; 148 while (formSearch != null) { 149 if (formSearch instanceof FormElement) { 150 formElement = (FormElement) formSearch; 151 break; 152 } 153 formSearch = formSearch.parent(); 154 } 155 } 156 } 157 158 @Override List<Node> completeParseFragment() { 159 if (contextElement != null) { 160 // depending on context and the input html, content may have been added outside of the root el 161 // e.g. context=p, input=div, the div will have been pushed out. 162 List<Node> nodes = contextElement.siblingNodes(); 163 if (!nodes.isEmpty()) 164 contextElement.insertChildren(-1, nodes); 165 return contextElement.childNodes(); 166 } 167 else 168 return doc.childNodes(); 169 } 170 171 @Override 172 protected boolean process(Token token) { 173 HtmlTreeBuilderState dispatch = useCurrentOrForeignInsert(token) ? this.state : ForeignContent; 174 return dispatch.process(token, this); 175 } 176 177 boolean useCurrentOrForeignInsert(Token token) { 178 // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction 179 // If the stack of open elements is empty 180 if (stack.isEmpty()) 181 return true; 182 final Element el = currentElement(); 183 final String ns = el.tag().namespace(); 184 185 // If the adjusted current node is an element in the HTML namespace 186 if (NamespaceHtml.equals(ns)) 187 return true; 188 189 // If the adjusted current node is a MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark" 190 // If the adjusted current node is a MathML text integration point and the token is a character token 191 if (isMathmlTextIntegration(el)) { 192 if (token.isStartTag() 193 && !"mglyph".equals(token.asStartTag().normalName) 194 && !"malignmark".equals(token.asStartTag().normalName)) 195 return true; 196 if (token.isCharacter()) 197 return true; 198 } 199 // If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg" 200 if (Parser.NamespaceMathml.equals(ns) 201 && el.nameIs("annotation-xml") 202 && token.isStartTag() 203 && "svg".equals(token.asStartTag().normalName)) 204 return true; 205 206 // If the adjusted current node is an HTML integration point and the token is a start tag 207 // If the adjusted current node is an HTML integration point and the token is a character token 208 if (isHtmlIntegration(el) 209 && (token.isStartTag() || token.isCharacter())) 210 return true; 211 212 // If the token is an end-of-file token 213 return token.isEOF(); 214 } 215 216 static boolean isMathmlTextIntegration(Element el) { 217 /* 218 A node is a MathML text integration point if it is one of the following elements: 219 A MathML mi element 220 A MathML mo element 221 A MathML mn element 222 A MathML ms element 223 A MathML mtext element 224 */ 225 return (Parser.NamespaceMathml.equals(el.tag().namespace()) 226 && StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration)); 227 } 228 229 static boolean isHtmlIntegration(Element el) { 230 /* 231 A node is an HTML integration point if it is one of the following elements: 232 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html" 233 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml" 234 An SVG foreignObject element 235 An SVG desc element 236 An SVG title element 237 */ 238 if (Parser.NamespaceMathml.equals(el.tag().namespace()) 239 && el.nameIs("annotation-xml")) { 240 String encoding = Normalizer.normalize(el.attr("encoding")); 241 if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml")) 242 return true; 243 } 244 // note using .tagName for case-sensitive hit here of foreignObject 245 return Parser.NamespaceSvg.equals(el.tag().namespace()) && StringUtil.in(el.tagName(), TagSvgHtmlIntegration); 246 } 247 248 boolean process(Token token, HtmlTreeBuilderState state) { 249 return state.process(token, this); 250 } 251 252 void transition(HtmlTreeBuilderState state) { 253 this.state = state; 254 } 255 256 HtmlTreeBuilderState state() { 257 return state; 258 } 259 260 void markInsertionMode() { 261 originalState = state; 262 } 263 264 HtmlTreeBuilderState originalState() { 265 return originalState; 266 } 267 268 void framesetOk(boolean framesetOk) { 269 this.framesetOk = framesetOk; 270 } 271 272 boolean framesetOk() { 273 return framesetOk; 274 } 275 276 Document getDocument() { 277 return doc; 278 } 279 280 String getBaseUri() { 281 return baseUri; 282 } 283 284 void maybeSetBaseUri(Element base) { 285 if (baseUriSetFromDoc) // only listen to the first <base href> in parse 286 return; 287 288 String href = base.absUrl("href"); 289 if (href.length() != 0) { // ignore <base target> etc 290 baseUri = href; 291 baseUriSetFromDoc = true; 292 doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants 293 } 294 } 295 296 boolean isFragmentParsing() { 297 return fragmentParsing; 298 } 299 300 void error(HtmlTreeBuilderState state) { 301 if (parser.getErrors().canAddError()) 302 parser.getErrors().add(new ParseError(reader, "Unexpected %s token [%s] when in state [%s]", 303 currentToken.tokenType(), currentToken, state)); 304 } 305 306 Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) { 307 // dedupe and normalize the attributes: 308 Attributes attributes = startTag.attributes; 309 if (!forcePreserveCase) 310 attributes = settings.normalizeAttributes(attributes); 311 if (attributes != null && !attributes.isEmpty()) { 312 int dupes = attributes.deduplicate(settings); 313 if (dupes > 0) { 314 error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); 315 } 316 } 317 318 Tag tag = tagFor(startTag.name(), startTag.normalName, namespace, 319 forcePreserveCase ? ParseSettings.preserveCase : settings); 320 321 return (tag.normalName().equals("form")) ? 322 new FormElement(tag, null, attributes) : 323 new Element(tag, null, attributes); 324 } 325 326 /** Inserts an HTML element for the given tag) */ 327 Element insertElementFor(final Token.StartTag startTag) { 328 Element el = createElementFor(startTag, NamespaceHtml, false); 329 doInsertElement(el); 330 331 // handle self-closing tags. when the spec expects an empty (void) tag, will directly hit insertEmpty, so won't generate this fake end tag. 332 if (startTag.isSelfClosing()) { 333 Tag tag = el.tag(); 334 tag.setSeenSelfClose(); // can infer output if in xml syntax 335 if (tag.isKnownTag() && (tag.isEmpty() || tag.isSelfClosing())) { 336 // ok, allow it. effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state 337 tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data 338 tokeniser.emit(emptyEnd.reset().name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing 339 } else { 340 // error it, and leave the inserted element on 341 tokeniser.error("Tag [%s] cannot be self-closing; not a void tag", tag.normalName()); 342 } 343 } 344 345 return el; 346 } 347 348 /** 349 Inserts a foreign element. Preserves the case of the tag name and of the attributes. 350 */ 351 Element insertForeignElementFor(final Token.StartTag startTag, String namespace) { 352 Element el = createElementFor(startTag, namespace, true); 353 doInsertElement(el); 354 355 if (startTag.isSelfClosing()) { // foreign els are OK to self-close 356 el.tag().setSeenSelfClose(); // remember this is self-closing for output 357 pop(); 358 } 359 360 return el; 361 } 362 363 Element insertEmptyElementFor(Token.StartTag startTag) { 364 Element el = createElementFor(startTag, NamespaceHtml, false); 365 doInsertElement(el); 366 pop(); 367 return el; 368 } 369 370 FormElement insertFormElement(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) { 371 FormElement el = (FormElement) createElementFor(startTag, NamespaceHtml, false); 372 373 if (checkTemplateStack) { 374 if(!onStack("template")) 375 setFormElement(el); 376 } else 377 setFormElement(el); 378 379 doInsertElement(el); 380 if (!onStack) pop(); 381 return el; 382 } 383 384 /** Inserts the Element onto the stack. All element inserts must run through this method. Performs any general 385 tests on the Element before insertion. 386 * @param el the Element to insert and make the current element 387 */ 388 private void doInsertElement(Element el) { 389 if (formElement != null && el.tag().namespace.equals(NamespaceHtml) && StringUtil.inSorted(el.normalName(), TagFormListed)) 390 formElement.addElement(el); // connect form controls to their form element 391 392 // in HTML, the xmlns attribute if set must match what the parser set the tag's namespace to 393 if (parser.getErrors().canAddError() && el.hasAttr("xmlns") && !el.attr("xmlns").equals(el.tag().namespace())) 394 error("Invalid xmlns attribute [%s] on tag [%s]", el.attr("xmlns"), el.tagName()); 395 396 if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), InTableFoster)) 397 insertInFosterParent(el); 398 else 399 currentElement().appendChild(el); 400 401 push(el); 402 } 403 404 void insertCommentNode(Token.Comment token) { 405 Comment node = new Comment(token.getData()); 406 currentElement().appendChild(node); 407 onNodeInserted(node); 408 } 409 410 /** Inserts the provided character token into the current element. */ 411 void insertCharacterNode(Token.Character characterToken) { 412 Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack) 413 insertCharacterToElement(characterToken, el); 414 } 415 416 /** Inserts the provided character token into the provided element. */ 417 void insertCharacterToElement(Token.Character characterToken, Element el) { 418 final Node node; 419 final String data = characterToken.getData(); 420 421 if (characterToken.isCData()) 422 node = new CDataNode(data); 423 else if (el.tag().is(Tag.Data)) 424 node = new DataNode(data); 425 else 426 node = new TextNode(data); 427 el.appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. 428 onNodeInserted(node); 429 } 430 431 ArrayList<Element> getStack() { 432 return stack; 433 } 434 435 boolean onStack(Element el) { 436 return onStack(stack, el); 437 } 438 439 /** Checks if there is an HTML element with the given name on the stack. */ 440 boolean onStack(String elName) { 441 return getFromStack(elName) != null; 442 } 443 444 private static final int maxQueueDepth = 256; // an arbitrary tension point between real HTML and crafted pain 445 private static boolean onStack(ArrayList<Element> queue, Element element) { 446 final int bottom = queue.size() - 1; 447 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 448 for (int pos = bottom; pos >= upper; pos--) { 449 Element next = queue.get(pos); 450 if (next == element) { 451 return true; 452 } 453 } 454 return false; 455 } 456 457 /** Gets the nearest (lowest) HTML element with the given name from the stack. */ 458 @Nullable 459 Element getFromStack(String elName) { 460 final int bottom = stack.size() - 1; 461 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 462 for (int pos = bottom; pos >= upper; pos--) { 463 Element next = stack.get(pos); 464 if (next.elementIs(elName, NamespaceHtml)) { 465 return next; 466 } 467 } 468 return null; 469 } 470 471 boolean removeFromStack(Element el) { 472 for (int pos = stack.size() -1; pos >= 0; pos--) { 473 Element next = stack.get(pos); 474 if (next == el) { 475 stack.remove(pos); 476 onNodeClosed(el); 477 return true; 478 } 479 } 480 return false; 481 } 482 483 /** Pops the stack until the given HTML element is removed. */ 484 @Nullable 485 Element popStackToClose(String elName) { 486 for (int pos = stack.size() -1; pos >= 0; pos--) { 487 Element el = pop(); 488 if (el.elementIs(elName, NamespaceHtml)) { 489 return el; 490 } 491 } 492 return null; 493 } 494 495 /** Pops the stack until an element with the supplied name is removed, irrespective of namespace. */ 496 @Nullable 497 Element popStackToCloseAnyNamespace(String elName) { 498 for (int pos = stack.size() -1; pos >= 0; pos--) { 499 Element el = pop(); 500 if (el.nameIs(elName)) { 501 return el; 502 } 503 } 504 return null; 505 } 506 507 /** Pops the stack until one of the given HTML elements is removed. */ 508 void popStackToClose(String... elNames) { // elnames is sorted, comes from Constants 509 for (int pos = stack.size() -1; pos >= 0; pos--) { 510 Element el = pop(); 511 if (inSorted(el.normalName(), elNames) && NamespaceHtml.equals(el.tag().namespace())) { 512 break; 513 } 514 } 515 } 516 517 void clearStackToTableContext() { 518 clearStackToContext("table", "template"); 519 } 520 521 void clearStackToTableBodyContext() { 522 clearStackToContext("tbody", "tfoot", "thead", "template"); 523 } 524 525 void clearStackToTableRowContext() { 526 clearStackToContext("tr", "template"); 527 } 528 529 /** Removes elements from the stack until one of the supplied HTML elements is removed. */ 530 private void clearStackToContext(String... nodeNames) { 531 for (int pos = stack.size() -1; pos >= 0; pos--) { 532 Element next = stack.get(pos); 533 if (NamespaceHtml.equals(next.tag().namespace()) && 534 (StringUtil.in(next.normalName(), nodeNames) || next.nameIs("html"))) 535 break; 536 else 537 pop(); 538 } 539 } 540 541 /** 542 Gets the Element immediately above the supplied element on the stack. Which due to adoption, may not necessarily be 543 its parent. 544 545 @param el 546 @return the Element immediately above the supplied element, or null if there is no such element. 547 */ 548 @Nullable Element aboveOnStack(Element el) { 549 assert onStack(el); 550 for (int pos = stack.size() -1; pos >= 0; pos--) { 551 Element next = stack.get(pos); 552 if (next == el) { 553 return stack.get(pos-1); 554 } 555 } 556 return null; 557 } 558 559 void insertOnStackAfter(Element after, Element in) { 560 int i = stack.lastIndexOf(after); 561 Validate.isTrue(i != -1); 562 stack.add(i+1, in); 563 } 564 565 void replaceOnStack(Element out, Element in) { 566 replaceInQueue(stack, out, in); 567 } 568 569 private static void replaceInQueue(ArrayList<Element> queue, Element out, Element in) { 570 int i = queue.lastIndexOf(out); 571 Validate.isTrue(i != -1); 572 queue.set(i, in); 573 } 574 575 /** 576 * Reset the insertion mode, by searching up the stack for an appropriate insertion mode. The stack search depth 577 * is limited to {@link #maxQueueDepth}. 578 * @return true if the insertion mode was actually changed. 579 */ 580 boolean resetInsertionMode() { 581 // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode 582 boolean last = false; 583 final int bottom = stack.size() - 1; 584 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 585 final HtmlTreeBuilderState origState = this.state; 586 587 if (stack.size() == 0) { // nothing left of stack, just get to body 588 transition(HtmlTreeBuilderState.InBody); 589 } 590 591 LOOP: for (int pos = bottom; pos >= upper; pos--) { 592 Element node = stack.get(pos); 593 if (pos == upper) { 594 last = true; 595 if (fragmentParsing) 596 node = contextElement; 597 } 598 String name = node != null ? node.normalName() : ""; 599 if (!NamespaceHtml.equals(node.tag().namespace())) 600 continue; // only looking for HTML elements here 601 602 switch (name) { 603 case "select": 604 transition(HtmlTreeBuilderState.InSelect); 605 // todo - should loop up (with some limit) and check for table or template hits 606 break LOOP; 607 case "td": 608 case "th": 609 if (!last) { 610 transition(HtmlTreeBuilderState.InCell); 611 break LOOP; 612 } 613 break; 614 case "tr": 615 transition(HtmlTreeBuilderState.InRow); 616 break LOOP; 617 case "tbody": 618 case "thead": 619 case "tfoot": 620 transition(HtmlTreeBuilderState.InTableBody); 621 break LOOP; 622 case "caption": 623 transition(HtmlTreeBuilderState.InCaption); 624 break LOOP; 625 case "colgroup": 626 transition(HtmlTreeBuilderState.InColumnGroup); 627 break LOOP; 628 case "table": 629 transition(HtmlTreeBuilderState.InTable); 630 break LOOP; 631 case "template": 632 HtmlTreeBuilderState tmplState = currentTemplateMode(); 633 Validate.notNull(tmplState, "Bug: no template insertion mode on stack!"); 634 transition(tmplState); 635 break LOOP; 636 case "head": 637 if (!last) { 638 transition(HtmlTreeBuilderState.InHead); 639 break LOOP; 640 } 641 break; 642 case "body": 643 transition(HtmlTreeBuilderState.InBody); 644 break LOOP; 645 case "frameset": 646 transition(HtmlTreeBuilderState.InFrameset); 647 break LOOP; 648 case "html": 649 transition(headElement == null ? HtmlTreeBuilderState.BeforeHead : HtmlTreeBuilderState.AfterHead); 650 break LOOP; 651 } 652 if (last) { 653 transition(HtmlTreeBuilderState.InBody); 654 break; 655 } 656 } 657 return state != origState; 658 } 659 660 /** Places the body back onto the stack and moves to InBody, for cases in AfterBody / AfterAfterBody when more content comes */ 661 void resetBody() { 662 if (!onStack("body")) { 663 stack.add(doc.body()); // not onNodeInserted, as already seen 664 } 665 transition(HtmlTreeBuilderState.InBody); 666 } 667 668 // todo: tidy up in specific scope methods 669 private final String[] specificScopeTarget = {null}; 670 671 private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { 672 specificScopeTarget[0] = targetName; 673 return inSpecificScope(specificScopeTarget, baseTypes, extraTypes); 674 } 675 676 private boolean inSpecificScope(String[] targetNames, String[] baseTypes, @Nullable String[] extraTypes) { 677 // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope 678 final int bottom = stack.size() -1; 679 final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; 680 // don't walk too far up the tree 681 for (int pos = bottom; pos >= top; pos--) { 682 Element el = stack.get(pos); 683 String elName = el.normalName(); 684 // namespace checks - arguments provided are always in html ns, with this bolt-on for math and svg: 685 String ns = el.tag().namespace(); 686 if (ns.equals(NamespaceHtml)) { 687 if (inSorted(elName, targetNames)) 688 return true; 689 if (inSorted(elName, baseTypes)) 690 return false; 691 if (extraTypes != null && inSorted(elName, extraTypes)) 692 return false; 693 } else if (baseTypes == TagsSearchInScope) { 694 if (ns.equals(NamespaceMathml) && inSorted(elName, TagSearchInScopeMath)) 695 return false; 696 if (ns.equals(NamespaceSvg) && inSorted(elName, TagSearchInScopeSvg)) 697 return false; 698 } 699 } 700 //Validate.fail("Should not be reachable"); // would end up false because hitting 'html' at root (basetypes) 701 return false; 702 } 703 704 boolean inScope(String[] targetNames) { 705 return inSpecificScope(targetNames, TagsSearchInScope, null); 706 } 707 708 boolean inScope(String targetName) { 709 return inScope(targetName, null); 710 } 711 712 boolean inScope(String targetName, String[] extras) { 713 return inSpecificScope(targetName, TagsSearchInScope, extras); 714 // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml 715 // todo: in svg namespace: forignOjbect, desc, title 716 } 717 718 boolean inListItemScope(String targetName) { 719 return inScope(targetName, TagSearchList); 720 } 721 722 boolean inButtonScope(String targetName) { 723 return inScope(targetName, TagSearchButton); 724 } 725 726 boolean inTableScope(String targetName) { 727 return inSpecificScope(targetName, TagSearchTableScope, null); 728 } 729 730 boolean inSelectScope(String targetName) { 731 for (int pos = stack.size() -1; pos >= 0; pos--) { 732 Element el = stack.get(pos); 733 String elName = el.normalName(); 734 if (elName.equals(targetName)) 735 return true; 736 if (!inSorted(elName, TagSearchSelectScope)) // all elements except 737 return false; 738 } 739 Validate.fail("Should not be reachable"); 740 return false; 741 } 742 743 /** Tests if there is some element on the stack that is not in the provided set. */ 744 boolean onStackNot(String[] allowedTags) { 745 final int bottom = stack.size() -1; 746 final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; 747 // don't walk too far up the tree 748 749 for (int pos = bottom; pos >= top; pos--) { 750 final String elName = stack.get(pos).normalName(); 751 if (!inSorted(elName, allowedTags)) 752 return true; 753 } 754 return false; 755 } 756 757 void setHeadElement(Element headElement) { 758 this.headElement = headElement; 759 } 760 761 Element getHeadElement() { 762 return headElement; 763 } 764 765 boolean isFosterInserts() { 766 return fosterInserts; 767 } 768 769 void setFosterInserts(boolean fosterInserts) { 770 this.fosterInserts = fosterInserts; 771 } 772 773 @Nullable FormElement getFormElement() { 774 return formElement; 775 } 776 777 void setFormElement(FormElement formElement) { 778 this.formElement = formElement; 779 } 780 781 void resetPendingTableCharacters() { 782 pendingTableCharacters.clear(); 783 } 784 785 List<Token.Character> getPendingTableCharacters() { 786 return pendingTableCharacters; 787 } 788 789 void addPendingTableCharacters(Token.Character c) { 790 // make a copy of the token to maintain its state (as Tokens are otherwise reset) 791 Token.Character copy = new Token.Character(c); 792 pendingTableCharacters.add(copy); 793 } 794 795 /** 796 13.2.6.3 Closing elements that have implied end tags 797 When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, or an rtc element, the UA must pop the current node off the stack of open elements. 798 799 If a step requires the UA to generate implied end tags but lists an element to exclude from the process, then the UA must perform the above steps as if that element was not in the above list. 800 801 When the steps below require the UA to generate all implied end tags thoroughly, then, while the current node is a caption element, a colgroup element, a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, an rtc element, a tbody element, a td element, a tfoot element, a th element, a thead element, or a tr element, the UA must pop the current node off the stack of open elements. 802 803 @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the 804 process, then the UA must perform the above steps as if that element was not in the above list. 805 */ 806 void generateImpliedEndTags(String excludeTag) { 807 while (inSorted(currentElement().normalName(), TagSearchEndTags)) { 808 if (excludeTag != null && currentElementIs(excludeTag)) 809 break; 810 pop(); 811 } 812 } 813 814 void generateImpliedEndTags() { 815 generateImpliedEndTags(false); 816 } 817 818 /** 819 Pops HTML elements off the stack according to the implied end tag rules 820 @param thorough if we are thorough (includes table elements etc) or not 821 */ 822 void generateImpliedEndTags(boolean thorough) { 823 final String[] search = thorough ? TagThoroughSearchEndTags : TagSearchEndTags; 824 while (NamespaceHtml.equals(currentElement().tag().namespace()) 825 && inSorted(currentElement().normalName(), search)) { 826 pop(); 827 } 828 } 829 830 void closeElement(String name) { 831 generateImpliedEndTags(name); 832 if (!name.equals(currentElement().normalName())) error(state()); 833 popStackToClose(name); 834 } 835 836 static boolean isSpecial(Element el) { 837 String namespace = el.tag().namespace(); 838 String name = el.normalName(); 839 switch (namespace) { 840 case NamespaceHtml: 841 return inSorted(name, TagSearchSpecial); 842 case Parser.NamespaceMathml: 843 return inSorted(name, TagSearchSpecialMath); 844 case Parser.NamespaceSvg: 845 return inSorted(name, TagSvgHtmlIntegration); 846 default: 847 return false; 848 } 849 } 850 851 Element lastFormattingElement() { 852 return formattingElements.size() > 0 ? formattingElements.get(formattingElements.size()-1) : null; 853 } 854 855 int positionOfElement(Element el){ 856 for (int i = 0; i < formattingElements.size(); i++){ 857 if (el == formattingElements.get(i)) 858 return i; 859 } 860 return -1; 861 } 862 863 Element removeLastFormattingElement() { 864 int size = formattingElements.size(); 865 if (size > 0) 866 return formattingElements.remove(size-1); 867 else 868 return null; 869 } 870 871 // active formatting elements 872 void pushActiveFormattingElements(Element in) { 873 checkActiveFormattingElements(in); 874 formattingElements.add(in); 875 } 876 877 void pushWithBookmark(Element in, int bookmark){ 878 checkActiveFormattingElements(in); 879 // catch any range errors and assume bookmark is incorrect - saves a redundant range check. 880 try { 881 formattingElements.add(bookmark, in); 882 } catch (IndexOutOfBoundsException e) { 883 formattingElements.add(in); 884 } 885 } 886 887 void checkActiveFormattingElements(Element in){ 888 int numSeen = 0; 889 final int size = formattingElements.size() -1; 890 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 891 892 for (int pos = size; pos >= ceil; pos--) { 893 Element el = formattingElements.get(pos); 894 if (el == null) // marker 895 break; 896 897 if (isSameFormattingElement(in, el)) 898 numSeen++; 899 900 if (numSeen == 3) { 901 formattingElements.remove(pos); 902 break; 903 } 904 } 905 } 906 907 private static boolean isSameFormattingElement(Element a, Element b) { 908 // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children 909 return a.normalName().equals(b.normalName()) && 910 // a.namespace().equals(b.namespace()) && 911 a.attributes().equals(b.attributes()); 912 // todo: namespaces 913 } 914 915 void reconstructFormattingElements() { 916 if (stack.size() > maxQueueDepth) 917 return; 918 Element last = lastFormattingElement(); 919 if (last == null || onStack(last)) 920 return; 921 922 Element entry = last; 923 int size = formattingElements.size(); 924 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 925 int pos = size - 1; 926 boolean skip = false; 927 while (true) { 928 if (pos == ceil) { // step 4. if none before, skip to 8 929 skip = true; 930 break; 931 } 932 entry = formattingElements.get(--pos); // step 5. one earlier than entry 933 if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack 934 break; // jump to 8, else continue back to 4 935 } 936 while(true) { 937 if (!skip) // step 7: on later than entry 938 entry = formattingElements.get(++pos); 939 Validate.notNull(entry); // should not occur, as we break at last element 940 941 // 8. create new element from element, 9 insert into current node, onto stack 942 skip = false; // can only skip increment from 4. 943 Element newEl = new Element(tagFor(entry.nodeName(), entry.normalName(), defaultNamespace(), settings), null, entry.attributes().clone()); 944 doInsertElement(newEl); 945 946 // 10. replace entry with new entry 947 formattingElements.set(pos, newEl); 948 949 // 11 950 if (pos == size-1) // if not last entry in list, jump to 7 951 break; 952 } 953 } 954 private static final int maxUsedFormattingElements = 12; // limit how many elements get recreated 955 956 void clearFormattingElementsToLastMarker() { 957 while (!formattingElements.isEmpty()) { 958 Element el = removeLastFormattingElement(); 959 if (el == null) 960 break; 961 } 962 } 963 964 void removeFromActiveFormattingElements(Element el) { 965 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 966 Element next = formattingElements.get(pos); 967 if (next == el) { 968 formattingElements.remove(pos); 969 break; 970 } 971 } 972 } 973 974 boolean isInActiveFormattingElements(Element el) { 975 return onStack(formattingElements, el); 976 } 977 978 @Nullable 979 Element getActiveFormattingElement(String nodeName) { 980 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 981 Element next = formattingElements.get(pos); 982 if (next == null) // scope marker 983 break; 984 else if (next.nameIs(nodeName)) 985 return next; 986 } 987 return null; 988 } 989 990 void replaceActiveFormattingElement(Element out, Element in) { 991 replaceInQueue(formattingElements, out, in); 992 } 993 994 void insertMarkerToFormattingElements() { 995 formattingElements.add(null); 996 } 997 998 void insertInFosterParent(Node in) { 999 Element fosterParent; 1000 Element lastTable = getFromStack("table"); 1001 boolean isLastTableParent = false; 1002 if (lastTable != null) { 1003 if (lastTable.parent() != null) { 1004 fosterParent = lastTable.parent(); 1005 isLastTableParent = true; 1006 } else 1007 fosterParent = aboveOnStack(lastTable); 1008 } else { // no table == frag 1009 fosterParent = stack.get(0); 1010 } 1011 1012 if (isLastTableParent) { 1013 Validate.notNull(lastTable); // last table cannot be null by this point. 1014 lastTable.before(in); 1015 } 1016 else 1017 fosterParent.appendChild(in); 1018 } 1019 1020 // Template Insertion Mode stack 1021 void pushTemplateMode(HtmlTreeBuilderState state) { 1022 tmplInsertMode.add(state); 1023 } 1024 1025 @Nullable HtmlTreeBuilderState popTemplateMode() { 1026 if (tmplInsertMode.size() > 0) { 1027 return tmplInsertMode.remove(tmplInsertMode.size() -1); 1028 } else { 1029 return null; 1030 } 1031 } 1032 1033 int templateModeSize() { 1034 return tmplInsertMode.size(); 1035 } 1036 1037 @Nullable HtmlTreeBuilderState currentTemplateMode() { 1038 return (tmplInsertMode.size() > 0) ? tmplInsertMode.get(tmplInsertMode.size() -1) : null; 1039 } 1040 1041 @Override 1042 public String toString() { 1043 return "TreeBuilder{" + 1044 "currentToken=" + currentToken + 1045 ", state=" + state + 1046 ", currentElement=" + currentElement() + 1047 '}'; 1048 } 1049 1050 /** @deprecated this unused internal method will be removed. */ 1051 @Deprecated 1052 protected boolean isContentForTagData(final String normalName) { 1053 return (normalName.equals("script") || normalName.equals("style")); 1054 } 1055 1056}