001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.Normalizer; 005import org.jsoup.internal.StringUtil; 006import org.jsoup.nodes.Attributes; 007import org.jsoup.nodes.CDataNode; 008import org.jsoup.nodes.Comment; 009import org.jsoup.nodes.DataNode; 010import org.jsoup.nodes.Document; 011import org.jsoup.nodes.Element; 012import org.jsoup.nodes.FormElement; 013import org.jsoup.nodes.Node; 014import org.jsoup.nodes.TextNode; 015import org.jspecify.annotations.Nullable; 016 017import java.io.Reader; 018import java.util.ArrayList; 019import java.util.List; 020 021import static org.jsoup.internal.StringUtil.inSorted; 022import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster; 023import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent; 024import static org.jsoup.parser.Parser.*; 025 026/** 027 * HTML Tree Builder; creates a DOM from Tokens. 028 */ 029public class HtmlTreeBuilder extends TreeBuilder { 030 // tag searches. must be sorted, used in inSorted. HtmlTreeBuilderTest validates they're sorted. 031 static final String[] TagsSearchInScope = new String[]{ // a particular element in scope 032 "applet", "caption", "html", "marquee", "object", "table", "td", "template", "th" 033 }; 034 // math and svg namespaces for particular element in scope 035 static final String[]TagSearchInScopeMath = new String[] { 036 "annotation-xml", "mi", "mn", "mo", "ms", "mtext" 037 }; 038 static final String[]TagSearchInScopeSvg = new String[] { 039 "desc", "foreignObject", "title" 040 }; 041 042 static final String[] TagSearchList = new String[]{"ol", "ul"}; 043 static final String[] TagSearchButton = new String[]{"button"}; 044 static final String[] TagSearchTableScope = new String[]{"html", "table"}; 045 static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"}; 046 static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc"}; 047 static final String[] TagThoroughSearchEndTags = new String[]{"caption", "colgroup", "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"}; 048 static final String[] TagSearchSpecial = new String[]{ 049 "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", "blockquote", "body", "br", 050 "button", "caption", "center", "col", "colgroup", "dd", "details", "dir", "div", "dl", "dt", "embed", 051 "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", 052 "head", "header", "hgroup", "hr", "html", "iframe", "img", "input", "keygen", "li", "link", "listing", "main", 053 "marquee", "menu", "meta", "nav", "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", 054 "pre", "script", "search", "section", "select", "source", "style", "summary", "table", "tbody", "td", 055 "template", "textarea", "tfoot", "th", "thead", "title", "tr", "track", "ul", "wbr", "xmp"}; 056 static String[] TagSearchSpecialMath = {"annotation-xml", "mi", "mn", "mo", "ms", "mtext"}; // differs to MathML text integration point; adds annotation-xml 057 static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"}; 058 static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"}; 059 060 public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages 061 062 private HtmlTreeBuilderState state; // the current state 063 private HtmlTreeBuilderState originalState; // original / marked state 064 065 private boolean baseUriSetFromDoc; 066 private @Nullable Element headElement; // the current head element 067 private @Nullable FormElement formElement; // the current form element 068 private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing 069 ArrayList<Element> formattingElements; // active (open) formatting elements 070 private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes 071 private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out 072 private Token.EndTag emptyEnd; // reused empty end tag 073 074 private boolean framesetOk; // if ok to go into frameset 075 private boolean fosterInserts; // if next inserts should be fostered 076 private boolean fragmentParsing; // if parsing a fragment of html 077 078 @Override ParseSettings defaultSettings() { 079 return ParseSettings.htmlDefault; 080 } 081 082 @Override 083 HtmlTreeBuilder newInstance() { 084 return new HtmlTreeBuilder(); 085 } 086 087 @Override 088 protected void initialiseParse(Reader input, String baseUri, Parser parser) { 089 super.initialiseParse(input, baseUri, parser); 090 091 // this is a bit mucky. todo - probably just create new parser objects to ensure all reset. 092 state = HtmlTreeBuilderState.Initial; 093 originalState = null; 094 baseUriSetFromDoc = false; 095 headElement = null; 096 formElement = null; 097 contextElement = null; 098 formattingElements = new ArrayList<>(); 099 tmplInsertMode = new ArrayList<>(); 100 pendingTableCharacters = new ArrayList<>(); 101 emptyEnd = new Token.EndTag(this); 102 framesetOk = true; 103 fosterInserts = false; 104 fragmentParsing = false; 105 } 106 107 @Override void initialiseParseFragment(@Nullable Element context) { 108 // context may be null 109 state = HtmlTreeBuilderState.Initial; 110 fragmentParsing = true; 111 112 if (context != null) { 113 final String contextName = context.normalName(); 114 contextElement = new Element(tagFor(contextName, contextName, defaultNamespace(), settings), baseUri); 115 if (context.ownerDocument() != null) // quirks setup: 116 doc.quirksMode(context.ownerDocument().quirksMode()); 117 118 // initialise the tokeniser state: 119 switch (contextName) { 120 case "title": 121 case "textarea": 122 tokeniser.transition(TokeniserState.Rcdata); 123 break; 124 case "iframe": 125 case "noembed": 126 case "noframes": 127 case "style": 128 case "xmp": 129 tokeniser.transition(TokeniserState.Rawtext); 130 break; 131 case "script": 132 tokeniser.transition(TokeniserState.ScriptData); 133 break; 134 case "plaintext": 135 tokeniser.transition(TokeniserState.PLAINTEXT); 136 break; 137 case "template": 138 tokeniser.transition(TokeniserState.Data); 139 pushTemplateMode(HtmlTreeBuilderState.InTemplate); 140 break; 141 default: 142 tokeniser.transition(TokeniserState.Data); 143 } 144 doc.appendChild(contextElement); 145 push(contextElement); 146 resetInsertionMode(); 147 148 // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated 149 // with form correctly 150 Element formSearch = context; 151 while (formSearch != null) { 152 if (formSearch instanceof FormElement) { 153 formElement = (FormElement) formSearch; 154 break; 155 } 156 formSearch = formSearch.parent(); 157 } 158 } 159 } 160 161 @Override List<Node> completeParseFragment() { 162 if (contextElement != null) { 163 // depending on context and the input html, content may have been added outside of the root el 164 // e.g. context=p, input=div, the div will have been pushed out. 165 List<Node> nodes = contextElement.siblingNodes(); 166 if (!nodes.isEmpty()) 167 contextElement.insertChildren(-1, nodes); 168 return contextElement.childNodes(); 169 } 170 else 171 return doc.childNodes(); 172 } 173 174 @Override 175 protected boolean process(Token token) { 176 HtmlTreeBuilderState dispatch = useCurrentOrForeignInsert(token) ? this.state : ForeignContent; 177 return dispatch.process(token, this); 178 } 179 180 boolean useCurrentOrForeignInsert(Token token) { 181 // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction 182 // If the stack of open elements is empty 183 if (stack.isEmpty()) 184 return true; 185 final Element el = currentElement(); 186 final String ns = el.tag().namespace(); 187 188 // If the adjusted current node is an element in the HTML namespace 189 if (NamespaceHtml.equals(ns)) 190 return true; 191 192 // If the adjusted current node is a MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark" 193 // If the adjusted current node is a MathML text integration point and the token is a character token 194 if (isMathmlTextIntegration(el)) { 195 if (token.isStartTag() 196 && !"mglyph".equals(token.asStartTag().normalName) 197 && !"malignmark".equals(token.asStartTag().normalName)) 198 return true; 199 if (token.isCharacter()) 200 return true; 201 } 202 // If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg" 203 if (Parser.NamespaceMathml.equals(ns) 204 && el.nameIs("annotation-xml") 205 && token.isStartTag() 206 && "svg".equals(token.asStartTag().normalName)) 207 return true; 208 209 // If the adjusted current node is an HTML integration point and the token is a start tag 210 // If the adjusted current node is an HTML integration point and the token is a character token 211 if (isHtmlIntegration(el) 212 && (token.isStartTag() || token.isCharacter())) 213 return true; 214 215 // If the token is an end-of-file token 216 return token.isEOF(); 217 } 218 219 static boolean isMathmlTextIntegration(Element el) { 220 /* 221 A node is a MathML text integration point if it is one of the following elements: 222 A MathML mi element 223 A MathML mo element 224 A MathML mn element 225 A MathML ms element 226 A MathML mtext element 227 */ 228 return (Parser.NamespaceMathml.equals(el.tag().namespace()) 229 && StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration)); 230 } 231 232 static boolean isHtmlIntegration(Element el) { 233 /* 234 A node is an HTML integration point if it is one of the following elements: 235 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html" 236 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml" 237 An SVG foreignObject element 238 An SVG desc element 239 An SVG title element 240 */ 241 if (Parser.NamespaceMathml.equals(el.tag().namespace()) 242 && el.nameIs("annotation-xml")) { 243 String encoding = Normalizer.normalize(el.attr("encoding")); 244 if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml")) 245 return true; 246 } 247 // note using .tagName for case-sensitive hit here of foreignObject 248 return Parser.NamespaceSvg.equals(el.tag().namespace()) && StringUtil.in(el.tagName(), TagSvgHtmlIntegration); 249 } 250 251 boolean process(Token token, HtmlTreeBuilderState state) { 252 return state.process(token, this); 253 } 254 255 void transition(HtmlTreeBuilderState state) { 256 this.state = state; 257 } 258 259 HtmlTreeBuilderState state() { 260 return state; 261 } 262 263 void markInsertionMode() { 264 originalState = state; 265 } 266 267 HtmlTreeBuilderState originalState() { 268 return originalState; 269 } 270 271 void framesetOk(boolean framesetOk) { 272 this.framesetOk = framesetOk; 273 } 274 275 boolean framesetOk() { 276 return framesetOk; 277 } 278 279 Document getDocument() { 280 return doc; 281 } 282 283 String getBaseUri() { 284 return baseUri; 285 } 286 287 void maybeSetBaseUri(Element base) { 288 if (baseUriSetFromDoc) // only listen to the first <base href> in parse 289 return; 290 291 String href = base.absUrl("href"); 292 if (href.length() != 0) { // ignore <base target> etc 293 baseUri = href; 294 baseUriSetFromDoc = true; 295 doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants 296 } 297 } 298 299 boolean isFragmentParsing() { 300 return fragmentParsing; 301 } 302 303 void error(HtmlTreeBuilderState state) { 304 if (parser.getErrors().canAddError()) 305 parser.getErrors().add(new ParseError(reader, "Unexpected %s token [%s] when in state [%s]", 306 currentToken.tokenType(), currentToken, state)); 307 } 308 309 Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) { 310 // dedupe and normalize the attributes: 311 Attributes attributes = startTag.attributes; 312 if (!forcePreserveCase) 313 attributes = settings.normalizeAttributes(attributes); 314 if (attributes != null && !attributes.isEmpty()) { 315 int dupes = attributes.deduplicate(settings); 316 if (dupes > 0) { 317 error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); 318 } 319 } 320 321 Tag tag = tagFor(startTag.tagName, startTag.normalName, namespace, 322 forcePreserveCase ? ParseSettings.preserveCase : settings); 323 324 return (tag.normalName().equals("form")) ? 325 new FormElement(tag, null, attributes) : 326 new Element(tag, null, attributes); 327 } 328 329 /** Inserts an HTML element for the given tag) */ 330 Element insertElementFor(final Token.StartTag startTag) { 331 Element el = createElementFor(startTag, NamespaceHtml, false); 332 doInsertElement(el, startTag); 333 334 // handle self-closing tags. when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag. 335 if (startTag.isSelfClosing()) { 336 Tag tag = el.tag(); 337 if (tag.isKnownTag()) { 338 if (!tag.isEmpty()) 339 tokeniser.error("Tag [%s] cannot be self closing; not a void tag", tag.normalName()); 340 // else: ok 341 } 342 else { // unknown tag: remember this is self-closing, for output 343 tag.setSelfClosing(); 344 } 345 346 // effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state 347 tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data 348 tokeniser.emit(emptyEnd.reset().name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing 349 } 350 351 return el; 352 } 353 354 /** 355 Inserts a foreign element. Preserves the case of the tag name and of the attributes. 356 */ 357 Element insertForeignElementFor(final Token.StartTag startTag, String namespace) { 358 Element el = createElementFor(startTag, namespace, true); 359 doInsertElement(el, startTag); 360 361 if (startTag.isSelfClosing()) { 362 el.tag().setSelfClosing(); // remember this is self-closing for output 363 pop(); 364 } 365 366 return el; 367 } 368 369 Element insertEmptyElementFor(Token.StartTag startTag) { 370 Element el = createElementFor(startTag, NamespaceHtml, false); 371 doInsertElement(el, startTag); 372 pop(); 373 return el; 374 } 375 376 FormElement insertFormElement(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) { 377 FormElement el = (FormElement) createElementFor(startTag, NamespaceHtml, false); 378 379 if (checkTemplateStack) { 380 if(!onStack("template")) 381 setFormElement(el); 382 } else 383 setFormElement(el); 384 385 doInsertElement(el, startTag); 386 if (!onStack) pop(); 387 return el; 388 } 389 390 /** Inserts the Element onto the stack. All element inserts must run through this method. Performs any general 391 tests on the Element before insertion. 392 * @param el the Element to insert and make the current element 393 * @param token the token this element was parsed from. If null, uses a zero-width current token as intrinsic insert 394 */ 395 private void doInsertElement(Element el, @Nullable Token token) { 396 if (el.tag().isFormListed() && formElement != null) 397 formElement.addElement(el); // connect form controls to their form element 398 399 // in HTML, the xmlns attribute if set must match what the parser set the tag's namespace to 400 if (parser.getErrors().canAddError() && el.hasAttr("xmlns") && !el.attr("xmlns").equals(el.tag().namespace())) 401 error("Invalid xmlns attribute [%s] on tag [%s]", el.attr("xmlns"), el.tagName()); 402 403 if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), InTableFoster)) 404 insertInFosterParent(el); 405 else 406 currentElement().appendChild(el); 407 408 push(el); 409 } 410 411 void insertCommentNode(Token.Comment token) { 412 Comment node = new Comment(token.getData()); 413 currentElement().appendChild(node); 414 onNodeInserted(node); 415 } 416 417 /** Inserts the provided character token into the current element. */ 418 void insertCharacterNode(Token.Character characterToken) { 419 Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack) 420 insertCharacterToElement(characterToken, el); 421 } 422 423 /** Inserts the provided character token into the provided element. */ 424 void insertCharacterToElement(Token.Character characterToken, Element el) { 425 final Node node; 426 final String tagName = el.normalName(); 427 final String data = characterToken.getData(); 428 429 if (characterToken.isCData()) 430 node = new CDataNode(data); 431 else if (isContentForTagData(tagName)) 432 node = new DataNode(data); 433 else 434 node = new TextNode(data); 435 el.appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. 436 onNodeInserted(node); 437 } 438 439 ArrayList<Element> getStack() { 440 return stack; 441 } 442 443 boolean onStack(Element el) { 444 return onStack(stack, el); 445 } 446 447 /** Checks if there is an HTML element with the given name on the stack. */ 448 boolean onStack(String elName) { 449 return getFromStack(elName) != null; 450 } 451 452 private static final int maxQueueDepth = 256; // an arbitrary tension point between real HTML and crafted pain 453 private static boolean onStack(ArrayList<Element> queue, Element element) { 454 final int bottom = queue.size() - 1; 455 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 456 for (int pos = bottom; pos >= upper; pos--) { 457 Element next = queue.get(pos); 458 if (next == element) { 459 return true; 460 } 461 } 462 return false; 463 } 464 465 /** Gets the nearest (lowest) HTML element with the given name from the stack. */ 466 @Nullable 467 Element getFromStack(String elName) { 468 final int bottom = stack.size() - 1; 469 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 470 for (int pos = bottom; pos >= upper; pos--) { 471 Element next = stack.get(pos); 472 if (next.elementIs(elName, NamespaceHtml)) { 473 return next; 474 } 475 } 476 return null; 477 } 478 479 boolean removeFromStack(Element el) { 480 for (int pos = stack.size() -1; pos >= 0; pos--) { 481 Element next = stack.get(pos); 482 if (next == el) { 483 stack.remove(pos); 484 onNodeClosed(el); 485 return true; 486 } 487 } 488 return false; 489 } 490 491 /** Pops the stack until the given HTML element is removed. */ 492 @Nullable 493 Element popStackToClose(String elName) { 494 for (int pos = stack.size() -1; pos >= 0; pos--) { 495 Element el = pop(); 496 if (el.elementIs(elName, NamespaceHtml)) { 497 return el; 498 } 499 } 500 return null; 501 } 502 503 /** Pops the stack until an element with the supplied name is removed, irrespective of namespace. */ 504 @Nullable 505 Element popStackToCloseAnyNamespace(String elName) { 506 for (int pos = stack.size() -1; pos >= 0; pos--) { 507 Element el = pop(); 508 if (el.nameIs(elName)) { 509 return el; 510 } 511 } 512 return null; 513 } 514 515 /** Pops the stack until one of the given HTML elements is removed. */ 516 void popStackToClose(String... elNames) { // elnames is sorted, comes from Constants 517 for (int pos = stack.size() -1; pos >= 0; pos--) { 518 Element el = pop(); 519 if (inSorted(el.normalName(), elNames) && NamespaceHtml.equals(el.tag().namespace())) { 520 break; 521 } 522 } 523 } 524 525 void clearStackToTableContext() { 526 clearStackToContext("table", "template"); 527 } 528 529 void clearStackToTableBodyContext() { 530 clearStackToContext("tbody", "tfoot", "thead", "template"); 531 } 532 533 void clearStackToTableRowContext() { 534 clearStackToContext("tr", "template"); 535 } 536 537 /** Removes elements from the stack until one of the supplied HTML elements is removed. */ 538 private void clearStackToContext(String... nodeNames) { 539 for (int pos = stack.size() -1; pos >= 0; pos--) { 540 Element next = stack.get(pos); 541 if (NamespaceHtml.equals(next.tag().namespace()) && 542 (StringUtil.in(next.normalName(), nodeNames) || next.nameIs("html"))) 543 break; 544 else 545 pop(); 546 } 547 } 548 549 /** 550 Gets the Element immediately above the supplied element on the stack. Which due to adoption, may not necessarily be 551 its parent. 552 553 @param el 554 @return the Element immediately above the supplied element, or null if there is no such element. 555 */ 556 @Nullable Element aboveOnStack(Element el) { 557 assert onStack(el); 558 for (int pos = stack.size() -1; pos >= 0; pos--) { 559 Element next = stack.get(pos); 560 if (next == el) { 561 return stack.get(pos-1); 562 } 563 } 564 return null; 565 } 566 567 void insertOnStackAfter(Element after, Element in) { 568 int i = stack.lastIndexOf(after); 569 Validate.isTrue(i != -1); 570 stack.add(i+1, in); 571 } 572 573 void replaceOnStack(Element out, Element in) { 574 replaceInQueue(stack, out, in); 575 } 576 577 private static void replaceInQueue(ArrayList<Element> queue, Element out, Element in) { 578 int i = queue.lastIndexOf(out); 579 Validate.isTrue(i != -1); 580 queue.set(i, in); 581 } 582 583 /** 584 * Reset the insertion mode, by searching up the stack for an appropriate insertion mode. The stack search depth 585 * is limited to {@link #maxQueueDepth}. 586 * @return true if the insertion mode was actually changed. 587 */ 588 boolean resetInsertionMode() { 589 // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode 590 boolean last = false; 591 final int bottom = stack.size() - 1; 592 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 593 final HtmlTreeBuilderState origState = this.state; 594 595 if (stack.size() == 0) { // nothing left of stack, just get to body 596 transition(HtmlTreeBuilderState.InBody); 597 } 598 599 LOOP: for (int pos = bottom; pos >= upper; pos--) { 600 Element node = stack.get(pos); 601 if (pos == upper) { 602 last = true; 603 if (fragmentParsing) 604 node = contextElement; 605 } 606 String name = node != null ? node.normalName() : ""; 607 if (!NamespaceHtml.equals(node.tag().namespace())) 608 continue; // only looking for HTML elements here 609 610 switch (name) { 611 case "select": 612 transition(HtmlTreeBuilderState.InSelect); 613 // todo - should loop up (with some limit) and check for table or template hits 614 break LOOP; 615 case "td": 616 case "th": 617 if (!last) { 618 transition(HtmlTreeBuilderState.InCell); 619 break LOOP; 620 } 621 break; 622 case "tr": 623 transition(HtmlTreeBuilderState.InRow); 624 break LOOP; 625 case "tbody": 626 case "thead": 627 case "tfoot": 628 transition(HtmlTreeBuilderState.InTableBody); 629 break LOOP; 630 case "caption": 631 transition(HtmlTreeBuilderState.InCaption); 632 break LOOP; 633 case "colgroup": 634 transition(HtmlTreeBuilderState.InColumnGroup); 635 break LOOP; 636 case "table": 637 transition(HtmlTreeBuilderState.InTable); 638 break LOOP; 639 case "template": 640 HtmlTreeBuilderState tmplState = currentTemplateMode(); 641 Validate.notNull(tmplState, "Bug: no template insertion mode on stack!"); 642 transition(tmplState); 643 break LOOP; 644 case "head": 645 if (!last) { 646 transition(HtmlTreeBuilderState.InHead); 647 break LOOP; 648 } 649 break; 650 case "body": 651 transition(HtmlTreeBuilderState.InBody); 652 break LOOP; 653 case "frameset": 654 transition(HtmlTreeBuilderState.InFrameset); 655 break LOOP; 656 case "html": 657 transition(headElement == null ? HtmlTreeBuilderState.BeforeHead : HtmlTreeBuilderState.AfterHead); 658 break LOOP; 659 } 660 if (last) { 661 transition(HtmlTreeBuilderState.InBody); 662 break; 663 } 664 } 665 return state != origState; 666 } 667 668 /** Places the body back onto the stack and moves to InBody, for cases in AfterBody / AfterAfterBody when more content comes */ 669 void resetBody() { 670 if (!onStack("body")) { 671 stack.add(doc.body()); // not onNodeInserted, as already seen 672 } 673 transition(HtmlTreeBuilderState.InBody); 674 } 675 676 // todo: tidy up in specific scope methods 677 private final String[] specificScopeTarget = {null}; 678 679 private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { 680 specificScopeTarget[0] = targetName; 681 return inSpecificScope(specificScopeTarget, baseTypes, extraTypes); 682 } 683 684 private boolean inSpecificScope(String[] targetNames, String[] baseTypes, @Nullable String[] extraTypes) { 685 // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope 686 final int bottom = stack.size() -1; 687 final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; 688 // don't walk too far up the tree 689 for (int pos = bottom; pos >= top; pos--) { 690 Element el = stack.get(pos); 691 String elName = el.normalName(); 692 // namespace checks - arguments provided are always in html ns, with this bolt-on for math and svg: 693 String ns = el.tag().namespace(); 694 if (ns.equals(NamespaceHtml)) { 695 if (inSorted(elName, targetNames)) 696 return true; 697 if (inSorted(elName, baseTypes)) 698 return false; 699 if (extraTypes != null && inSorted(elName, extraTypes)) 700 return false; 701 } else if (baseTypes == TagsSearchInScope) { 702 if (ns.equals(NamespaceMathml) && inSorted(elName, TagSearchInScopeMath)) 703 return false; 704 if (ns.equals(NamespaceSvg) && inSorted(elName, TagSearchInScopeSvg)) 705 return false; 706 } 707 } 708 //Validate.fail("Should not be reachable"); // would end up false because hitting 'html' at root (basetypes) 709 return false; 710 } 711 712 boolean inScope(String[] targetNames) { 713 return inSpecificScope(targetNames, TagsSearchInScope, null); 714 } 715 716 boolean inScope(String targetName) { 717 return inScope(targetName, null); 718 } 719 720 boolean inScope(String targetName, String[] extras) { 721 return inSpecificScope(targetName, TagsSearchInScope, extras); 722 // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml 723 // todo: in svg namespace: forignOjbect, desc, title 724 } 725 726 boolean inListItemScope(String targetName) { 727 return inScope(targetName, TagSearchList); 728 } 729 730 boolean inButtonScope(String targetName) { 731 return inScope(targetName, TagSearchButton); 732 } 733 734 boolean inTableScope(String targetName) { 735 return inSpecificScope(targetName, TagSearchTableScope, null); 736 } 737 738 boolean inSelectScope(String targetName) { 739 for (int pos = stack.size() -1; pos >= 0; pos--) { 740 Element el = stack.get(pos); 741 String elName = el.normalName(); 742 if (elName.equals(targetName)) 743 return true; 744 if (!inSorted(elName, TagSearchSelectScope)) // all elements except 745 return false; 746 } 747 Validate.fail("Should not be reachable"); 748 return false; 749 } 750 751 /** Tests if there is some element on the stack that is not in the provided set. */ 752 boolean onStackNot(String[] allowedTags) { 753 final int bottom = stack.size() -1; 754 final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; 755 // don't walk too far up the tree 756 757 for (int pos = bottom; pos >= top; pos--) { 758 final String elName = stack.get(pos).normalName(); 759 if (!inSorted(elName, allowedTags)) 760 return true; 761 } 762 return false; 763 } 764 765 void setHeadElement(Element headElement) { 766 this.headElement = headElement; 767 } 768 769 Element getHeadElement() { 770 return headElement; 771 } 772 773 boolean isFosterInserts() { 774 return fosterInserts; 775 } 776 777 void setFosterInserts(boolean fosterInserts) { 778 this.fosterInserts = fosterInserts; 779 } 780 781 @Nullable FormElement getFormElement() { 782 return formElement; 783 } 784 785 void setFormElement(FormElement formElement) { 786 this.formElement = formElement; 787 } 788 789 void resetPendingTableCharacters() { 790 pendingTableCharacters.clear(); 791 } 792 793 List<Token.Character> getPendingTableCharacters() { 794 return pendingTableCharacters; 795 } 796 797 void addPendingTableCharacters(Token.Character c) { 798 // make a clone of the token to maintain its state (as Tokens are otherwise reset) 799 Token.Character clone = c.clone(); 800 pendingTableCharacters.add(clone); 801 } 802 803 /** 804 13.2.6.3 Closing elements that have implied end tags 805 When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, or an rtc element, the UA must pop the current node off the stack of open elements. 806 807 If a step requires the UA to generate implied end tags but lists an element to exclude from the process, then the UA must perform the above steps as if that element was not in the above list. 808 809 When the steps below require the UA to generate all implied end tags thoroughly, then, while the current node is a caption element, a colgroup element, a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, an rtc element, a tbody element, a td element, a tfoot element, a th element, a thead element, or a tr element, the UA must pop the current node off the stack of open elements. 810 811 @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the 812 process, then the UA must perform the above steps as if that element was not in the above list. 813 */ 814 void generateImpliedEndTags(String excludeTag) { 815 while (inSorted(currentElement().normalName(), TagSearchEndTags)) { 816 if (excludeTag != null && currentElementIs(excludeTag)) 817 break; 818 pop(); 819 } 820 } 821 822 void generateImpliedEndTags() { 823 generateImpliedEndTags(false); 824 } 825 826 /** 827 Pops HTML elements off the stack according to the implied end tag rules 828 @param thorough if we are thorough (includes table elements etc) or not 829 */ 830 void generateImpliedEndTags(boolean thorough) { 831 final String[] search = thorough ? TagThoroughSearchEndTags : TagSearchEndTags; 832 while (NamespaceHtml.equals(currentElement().tag().namespace()) 833 && inSorted(currentElement().normalName(), search)) { 834 pop(); 835 } 836 } 837 838 void closeElement(String name) { 839 generateImpliedEndTags(name); 840 if (!name.equals(currentElement().normalName())) error(state()); 841 popStackToClose(name); 842 } 843 844 static boolean isSpecial(Element el) { 845 String namespace = el.tag().namespace(); 846 String name = el.normalName(); 847 switch (namespace) { 848 case NamespaceHtml: 849 return inSorted(name, TagSearchSpecial); 850 case Parser.NamespaceMathml: 851 return inSorted(name, TagSearchSpecialMath); 852 case Parser.NamespaceSvg: 853 return inSorted(name, TagSvgHtmlIntegration); 854 default: 855 return false; 856 } 857 } 858 859 Element lastFormattingElement() { 860 return formattingElements.size() > 0 ? formattingElements.get(formattingElements.size()-1) : null; 861 } 862 863 int positionOfElement(Element el){ 864 for (int i = 0; i < formattingElements.size(); i++){ 865 if (el == formattingElements.get(i)) 866 return i; 867 } 868 return -1; 869 } 870 871 Element removeLastFormattingElement() { 872 int size = formattingElements.size(); 873 if (size > 0) 874 return formattingElements.remove(size-1); 875 else 876 return null; 877 } 878 879 // active formatting elements 880 void pushActiveFormattingElements(Element in) { 881 checkActiveFormattingElements(in); 882 formattingElements.add(in); 883 } 884 885 void pushWithBookmark(Element in, int bookmark){ 886 checkActiveFormattingElements(in); 887 // catch any range errors and assume bookmark is incorrect - saves a redundant range check. 888 try { 889 formattingElements.add(bookmark, in); 890 } catch (IndexOutOfBoundsException e) { 891 formattingElements.add(in); 892 } 893 } 894 895 void checkActiveFormattingElements(Element in){ 896 int numSeen = 0; 897 final int size = formattingElements.size() -1; 898 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 899 900 for (int pos = size; pos >= ceil; pos--) { 901 Element el = formattingElements.get(pos); 902 if (el == null) // marker 903 break; 904 905 if (isSameFormattingElement(in, el)) 906 numSeen++; 907 908 if (numSeen == 3) { 909 formattingElements.remove(pos); 910 break; 911 } 912 } 913 } 914 915 private static boolean isSameFormattingElement(Element a, Element b) { 916 // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children 917 return a.normalName().equals(b.normalName()) && 918 // a.namespace().equals(b.namespace()) && 919 a.attributes().equals(b.attributes()); 920 // todo: namespaces 921 } 922 923 void reconstructFormattingElements() { 924 if (stack.size() > maxQueueDepth) 925 return; 926 Element last = lastFormattingElement(); 927 if (last == null || onStack(last)) 928 return; 929 930 Element entry = last; 931 int size = formattingElements.size(); 932 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 933 int pos = size - 1; 934 boolean skip = false; 935 while (true) { 936 if (pos == ceil) { // step 4. if none before, skip to 8 937 skip = true; 938 break; 939 } 940 entry = formattingElements.get(--pos); // step 5. one earlier than entry 941 if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack 942 break; // jump to 8, else continue back to 4 943 } 944 while(true) { 945 if (!skip) // step 7: on later than entry 946 entry = formattingElements.get(++pos); 947 Validate.notNull(entry); // should not occur, as we break at last element 948 949 // 8. create new element from element, 9 insert into current node, onto stack 950 skip = false; // can only skip increment from 4. 951 Element newEl = new Element(tagFor(entry.nodeName(), entry.normalName(), defaultNamespace(), settings), null, entry.attributes().clone()); 952 doInsertElement(newEl, null); 953 954 // 10. replace entry with new entry 955 formattingElements.set(pos, newEl); 956 957 // 11 958 if (pos == size-1) // if not last entry in list, jump to 7 959 break; 960 } 961 } 962 private static final int maxUsedFormattingElements = 12; // limit how many elements get recreated 963 964 void clearFormattingElementsToLastMarker() { 965 while (!formattingElements.isEmpty()) { 966 Element el = removeLastFormattingElement(); 967 if (el == null) 968 break; 969 } 970 } 971 972 void removeFromActiveFormattingElements(Element el) { 973 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 974 Element next = formattingElements.get(pos); 975 if (next == el) { 976 formattingElements.remove(pos); 977 break; 978 } 979 } 980 } 981 982 boolean isInActiveFormattingElements(Element el) { 983 return onStack(formattingElements, el); 984 } 985 986 @Nullable 987 Element getActiveFormattingElement(String nodeName) { 988 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 989 Element next = formattingElements.get(pos); 990 if (next == null) // scope marker 991 break; 992 else if (next.nameIs(nodeName)) 993 return next; 994 } 995 return null; 996 } 997 998 void replaceActiveFormattingElement(Element out, Element in) { 999 replaceInQueue(formattingElements, out, in); 1000 } 1001 1002 void insertMarkerToFormattingElements() { 1003 formattingElements.add(null); 1004 } 1005 1006 void insertInFosterParent(Node in) { 1007 Element fosterParent; 1008 Element lastTable = getFromStack("table"); 1009 boolean isLastTableParent = false; 1010 if (lastTable != null) { 1011 if (lastTable.parent() != null) { 1012 fosterParent = lastTable.parent(); 1013 isLastTableParent = true; 1014 } else 1015 fosterParent = aboveOnStack(lastTable); 1016 } else { // no table == frag 1017 fosterParent = stack.get(0); 1018 } 1019 1020 if (isLastTableParent) { 1021 Validate.notNull(lastTable); // last table cannot be null by this point. 1022 lastTable.before(in); 1023 } 1024 else 1025 fosterParent.appendChild(in); 1026 } 1027 1028 // Template Insertion Mode stack 1029 void pushTemplateMode(HtmlTreeBuilderState state) { 1030 tmplInsertMode.add(state); 1031 } 1032 1033 @Nullable HtmlTreeBuilderState popTemplateMode() { 1034 if (tmplInsertMode.size() > 0) { 1035 return tmplInsertMode.remove(tmplInsertMode.size() -1); 1036 } else { 1037 return null; 1038 } 1039 } 1040 1041 int templateModeSize() { 1042 return tmplInsertMode.size(); 1043 } 1044 1045 @Nullable HtmlTreeBuilderState currentTemplateMode() { 1046 return (tmplInsertMode.size() > 0) ? tmplInsertMode.get(tmplInsertMode.size() -1) : null; 1047 } 1048 1049 @Override 1050 public String toString() { 1051 return "TreeBuilder{" + 1052 "currentToken=" + currentToken + 1053 ", state=" + state + 1054 ", currentElement=" + currentElement() + 1055 '}'; 1056 } 1057 1058 @Override protected boolean isContentForTagData(final String normalName) { 1059 return (normalName.equals("script") || normalName.equals("style")); 1060 } 1061}