001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.Normalizer; 005import org.jsoup.internal.StringUtil; 006import org.jsoup.nodes.Attributes; 007import org.jsoup.nodes.CDataNode; 008import org.jsoup.nodes.Comment; 009import org.jsoup.nodes.DataNode; 010import org.jsoup.nodes.Document; 011import org.jsoup.nodes.Element; 012import org.jsoup.nodes.FormElement; 013import org.jsoup.nodes.Node; 014import org.jsoup.nodes.TextNode; 015import org.jspecify.annotations.Nullable; 016 017import java.io.Reader; 018import java.util.ArrayList; 019import java.util.List; 020 021import static org.jsoup.internal.StringUtil.inSorted; 022import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster; 023import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent; 024import static org.jsoup.parser.Parser.*; 025 026/** 027 * HTML Tree Builder; creates a DOM from Tokens. 028 */ 029public class HtmlTreeBuilder extends TreeBuilder { 030 // tag searches. must be sorted, used in inSorted. HtmlTreeBuilderTest validates they're sorted. 031 static final String[] TagsSearchInScope = new String[]{ // a particular element in scope 032 "applet", "caption", "html", "marquee", "object", "table", "td", "template", "th" 033 }; 034 // math and svg namespaces for particular element in scope 035 static final String[]TagSearchInScopeMath = new String[] { 036 "annotation-xml", "mi", "mn", "mo", "ms", "mtext" 037 }; 038 static final String[]TagSearchInScopeSvg = new String[] { 039 "desc", "foreignObject", "title" 040 }; 041 042 static final String[] TagSearchList = new String[]{"ol", "ul"}; 043 static final String[] TagSearchButton = new String[]{"button"}; 044 static final String[] TagSearchTableScope = new String[]{"html", "table"}; 045 static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"}; 046 static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc"}; 047 static final String[] TagThoroughSearchEndTags = new String[]{"caption", "colgroup", "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"}; 048 static final String[] TagSearchSpecial = new String[]{ 049 "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", "blockquote", "body", "br", 050 "button", "caption", "center", "col", "colgroup", "dd", "details", "dir", "div", "dl", "dt", "embed", 051 "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", 052 "head", "header", "hgroup", "hr", "html", "iframe", "img", "input", "keygen", "li", "link", "listing", "main", 053 "marquee", "menu", "meta", "nav", "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", 054 "pre", "script", "search", "section", "select", "source", "style", "summary", "table", "tbody", "td", 055 "template", "textarea", "tfoot", "th", "thead", "title", "tr", "track", "ul", "wbr", "xmp"}; 056 static String[] TagSearchSpecialMath = {"annotation-xml", "mi", "mn", "mo", "ms", "mtext"}; // differs to MathML text integration point; adds annotation-xml 057 static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"}; 058 static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"}; 059 static final String[] TagFormListed = { 060 "button", "fieldset", "input", "keygen", "object", "output", "select", "textarea" 061 }; 062 063 public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages 064 065 private HtmlTreeBuilderState state; // the current state 066 private HtmlTreeBuilderState originalState; // original / marked state 067 068 private boolean baseUriSetFromDoc; 069 private @Nullable Element headElement; // the current head element 070 private @Nullable FormElement formElement; // the current form element 071 private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing 072 ArrayList<Element> formattingElements; // active (open) formatting elements 073 private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes 074 private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out 075 private Token.EndTag emptyEnd; // reused empty end tag 076 077 private boolean framesetOk; // if ok to go into frameset 078 private boolean fosterInserts; // if next inserts should be fostered 079 private boolean fragmentParsing; // if parsing a fragment of html 080 081 @Override ParseSettings defaultSettings() { 082 return ParseSettings.htmlDefault; 083 } 084 085 @Override 086 HtmlTreeBuilder newInstance() { 087 return new HtmlTreeBuilder(); 088 } 089 090 @Override 091 protected void initialiseParse(Reader input, String baseUri, Parser parser) { 092 super.initialiseParse(input, baseUri, parser); 093 094 // this is a bit mucky. todo - probably just create new parser objects to ensure all reset. 095 state = HtmlTreeBuilderState.Initial; 096 originalState = null; 097 baseUriSetFromDoc = false; 098 headElement = null; 099 formElement = null; 100 contextElement = null; 101 formattingElements = new ArrayList<>(); 102 tmplInsertMode = new ArrayList<>(); 103 pendingTableCharacters = new ArrayList<>(); 104 emptyEnd = new Token.EndTag(this); 105 framesetOk = true; 106 fosterInserts = false; 107 fragmentParsing = false; 108 } 109 110 @Override void initialiseParseFragment(@Nullable Element context) { 111 // context may be null 112 state = HtmlTreeBuilderState.Initial; 113 fragmentParsing = true; 114 115 if (context != null) { 116 final String contextName = context.normalName(); 117 contextElement = new Element(tagFor(contextName, contextName, defaultNamespace(), settings), baseUri); 118 if (context.ownerDocument() != null) // quirks setup: 119 doc.quirksMode(context.ownerDocument().quirksMode()); 120 121 // initialise the tokeniser state: 122 switch (contextName) { 123 case "script": 124 tokeniser.transition(TokeniserState.ScriptData); 125 break; 126 case "plaintext": 127 tokeniser.transition(TokeniserState.PLAINTEXT); 128 break; 129 case "template": 130 tokeniser.transition(TokeniserState.Data); 131 pushTemplateMode(HtmlTreeBuilderState.InTemplate); 132 break; 133 default: 134 Tag tag = contextElement.tag(); 135 TokeniserState textState = tag.textState(); 136 if (textState != null) 137 tokeniser.transition(textState); // style, xmp, title, textarea, etc; or custom 138 else 139 tokeniser.transition(TokeniserState.Data); 140 } 141 doc.appendChild(contextElement); 142 push(contextElement); 143 resetInsertionMode(); 144 145 // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated 146 // with form correctly 147 Element formSearch = context; 148 while (formSearch != null) { 149 if (formSearch instanceof FormElement) { 150 formElement = (FormElement) formSearch; 151 break; 152 } 153 formSearch = formSearch.parent(); 154 } 155 } 156 } 157 158 @Override List<Node> completeParseFragment() { 159 if (contextElement != null) { 160 // depending on context and the input html, content may have been added outside of the root el 161 // e.g. context=p, input=div, the div will have been pushed out. 162 List<Node> nodes = contextElement.siblingNodes(); 163 if (!nodes.isEmpty()) 164 contextElement.insertChildren(-1, nodes); 165 return contextElement.childNodes(); 166 } 167 else 168 return doc.childNodes(); 169 } 170 171 @Override 172 protected boolean process(Token token) { 173 HtmlTreeBuilderState dispatch = useCurrentOrForeignInsert(token) ? this.state : ForeignContent; 174 return dispatch.process(token, this); 175 } 176 177 boolean useCurrentOrForeignInsert(Token token) { 178 // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction 179 // If the stack of open elements is empty 180 if (stack.isEmpty()) 181 return true; 182 final Element el = currentElement(); 183 final String ns = el.tag().namespace(); 184 185 // If the adjusted current node is an element in the HTML namespace 186 if (NamespaceHtml.equals(ns)) 187 return true; 188 189 // If the adjusted current node is a MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark" 190 // If the adjusted current node is a MathML text integration point and the token is a character token 191 if (isMathmlTextIntegration(el)) { 192 if (token.isStartTag() 193 && !"mglyph".equals(token.asStartTag().normalName) 194 && !"malignmark".equals(token.asStartTag().normalName)) 195 return true; 196 if (token.isCharacter()) 197 return true; 198 } 199 // If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg" 200 if (Parser.NamespaceMathml.equals(ns) 201 && el.nameIs("annotation-xml") 202 && token.isStartTag() 203 && "svg".equals(token.asStartTag().normalName)) 204 return true; 205 206 // If the adjusted current node is an HTML integration point and the token is a start tag 207 // If the adjusted current node is an HTML integration point and the token is a character token 208 if (isHtmlIntegration(el) 209 && (token.isStartTag() || token.isCharacter())) 210 return true; 211 212 // If the token is an end-of-file token 213 return token.isEOF(); 214 } 215 216 static boolean isMathmlTextIntegration(Element el) { 217 /* 218 A node is a MathML text integration point if it is one of the following elements: 219 A MathML mi element 220 A MathML mo element 221 A MathML mn element 222 A MathML ms element 223 A MathML mtext element 224 */ 225 return (Parser.NamespaceMathml.equals(el.tag().namespace()) 226 && StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration)); 227 } 228 229 static boolean isHtmlIntegration(Element el) { 230 /* 231 A node is an HTML integration point if it is one of the following elements: 232 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html" 233 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml" 234 An SVG foreignObject element 235 An SVG desc element 236 An SVG title element 237 */ 238 if (Parser.NamespaceMathml.equals(el.tag().namespace()) 239 && el.nameIs("annotation-xml")) { 240 String encoding = Normalizer.normalize(el.attr("encoding")); 241 if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml")) 242 return true; 243 } 244 // note using .tagName for case-sensitive hit here of foreignObject 245 return Parser.NamespaceSvg.equals(el.tag().namespace()) && StringUtil.in(el.tagName(), TagSvgHtmlIntegration); 246 } 247 248 boolean process(Token token, HtmlTreeBuilderState state) { 249 return state.process(token, this); 250 } 251 252 void transition(HtmlTreeBuilderState state) { 253 this.state = state; 254 } 255 256 HtmlTreeBuilderState state() { 257 return state; 258 } 259 260 void markInsertionMode() { 261 originalState = state; 262 } 263 264 HtmlTreeBuilderState originalState() { 265 return originalState; 266 } 267 268 void framesetOk(boolean framesetOk) { 269 this.framesetOk = framesetOk; 270 } 271 272 boolean framesetOk() { 273 return framesetOk; 274 } 275 276 Document getDocument() { 277 return doc; 278 } 279 280 String getBaseUri() { 281 return baseUri; 282 } 283 284 void maybeSetBaseUri(Element base) { 285 if (baseUriSetFromDoc) // only listen to the first <base href> in parse 286 return; 287 288 String href = base.absUrl("href"); 289 if (href.length() != 0) { // ignore <base target> etc 290 baseUri = href; 291 baseUriSetFromDoc = true; 292 doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants 293 } 294 } 295 296 boolean isFragmentParsing() { 297 return fragmentParsing; 298 } 299 300 void error(HtmlTreeBuilderState state) { 301 if (parser.getErrors().canAddError()) 302 parser.getErrors().add(new ParseError(reader, "Unexpected %s token [%s] when in state [%s]", 303 currentToken.tokenType(), currentToken, state)); 304 } 305 306 Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) { 307 // dedupe and normalize the attributes: 308 Attributes attributes = startTag.attributes; 309 if (!forcePreserveCase) 310 attributes = settings.normalizeAttributes(attributes); 311 if (attributes != null && !attributes.isEmpty()) { 312 int dupes = attributes.deduplicate(settings); 313 if (dupes > 0) { 314 error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); 315 } 316 } 317 318 Tag tag = tagFor(startTag.name(), startTag.normalName, namespace, 319 forcePreserveCase ? ParseSettings.preserveCase : settings); 320 321 return (tag.normalName().equals("form")) ? 322 new FormElement(tag, null, attributes) : 323 new Element(tag, null, attributes); 324 } 325 326 /** Inserts an HTML element for the given tag */ 327 Element insertElementFor(final Token.StartTag startTag) { 328 Element el = createElementFor(startTag, NamespaceHtml, false); 329 doInsertElement(el); 330 331 // handle self-closing tags. when the spec expects an empty (void) tag, will directly hit insertEmpty, so won't generate this fake end tag. 332 if (startTag.isSelfClosing()) { 333 Tag tag = el.tag(); 334 tag.setSeenSelfClose(); // can infer output if in xml syntax 335 if (tag.isKnownTag() && (tag.isEmpty() || tag.isSelfClosing())) { 336 // ok, allow it. effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state 337 tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data 338 tokeniser.emit(emptyEnd.reset().name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing 339 } else { 340 // error it, and leave the inserted element on 341 tokeniser.error("Tag [%s] cannot be self-closing; not a void tag", tag.normalName()); 342 } 343 } 344 345 return el; 346 } 347 348 /** 349 Inserts a foreign element. Preserves the case of the tag name and of the attributes. 350 */ 351 Element insertForeignElementFor(final Token.StartTag startTag, String namespace) { 352 Element el = createElementFor(startTag, namespace, true); 353 doInsertElement(el); 354 355 if (startTag.isSelfClosing()) { // foreign els are OK to self-close 356 el.tag().setSeenSelfClose(); // remember this is self-closing for output 357 pop(); 358 } 359 360 return el; 361 } 362 363 Element insertEmptyElementFor(Token.StartTag startTag) { 364 Element el = createElementFor(startTag, NamespaceHtml, false); 365 doInsertElement(el); 366 pop(); 367 return el; 368 } 369 370 FormElement insertFormElement(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) { 371 FormElement el = (FormElement) createElementFor(startTag, NamespaceHtml, false); 372 373 if (checkTemplateStack) { 374 if(!onStack("template")) 375 setFormElement(el); 376 } else 377 setFormElement(el); 378 379 doInsertElement(el); 380 if (!onStack) pop(); 381 return el; 382 } 383 384 /** Inserts the Element onto the stack. All element inserts must run through this method. Performs any general 385 tests on the Element before insertion. 386 * @param el the Element to insert and make the current element 387 */ 388 private void doInsertElement(Element el) { 389 if (formElement != null && el.tag().namespace.equals(NamespaceHtml) && StringUtil.inSorted(el.normalName(), TagFormListed)) 390 formElement.addElement(el); // connect form controls to their form element 391 392 // in HTML, the xmlns attribute if set must match what the parser set the tag's namespace to 393 if (parser.getErrors().canAddError() && el.hasAttr("xmlns") && !el.attr("xmlns").equals(el.tag().namespace())) 394 error("Invalid xmlns attribute [%s] on tag [%s]", el.attr("xmlns"), el.tagName()); 395 396 if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), InTableFoster)) 397 insertInFosterParent(el); 398 else 399 currentElement().appendChild(el); 400 401 push(el); 402 } 403 404 void insertCommentNode(Token.Comment token) { 405 Comment node = new Comment(token.getData()); 406 currentElement().appendChild(node); 407 onNodeInserted(node); 408 } 409 410 /** Inserts the provided character token into the current element. */ 411 void insertCharacterNode(Token.Character characterToken) { 412 Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack) 413 insertCharacterToElement(characterToken, el); 414 } 415 416 /** Inserts the provided character token into the provided element. */ 417 void insertCharacterToElement(Token.Character characterToken, Element el) { 418 final Node node; 419 final String data = characterToken.getData(); 420 421 if (characterToken.isCData()) 422 node = new CDataNode(data); 423 else if (el.tag().is(Tag.Data)) 424 node = new DataNode(data); 425 else 426 node = new TextNode(data); 427 el.appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. 428 onNodeInserted(node); 429 } 430 431 ArrayList<Element> getStack() { 432 return stack; 433 } 434 435 boolean onStack(Element el) { 436 return onStack(stack, el); 437 } 438 439 /** Checks if there is an HTML element with the given name on the stack. */ 440 boolean onStack(String elName) { 441 return getFromStack(elName) != null; 442 } 443 444 private static final int maxQueueDepth = 256; // an arbitrary tension point between real HTML and crafted pain 445 private static boolean onStack(ArrayList<Element> queue, Element element) { 446 final int bottom = queue.size() - 1; 447 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 448 for (int pos = bottom; pos >= upper; pos--) { 449 Element next = queue.get(pos); 450 if (next == element) { 451 return true; 452 } 453 } 454 return false; 455 } 456 457 /** Gets the nearest (lowest) HTML element with the given name from the stack. */ 458 @Nullable 459 Element getFromStack(String elName) { 460 final int bottom = stack.size() - 1; 461 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 462 for (int pos = bottom; pos >= upper; pos--) { 463 Element next = stack.get(pos); 464 if (next.elementIs(elName, NamespaceHtml)) { 465 return next; 466 } 467 } 468 return null; 469 } 470 471 boolean removeFromStack(Element el) { 472 for (int pos = stack.size() -1; pos >= 0; pos--) { 473 Element next = stack.get(pos); 474 if (next == el) { 475 stack.remove(pos); 476 onNodeClosed(el); 477 return true; 478 } 479 } 480 return false; 481 } 482 483 /** Pops the stack until the given HTML element is removed. */ 484 @Nullable 485 Element popStackToClose(String elName) { 486 for (int pos = stack.size() -1; pos >= 0; pos--) { 487 Element el = pop(); 488 if (el.elementIs(elName, NamespaceHtml)) { 489 return el; 490 } 491 } 492 return null; 493 } 494 495 /** Pops the stack until an element with the supplied name is removed, irrespective of namespace. */ 496 @Nullable 497 Element popStackToCloseAnyNamespace(String elName) { 498 for (int pos = stack.size() -1; pos >= 0; pos--) { 499 Element el = pop(); 500 if (el.nameIs(elName)) { 501 return el; 502 } 503 } 504 return null; 505 } 506 507 /** Pops the stack until one of the given HTML elements is removed. */ 508 void popStackToClose(String... elNames) { // elnames is sorted, comes from Constants 509 for (int pos = stack.size() -1; pos >= 0; pos--) { 510 Element el = pop(); 511 if (inSorted(el.normalName(), elNames) && NamespaceHtml.equals(el.tag().namespace())) { 512 break; 513 } 514 } 515 } 516 517 void clearStackToTableContext() { 518 clearStackToContext("table", "template"); 519 } 520 521 void clearStackToTableBodyContext() { 522 clearStackToContext("tbody", "tfoot", "thead", "template"); 523 } 524 525 void clearStackToTableRowContext() { 526 clearStackToContext("tr", "template"); 527 } 528 529 /** Removes elements from the stack until one of the supplied HTML elements is removed. */ 530 private void clearStackToContext(String... nodeNames) { 531 for (int pos = stack.size() -1; pos >= 0; pos--) { 532 Element next = stack.get(pos); 533 if (NamespaceHtml.equals(next.tag().namespace()) && 534 (StringUtil.in(next.normalName(), nodeNames) || next.nameIs("html"))) 535 break; 536 else 537 pop(); 538 } 539 } 540 541 /** 542 Gets the Element immediately above the supplied element on the stack. Which due to adoption, may not necessarily be 543 its parent. 544 545 @param el 546 @return the Element immediately above the supplied element, or null if there is no such element. 547 */ 548 @Nullable Element aboveOnStack(Element el) { 549 assert onStack(el); 550 for (int pos = stack.size() -1; pos >= 0; pos--) { 551 Element next = stack.get(pos); 552 if (next == el) { 553 return stack.get(pos-1); 554 } 555 } 556 return null; 557 } 558 559 void insertOnStackAfter(Element after, Element in) { 560 int i = stack.lastIndexOf(after); 561 Validate.isTrue(i != -1); 562 stack.add(i+1, in); 563 } 564 565 void replaceOnStack(Element out, Element in) { 566 replaceInQueue(stack, out, in); 567 } 568 569 private static void replaceInQueue(ArrayList<Element> queue, Element out, Element in) { 570 int i = queue.lastIndexOf(out); 571 Validate.isTrue(i != -1); 572 queue.set(i, in); 573 } 574 575 /** 576 * Reset the insertion mode, by searching up the stack for an appropriate insertion mode. The stack search depth 577 * is limited to {@link #maxQueueDepth}. 578 * @return true if the insertion mode was actually changed. 579 */ 580 boolean resetInsertionMode() { 581 // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode 582 boolean last = false; 583 final int bottom = stack.size() - 1; 584 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 585 final HtmlTreeBuilderState origState = this.state; 586 587 if (stack.size() == 0) { // nothing left of stack, just get to body 588 transition(HtmlTreeBuilderState.InBody); 589 } 590 591 LOOP: for (int pos = bottom; pos >= upper; pos--) { 592 Element node = stack.get(pos); 593 if (pos == upper) { 594 last = true; 595 if (fragmentParsing) 596 node = contextElement; 597 } 598 String name = node != null ? node.normalName() : ""; 599 if (!NamespaceHtml.equals(node.tag().namespace())) 600 continue; // only looking for HTML elements here 601 602 switch (name) { 603 case "select": 604 transition(HtmlTreeBuilderState.InSelect); 605 // todo - should loop up (with some limit) and check for table or template hits 606 break LOOP; 607 case "td": 608 case "th": 609 if (!last) { 610 transition(HtmlTreeBuilderState.InCell); 611 break LOOP; 612 } 613 break; 614 case "tr": 615 transition(HtmlTreeBuilderState.InRow); 616 break LOOP; 617 case "tbody": 618 case "thead": 619 case "tfoot": 620 transition(HtmlTreeBuilderState.InTableBody); 621 break LOOP; 622 case "caption": 623 transition(HtmlTreeBuilderState.InCaption); 624 break LOOP; 625 case "colgroup": 626 transition(HtmlTreeBuilderState.InColumnGroup); 627 break LOOP; 628 case "table": 629 transition(HtmlTreeBuilderState.InTable); 630 break LOOP; 631 case "template": 632 HtmlTreeBuilderState tmplState = currentTemplateMode(); 633 Validate.notNull(tmplState, "Bug: no template insertion mode on stack!"); 634 transition(tmplState); 635 break LOOP; 636 case "head": 637 if (!last) { 638 transition(HtmlTreeBuilderState.InHead); 639 break LOOP; 640 } 641 break; 642 case "body": 643 transition(HtmlTreeBuilderState.InBody); 644 break LOOP; 645 case "frameset": 646 transition(HtmlTreeBuilderState.InFrameset); 647 break LOOP; 648 case "html": 649 transition(headElement == null ? HtmlTreeBuilderState.BeforeHead : HtmlTreeBuilderState.AfterHead); 650 break LOOP; 651 } 652 if (last) { 653 transition(HtmlTreeBuilderState.InBody); 654 break; 655 } 656 } 657 return state != origState; 658 } 659 660 /** Places the body back onto the stack and moves to InBody, for cases in AfterBody / AfterAfterBody when more content comes */ 661 void resetBody() { 662 if (!onStack("body")) { 663 stack.add(doc.body()); // not onNodeInserted, as already seen 664 } 665 transition(HtmlTreeBuilderState.InBody); 666 } 667 668 // todo: tidy up in specific scope methods 669 private final String[] specificScopeTarget = {null}; 670 671 private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { 672 specificScopeTarget[0] = targetName; 673 return inSpecificScope(specificScopeTarget, baseTypes, extraTypes); 674 } 675 676 private boolean inSpecificScope(String[] targetNames, String[] baseTypes, @Nullable String[] extraTypes) { 677 // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope 678 final int bottom = stack.size() -1; 679 final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; 680 // don't walk too far up the tree 681 for (int pos = bottom; pos >= top; pos--) { 682 Element el = stack.get(pos); 683 String elName = el.normalName(); 684 // namespace checks - arguments provided are always in html ns, with this bolt-on for math and svg: 685 String ns = el.tag().namespace(); 686 if (ns.equals(NamespaceHtml)) { 687 if (inSorted(elName, targetNames)) 688 return true; 689 if (inSorted(elName, baseTypes)) 690 return false; 691 if (extraTypes != null && inSorted(elName, extraTypes)) 692 return false; 693 } else if (baseTypes == TagsSearchInScope) { 694 if (ns.equals(NamespaceMathml) && inSorted(elName, TagSearchInScopeMath)) 695 return false; 696 if (ns.equals(NamespaceSvg) && inSorted(elName, TagSearchInScopeSvg)) 697 return false; 698 } 699 } 700 //Validate.fail("Should not be reachable"); // would end up false because hitting 'html' at root (basetypes) 701 return false; 702 } 703 704 boolean inScope(String[] targetNames) { 705 return inSpecificScope(targetNames, TagsSearchInScope, null); 706 } 707 708 boolean inScope(String targetName) { 709 return inScope(targetName, null); 710 } 711 712 boolean inScope(String targetName, String[] extras) { 713 return inSpecificScope(targetName, TagsSearchInScope, extras); 714 } 715 716 boolean inListItemScope(String targetName) { 717 return inScope(targetName, TagSearchList); 718 } 719 720 boolean inButtonScope(String targetName) { 721 return inScope(targetName, TagSearchButton); 722 } 723 724 boolean inTableScope(String targetName) { 725 return inSpecificScope(targetName, TagSearchTableScope, null); 726 } 727 728 boolean inSelectScope(String targetName) { 729 for (int pos = stack.size() -1; pos >= 0; pos--) { 730 Element el = stack.get(pos); 731 String elName = el.normalName(); 732 if (elName.equals(targetName)) 733 return true; 734 if (!inSorted(elName, TagSearchSelectScope)) // all elements except 735 return false; 736 } 737 Validate.fail("Should not be reachable"); 738 return false; 739 } 740 741 /** Tests if there is some element on the stack that is not in the provided set. */ 742 boolean onStackNot(String[] allowedTags) { 743 final int bottom = stack.size() -1; 744 final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; 745 // don't walk too far up the tree 746 747 for (int pos = bottom; pos >= top; pos--) { 748 final String elName = stack.get(pos).normalName(); 749 if (!inSorted(elName, allowedTags)) 750 return true; 751 } 752 return false; 753 } 754 755 void setHeadElement(Element headElement) { 756 this.headElement = headElement; 757 } 758 759 Element getHeadElement() { 760 return headElement; 761 } 762 763 boolean isFosterInserts() { 764 return fosterInserts; 765 } 766 767 void setFosterInserts(boolean fosterInserts) { 768 this.fosterInserts = fosterInserts; 769 } 770 771 @Nullable FormElement getFormElement() { 772 return formElement; 773 } 774 775 void setFormElement(FormElement formElement) { 776 this.formElement = formElement; 777 } 778 779 void resetPendingTableCharacters() { 780 pendingTableCharacters.clear(); 781 } 782 783 List<Token.Character> getPendingTableCharacters() { 784 return pendingTableCharacters; 785 } 786 787 void addPendingTableCharacters(Token.Character c) { 788 // make a copy of the token to maintain its state (as Tokens are otherwise reset) 789 Token.Character copy = new Token.Character(c); 790 pendingTableCharacters.add(copy); 791 } 792 793 /** 794 13.2.6.3 Closing elements that have implied end tags 795 When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, or an rtc element, the UA must pop the current node off the stack of open elements. 796 797 If a step requires the UA to generate implied end tags but lists an element to exclude from the process, then the UA must perform the above steps as if that element was not in the above list. 798 799 When the steps below require the UA to generate all implied end tags thoroughly, then, while the current node is a caption element, a colgroup element, a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, an rtc element, a tbody element, a td element, a tfoot element, a th element, a thead element, or a tr element, the UA must pop the current node off the stack of open elements. 800 801 @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the 802 process, then the UA must perform the above steps as if that element was not in the above list. 803 */ 804 void generateImpliedEndTags(String excludeTag) { 805 while (inSorted(currentElement().normalName(), TagSearchEndTags)) { 806 if (excludeTag != null && currentElementIs(excludeTag)) 807 break; 808 pop(); 809 } 810 } 811 812 void generateImpliedEndTags() { 813 generateImpliedEndTags(false); 814 } 815 816 /** 817 Pops HTML elements off the stack according to the implied end tag rules 818 @param thorough if we are thorough (includes table elements etc) or not 819 */ 820 void generateImpliedEndTags(boolean thorough) { 821 final String[] search = thorough ? TagThoroughSearchEndTags : TagSearchEndTags; 822 while (NamespaceHtml.equals(currentElement().tag().namespace()) 823 && inSorted(currentElement().normalName(), search)) { 824 pop(); 825 } 826 } 827 828 void closeElement(String name) { 829 generateImpliedEndTags(name); 830 if (!name.equals(currentElement().normalName())) error(state()); 831 popStackToClose(name); 832 } 833 834 static boolean isSpecial(Element el) { 835 String namespace = el.tag().namespace(); 836 String name = el.normalName(); 837 switch (namespace) { 838 case NamespaceHtml: 839 return inSorted(name, TagSearchSpecial); 840 case Parser.NamespaceMathml: 841 return inSorted(name, TagSearchSpecialMath); 842 case Parser.NamespaceSvg: 843 return inSorted(name, TagSvgHtmlIntegration); 844 default: 845 return false; 846 } 847 } 848 849 Element lastFormattingElement() { 850 return formattingElements.size() > 0 ? formattingElements.get(formattingElements.size()-1) : null; 851 } 852 853 int positionOfElement(Element el){ 854 for (int i = 0; i < formattingElements.size(); i++){ 855 if (el == formattingElements.get(i)) 856 return i; 857 } 858 return -1; 859 } 860 861 Element removeLastFormattingElement() { 862 int size = formattingElements.size(); 863 if (size > 0) 864 return formattingElements.remove(size-1); 865 else 866 return null; 867 } 868 869 // active formatting elements 870 void pushActiveFormattingElements(Element in) { 871 checkActiveFormattingElements(in); 872 formattingElements.add(in); 873 } 874 875 void pushWithBookmark(Element in, int bookmark){ 876 checkActiveFormattingElements(in); 877 // catch any range errors and assume bookmark is incorrect - saves a redundant range check. 878 try { 879 formattingElements.add(bookmark, in); 880 } catch (IndexOutOfBoundsException e) { 881 formattingElements.add(in); 882 } 883 } 884 885 void checkActiveFormattingElements(Element in){ 886 int numSeen = 0; 887 final int size = formattingElements.size() -1; 888 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 889 890 for (int pos = size; pos >= ceil; pos--) { 891 Element el = formattingElements.get(pos); 892 if (el == null) // marker 893 break; 894 895 if (isSameFormattingElement(in, el)) 896 numSeen++; 897 898 if (numSeen == 3) { 899 formattingElements.remove(pos); 900 break; 901 } 902 } 903 } 904 905 private static boolean isSameFormattingElement(Element a, Element b) { 906 // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children 907 return a.normalName().equals(b.normalName()) && 908 // a.namespace().equals(b.namespace()) && 909 a.attributes().equals(b.attributes()); 910 // todo: namespaces 911 } 912 913 void reconstructFormattingElements() { 914 if (stack.size() > maxQueueDepth) 915 return; 916 Element last = lastFormattingElement(); 917 if (last == null || onStack(last)) 918 return; 919 920 Element entry = last; 921 int size = formattingElements.size(); 922 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 923 int pos = size - 1; 924 boolean skip = false; 925 while (true) { 926 if (pos == ceil) { // step 4. if none before, skip to 8 927 skip = true; 928 break; 929 } 930 entry = formattingElements.get(--pos); // step 5. one earlier than entry 931 if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack 932 break; // jump to 8, else continue back to 4 933 } 934 while(true) { 935 if (!skip) // step 7: on later than entry 936 entry = formattingElements.get(++pos); 937 Validate.notNull(entry); // should not occur, as we break at last element 938 939 // 8. create new element from element, 9 insert into current node, onto stack 940 skip = false; // can only skip increment from 4. 941 Element newEl = new Element(tagFor(entry.nodeName(), entry.normalName(), defaultNamespace(), settings), null, entry.attributes().clone()); 942 doInsertElement(newEl); 943 944 // 10. replace entry with new entry 945 formattingElements.set(pos, newEl); 946 947 // 11 948 if (pos == size-1) // if not last entry in list, jump to 7 949 break; 950 } 951 } 952 private static final int maxUsedFormattingElements = 12; // limit how many elements get recreated 953 954 void clearFormattingElementsToLastMarker() { 955 while (!formattingElements.isEmpty()) { 956 Element el = removeLastFormattingElement(); 957 if (el == null) 958 break; 959 } 960 } 961 962 void removeFromActiveFormattingElements(Element el) { 963 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 964 Element next = formattingElements.get(pos); 965 if (next == el) { 966 formattingElements.remove(pos); 967 break; 968 } 969 } 970 } 971 972 boolean isInActiveFormattingElements(Element el) { 973 return onStack(formattingElements, el); 974 } 975 976 @Nullable 977 Element getActiveFormattingElement(String nodeName) { 978 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 979 Element next = formattingElements.get(pos); 980 if (next == null) // scope marker 981 break; 982 else if (next.nameIs(nodeName)) 983 return next; 984 } 985 return null; 986 } 987 988 void replaceActiveFormattingElement(Element out, Element in) { 989 replaceInQueue(formattingElements, out, in); 990 } 991 992 void insertMarkerToFormattingElements() { 993 formattingElements.add(null); 994 } 995 996 void insertInFosterParent(Node in) { 997 Element fosterParent; 998 Element lastTable = getFromStack("table"); 999 boolean isLastTableParent = false; 1000 if (lastTable != null) { 1001 if (lastTable.parent() != null) { 1002 fosterParent = lastTable.parent(); 1003 isLastTableParent = true; 1004 } else 1005 fosterParent = aboveOnStack(lastTable); 1006 } else { // no table == frag 1007 fosterParent = stack.get(0); 1008 } 1009 1010 if (isLastTableParent) { 1011 Validate.notNull(lastTable); // last table cannot be null by this point. 1012 lastTable.before(in); 1013 } 1014 else 1015 fosterParent.appendChild(in); 1016 } 1017 1018 // Template Insertion Mode stack 1019 void pushTemplateMode(HtmlTreeBuilderState state) { 1020 tmplInsertMode.add(state); 1021 } 1022 1023 @Nullable HtmlTreeBuilderState popTemplateMode() { 1024 if (tmplInsertMode.size() > 0) { 1025 return tmplInsertMode.remove(tmplInsertMode.size() -1); 1026 } else { 1027 return null; 1028 } 1029 } 1030 1031 int templateModeSize() { 1032 return tmplInsertMode.size(); 1033 } 1034 1035 @Nullable HtmlTreeBuilderState currentTemplateMode() { 1036 return (tmplInsertMode.size() > 0) ? tmplInsertMode.get(tmplInsertMode.size() -1) : null; 1037 } 1038 1039 @Override 1040 public String toString() { 1041 return "TreeBuilder{" + 1042 "currentToken=" + currentToken + 1043 ", state=" + state + 1044 ", currentElement=" + currentElement() + 1045 '}'; 1046 } 1047 1048}