001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.Normalizer; 005import org.jsoup.internal.StringUtil; 006import org.jsoup.nodes.Attributes; 007import org.jsoup.nodes.CDataNode; 008import org.jsoup.nodes.Comment; 009import org.jsoup.nodes.DataNode; 010import org.jsoup.nodes.Document; 011import org.jsoup.nodes.Element; 012import org.jsoup.nodes.FormElement; 013import org.jsoup.nodes.Node; 014import org.jsoup.nodes.TextNode; 015import org.jspecify.annotations.Nullable; 016 017import java.io.Reader; 018import java.io.StringReader; 019import java.util.ArrayList; 020import java.util.List; 021 022import static org.jsoup.internal.StringUtil.inSorted; 023import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster; 024import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent; 025import static org.jsoup.parser.Parser.NamespaceHtml; 026 027/** 028 * HTML Tree Builder; creates a DOM from Tokens. 029 */ 030public class HtmlTreeBuilder extends TreeBuilder { 031 // tag searches. must be sorted, used in inSorted. HtmlTreeBuilderTest validates they're sorted. 032 static final String[] TagsSearchInScope = new String[]{"applet", "caption", "html", "marquee", "object", "table", "td", "th"}; 033 static final String[] TagSearchList = new String[]{"ol", "ul"}; 034 static final String[] TagSearchButton = new String[]{"button"}; 035 static final String[] TagSearchTableScope = new String[]{"html", "table"}; 036 static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"}; 037 static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc"}; 038 static final String[] TagThoroughSearchEndTags = new String[]{"caption", "colgroup", "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"}; 039 static final String[] TagSearchSpecial = new String[]{"address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", 040 "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd", 041 "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form", 042 "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", 043 "iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav", 044 "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script", 045 "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", 046 "title", "tr", "ul", "wbr", "xmp"}; 047 static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"}; 048 static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"}; 049 050 public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages 051 052 private HtmlTreeBuilderState state; // the current state 053 private HtmlTreeBuilderState originalState; // original / marked state 054 055 private boolean baseUriSetFromDoc; 056 private @Nullable Element headElement; // the current head element 057 private @Nullable FormElement formElement; // the current form element 058 private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing 059 private ArrayList<Element> formattingElements; // active (open) formatting elements 060 private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes 061 private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out 062 private Token.EndTag emptyEnd; // reused empty end tag 063 064 private boolean framesetOk; // if ok to go into frameset 065 private boolean fosterInserts; // if next inserts should be fostered 066 private boolean fragmentParsing; // if parsing a fragment of html 067 068 @Override ParseSettings defaultSettings() { 069 return ParseSettings.htmlDefault; 070 } 071 072 @Override 073 HtmlTreeBuilder newInstance() { 074 return new HtmlTreeBuilder(); 075 } 076 077 @Override 078 protected void initialiseParse(Reader input, String baseUri, Parser parser) { 079 super.initialiseParse(input, baseUri, parser); 080 081 // this is a bit mucky. todo - probably just create new parser objects to ensure all reset. 082 state = HtmlTreeBuilderState.Initial; 083 originalState = null; 084 baseUriSetFromDoc = false; 085 headElement = null; 086 formElement = null; 087 contextElement = null; 088 formattingElements = new ArrayList<>(); 089 tmplInsertMode = new ArrayList<>(); 090 pendingTableCharacters = new ArrayList<>(); 091 emptyEnd = new Token.EndTag(this); 092 framesetOk = true; 093 fosterInserts = false; 094 fragmentParsing = false; 095 } 096 097 @Override void initialiseParseFragment(@Nullable Element context) { 098 // context may be null 099 state = HtmlTreeBuilderState.Initial; 100 fragmentParsing = true; 101 102 if (context != null) { 103 final String contextName = context.normalName(); 104 contextElement = new Element(tagFor(contextName, settings), baseUri); 105 if (context.ownerDocument() != null) // quirks setup: 106 doc.quirksMode(context.ownerDocument().quirksMode()); 107 108 // initialise the tokeniser state: 109 switch (contextName) { 110 case "title": 111 case "textarea": 112 tokeniser.transition(TokeniserState.Rcdata); 113 break; 114 case "iframe": 115 case "noembed": 116 case "noframes": 117 case "style": 118 case "xmp": 119 tokeniser.transition(TokeniserState.Rawtext); 120 break; 121 case "script": 122 tokeniser.transition(TokeniserState.ScriptData); 123 break; 124 case "plaintext": 125 tokeniser.transition(TokeniserState.PLAINTEXT); 126 break; 127 case "template": 128 tokeniser.transition(TokeniserState.Data); 129 pushTemplateMode(HtmlTreeBuilderState.InTemplate); 130 break; 131 default: 132 tokeniser.transition(TokeniserState.Data); 133 } 134 doc.appendChild(contextElement); 135 push(contextElement); 136 resetInsertionMode(); 137 138 // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated 139 // with form correctly 140 Element formSearch = context; 141 while (formSearch != null) { 142 if (formSearch instanceof FormElement) { 143 formElement = (FormElement) formSearch; 144 break; 145 } 146 formSearch = formSearch.parent(); 147 } 148 } 149 } 150 151 @Override List<Node> completeParseFragment() { 152 if (contextElement != null) { 153 // depending on context and the input html, content may have been added outside of the root el 154 // e.g. context=p, input=div, the div will have been pushed out. 155 List<Node> nodes = contextElement.siblingNodes(); 156 if (!nodes.isEmpty()) 157 contextElement.insertChildren(-1, nodes); 158 return contextElement.childNodes(); 159 } 160 else 161 return doc.childNodes(); 162 } 163 164 @Override 165 protected boolean process(Token token) { 166 HtmlTreeBuilderState dispatch = useCurrentOrForeignInsert(token) ? this.state : ForeignContent; 167 return dispatch.process(token, this); 168 } 169 170 boolean useCurrentOrForeignInsert(Token token) { 171 // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction 172 // If the stack of open elements is empty 173 if (stack.isEmpty()) 174 return true; 175 final Element el = currentElement(); 176 final String ns = el.tag().namespace(); 177 178 // If the adjusted current node is an element in the HTML namespace 179 if (NamespaceHtml.equals(ns)) 180 return true; 181 182 // If the adjusted current node is a MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark" 183 // If the adjusted current node is a MathML text integration point and the token is a character token 184 if (isMathmlTextIntegration(el)) { 185 if (token.isStartTag() 186 && !"mglyph".equals(token.asStartTag().normalName) 187 && !"malignmark".equals(token.asStartTag().normalName)) 188 return true; 189 if (token.isCharacter()) 190 return true; 191 } 192 // If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg" 193 if (Parser.NamespaceMathml.equals(ns) 194 && el.nameIs("annotation-xml") 195 && token.isStartTag() 196 && "svg".equals(token.asStartTag().normalName)) 197 return true; 198 199 // If the adjusted current node is an HTML integration point and the token is a start tag 200 // If the adjusted current node is an HTML integration point and the token is a character token 201 if (isHtmlIntegration(el) 202 && (token.isStartTag() || token.isCharacter())) 203 return true; 204 205 // If the token is an end-of-file token 206 return token.isEOF(); 207 } 208 209 static boolean isMathmlTextIntegration(Element el) { 210 /* 211 A node is a MathML text integration point if it is one of the following elements: 212 A MathML mi element 213 A MathML mo element 214 A MathML mn element 215 A MathML ms element 216 A MathML mtext element 217 */ 218 return (Parser.NamespaceMathml.equals(el.tag().namespace()) 219 && StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration)); 220 } 221 222 static boolean isHtmlIntegration(Element el) { 223 /* 224 A node is an HTML integration point if it is one of the following elements: 225 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html" 226 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml" 227 An SVG foreignObject element 228 An SVG desc element 229 An SVG title element 230 */ 231 if (Parser.NamespaceMathml.equals(el.tag().namespace()) 232 && el.nameIs("annotation-xml")) { 233 String encoding = Normalizer.normalize(el.attr("encoding")); 234 if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml")) 235 return true; 236 } 237 if (Parser.NamespaceSvg.equals(el.tag().namespace()) 238 && StringUtil.in(el.tagName(), TagSvgHtmlIntegration)) // note using .tagName for case-sensitive hit here of foreignObject 239 return true; 240 241 return false; 242 } 243 244 boolean process(Token token, HtmlTreeBuilderState state) { 245 return state.process(token, this); 246 } 247 248 void transition(HtmlTreeBuilderState state) { 249 this.state = state; 250 } 251 252 HtmlTreeBuilderState state() { 253 return state; 254 } 255 256 void markInsertionMode() { 257 originalState = state; 258 } 259 260 HtmlTreeBuilderState originalState() { 261 return originalState; 262 } 263 264 void framesetOk(boolean framesetOk) { 265 this.framesetOk = framesetOk; 266 } 267 268 boolean framesetOk() { 269 return framesetOk; 270 } 271 272 Document getDocument() { 273 return doc; 274 } 275 276 String getBaseUri() { 277 return baseUri; 278 } 279 280 void maybeSetBaseUri(Element base) { 281 if (baseUriSetFromDoc) // only listen to the first <base href> in parse 282 return; 283 284 String href = base.absUrl("href"); 285 if (href.length() != 0) { // ignore <base target> etc 286 baseUri = href; 287 baseUriSetFromDoc = true; 288 doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants 289 } 290 } 291 292 boolean isFragmentParsing() { 293 return fragmentParsing; 294 } 295 296 void error(HtmlTreeBuilderState state) { 297 if (parser.getErrors().canAddError()) 298 parser.getErrors().add(new ParseError(reader, "Unexpected %s token [%s] when in state [%s]", 299 currentToken.tokenType(), currentToken, state)); 300 } 301 302 Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) { 303 // dedupe and normalize the attributes: 304 Attributes attributes = startTag.attributes; 305 if (!forcePreserveCase) 306 attributes = settings.normalizeAttributes(attributes); 307 if (attributes != null && !attributes.isEmpty()) { 308 int dupes = attributes.deduplicate(settings); 309 if (dupes > 0) { 310 error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); 311 } 312 } 313 314 Tag tag = tagFor(startTag.tagName, namespace, 315 forcePreserveCase ? ParseSettings.preserveCase : settings); 316 317 return (tag.normalName().equals("form")) ? 318 new FormElement(tag, null, attributes) : 319 new Element(tag, null, attributes); 320 } 321 322 /** Inserts an HTML element for the given tag) */ 323 Element insertElementFor(final Token.StartTag startTag) { 324 Element el = createElementFor(startTag, NamespaceHtml, false); 325 doInsertElement(el, startTag); 326 327 // handle self-closing tags. when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag. 328 if (startTag.isSelfClosing()) { 329 Tag tag = el.tag(); 330 if (tag.isKnownTag()) { 331 if (!tag.isEmpty()) 332 tokeniser.error("Tag [%s] cannot be self closing; not a void tag", tag.normalName()); 333 // else: ok 334 } 335 else { // unknown tag: remember this is self-closing, for output 336 tag.setSelfClosing(); 337 } 338 339 // effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state 340 tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data 341 tokeniser.emit(emptyEnd.reset().name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing 342 } 343 344 return el; 345 } 346 347 /** 348 Inserts a foreign element. Preserves the case of the tag name and of the attributes. 349 */ 350 Element insertForeignElementFor(final Token.StartTag startTag, String namespace) { 351 Element el = createElementFor(startTag, namespace, true); 352 doInsertElement(el, startTag); 353 354 if (startTag.isSelfClosing()) { 355 el.tag().setSelfClosing(); // remember this is self-closing for output 356 pop(); 357 } 358 359 return el; 360 } 361 362 Element insertEmptyElementFor(Token.StartTag startTag) { 363 Element el = createElementFor(startTag, NamespaceHtml, false); 364 doInsertElement(el, startTag); 365 pop(); 366 return el; 367 } 368 369 FormElement insertFormElement(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) { 370 FormElement el = (FormElement) createElementFor(startTag, NamespaceHtml, false); 371 372 if (checkTemplateStack) { 373 if(!onStack("template")) 374 setFormElement(el); 375 } else 376 setFormElement(el); 377 378 doInsertElement(el, startTag); 379 if (!onStack) pop(); 380 return el; 381 } 382 383 /** Inserts the Element onto the stack. All element inserts must run through this method. Performs any general 384 tests on the Element before insertion. 385 * @param el the Element to insert and make the current element 386 * @param token the token this element was parsed from. If null, uses a zero-width current token as intrinsic insert 387 */ 388 private void doInsertElement(Element el, @Nullable Token token) { 389 if (el.tag().isFormListed() && formElement != null) 390 formElement.addElement(el); // connect form controls to their form element 391 392 // in HTML, the xmlns attribute if set must match what the parser set the tag's namespace to 393 if (parser.getErrors().canAddError() && el.hasAttr("xmlns") && !el.attr("xmlns").equals(el.tag().namespace())) 394 error("Invalid xmlns attribute [%s] on tag [%s]", el.attr("xmlns"), el.tagName()); 395 396 if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), InTableFoster)) 397 insertInFosterParent(el); 398 else 399 currentElement().appendChild(el); 400 401 push(el); 402 } 403 404 void insertCommentNode(Token.Comment token) { 405 Comment node = new Comment(token.getData()); 406 currentElement().appendChild(node); 407 onNodeInserted(node); 408 } 409 410 /** Inserts the provided character token into the current element. */ 411 void insertCharacterNode(Token.Character characterToken) { 412 Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack) 413 insertCharacterToElement(characterToken, el); 414 } 415 416 /** Inserts the provided character token into the provided element. */ 417 void insertCharacterToElement(Token.Character characterToken, Element el) { 418 final Node node; 419 final String tagName = el.normalName(); 420 final String data = characterToken.getData(); 421 422 if (characterToken.isCData()) 423 node = new CDataNode(data); 424 else if (isContentForTagData(tagName)) 425 node = new DataNode(data); 426 else 427 node = new TextNode(data); 428 el.appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. 429 onNodeInserted(node); 430 } 431 432 ArrayList<Element> getStack() { 433 return stack; 434 } 435 436 boolean onStack(Element el) { 437 return onStack(stack, el); 438 } 439 440 /** Checks if there is an HTML element with the given name on the stack. */ 441 boolean onStack(String elName) { 442 return getFromStack(elName) != null; 443 } 444 445 private static final int maxQueueDepth = 256; // an arbitrary tension point between real HTML and crafted pain 446 private static boolean onStack(ArrayList<Element> queue, Element element) { 447 final int bottom = queue.size() - 1; 448 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 449 for (int pos = bottom; pos >= upper; pos--) { 450 Element next = queue.get(pos); 451 if (next == element) { 452 return true; 453 } 454 } 455 return false; 456 } 457 458 /** Gets the nearest (lowest) HTML element with the given name from the stack. */ 459 @Nullable 460 Element getFromStack(String elName) { 461 final int bottom = stack.size() - 1; 462 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 463 for (int pos = bottom; pos >= upper; pos--) { 464 Element next = stack.get(pos); 465 if (next.elementIs(elName, NamespaceHtml)) { 466 return next; 467 } 468 } 469 return null; 470 } 471 472 boolean removeFromStack(Element el) { 473 for (int pos = stack.size() -1; pos >= 0; pos--) { 474 Element next = stack.get(pos); 475 if (next == el) { 476 stack.remove(pos); 477 onNodeClosed(el); 478 return true; 479 } 480 } 481 return false; 482 } 483 484 /** Pops the stack until the given HTML element is removed. */ 485 @Nullable 486 Element popStackToClose(String elName) { 487 for (int pos = stack.size() -1; pos >= 0; pos--) { 488 Element el = pop(); 489 if (el.elementIs(elName, NamespaceHtml)) { 490 return el; 491 } 492 } 493 return null; 494 } 495 496 /** Pops the stack until an element with the supplied name is removed, irrespective of namespace. */ 497 @Nullable 498 Element popStackToCloseAnyNamespace(String elName) { 499 for (int pos = stack.size() -1; pos >= 0; pos--) { 500 Element el = pop(); 501 if (el.nameIs(elName)) { 502 return el; 503 } 504 } 505 return null; 506 } 507 508 /** Pops the stack until one of the given HTML elements is removed. */ 509 void popStackToClose(String... elNames) { // elnames is sorted, comes from Constants 510 for (int pos = stack.size() -1; pos >= 0; pos--) { 511 Element el = pop(); 512 if (inSorted(el.normalName(), elNames) && NamespaceHtml.equals(el.tag().namespace())) { 513 break; 514 } 515 } 516 } 517 518 void clearStackToTableContext() { 519 clearStackToContext("table", "template"); 520 } 521 522 void clearStackToTableBodyContext() { 523 clearStackToContext("tbody", "tfoot", "thead", "template"); 524 } 525 526 void clearStackToTableRowContext() { 527 clearStackToContext("tr", "template"); 528 } 529 530 /** Removes elements from the stack until one of the supplied HTML elements is removed. */ 531 private void clearStackToContext(String... nodeNames) { 532 for (int pos = stack.size() -1; pos >= 0; pos--) { 533 Element next = stack.get(pos); 534 if (NamespaceHtml.equals(next.tag().namespace()) && 535 (StringUtil.in(next.normalName(), nodeNames) || next.nameIs("html"))) 536 break; 537 else 538 pop(); 539 } 540 } 541 542 @Nullable Element aboveOnStack(Element el) { 543 assert onStack(el); 544 for (int pos = stack.size() -1; pos >= 0; pos--) { 545 Element next = stack.get(pos); 546 if (next == el) { 547 return stack.get(pos-1); 548 } 549 } 550 return null; 551 } 552 553 void insertOnStackAfter(Element after, Element in) { 554 int i = stack.lastIndexOf(after); 555 Validate.isTrue(i != -1); 556 stack.add(i+1, in); 557 } 558 559 void replaceOnStack(Element out, Element in) { 560 replaceInQueue(stack, out, in); 561 } 562 563 private static void replaceInQueue(ArrayList<Element> queue, Element out, Element in) { 564 int i = queue.lastIndexOf(out); 565 Validate.isTrue(i != -1); 566 queue.set(i, in); 567 } 568 569 /** 570 * Reset the insertion mode, by searching up the stack for an appropriate insertion mode. The stack search depth 571 * is limited to {@link #maxQueueDepth}. 572 * @return true if the insertion mode was actually changed. 573 */ 574 boolean resetInsertionMode() { 575 // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode 576 boolean last = false; 577 final int bottom = stack.size() - 1; 578 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 579 final HtmlTreeBuilderState origState = this.state; 580 581 if (stack.size() == 0) { // nothing left of stack, just get to body 582 transition(HtmlTreeBuilderState.InBody); 583 } 584 585 LOOP: for (int pos = bottom; pos >= upper; pos--) { 586 Element node = stack.get(pos); 587 if (pos == upper) { 588 last = true; 589 if (fragmentParsing) 590 node = contextElement; 591 } 592 String name = node != null ? node.normalName() : ""; 593 if (!NamespaceHtml.equals(node.tag().namespace())) 594 continue; // only looking for HTML elements here 595 596 switch (name) { 597 case "select": 598 transition(HtmlTreeBuilderState.InSelect); 599 // todo - should loop up (with some limit) and check for table or template hits 600 break LOOP; 601 case "td": 602 case "th": 603 if (!last) { 604 transition(HtmlTreeBuilderState.InCell); 605 break LOOP; 606 } 607 break; 608 case "tr": 609 transition(HtmlTreeBuilderState.InRow); 610 break LOOP; 611 case "tbody": 612 case "thead": 613 case "tfoot": 614 transition(HtmlTreeBuilderState.InTableBody); 615 break LOOP; 616 case "caption": 617 transition(HtmlTreeBuilderState.InCaption); 618 break LOOP; 619 case "colgroup": 620 transition(HtmlTreeBuilderState.InColumnGroup); 621 break LOOP; 622 case "table": 623 transition(HtmlTreeBuilderState.InTable); 624 break LOOP; 625 case "template": 626 HtmlTreeBuilderState tmplState = currentTemplateMode(); 627 Validate.notNull(tmplState, "Bug: no template insertion mode on stack!"); 628 transition(tmplState); 629 break LOOP; 630 case "head": 631 if (!last) { 632 transition(HtmlTreeBuilderState.InHead); 633 break LOOP; 634 } 635 break; 636 case "body": 637 transition(HtmlTreeBuilderState.InBody); 638 break LOOP; 639 case "frameset": 640 transition(HtmlTreeBuilderState.InFrameset); 641 break LOOP; 642 case "html": 643 transition(headElement == null ? HtmlTreeBuilderState.BeforeHead : HtmlTreeBuilderState.AfterHead); 644 break LOOP; 645 } 646 if (last) { 647 transition(HtmlTreeBuilderState.InBody); 648 break; 649 } 650 } 651 return state != origState; 652 } 653 654 /** Places the body back onto the stack and moves to InBody, for cases in AfterBody / AfterAfterBody when more content comes */ 655 void resetBody() { 656 if (!onStack("body")) { 657 stack.add(doc.body()); // not onNodeInserted, as already seen 658 } 659 transition(HtmlTreeBuilderState.InBody); 660 } 661 662 // todo: tidy up in specific scope methods 663 private final String[] specificScopeTarget = {null}; 664 665 private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { 666 specificScopeTarget[0] = targetName; 667 return inSpecificScope(specificScopeTarget, baseTypes, extraTypes); 668 } 669 670 private boolean inSpecificScope(String[] targetNames, String[] baseTypes, @Nullable String[] extraTypes) { 671 // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope 672 final int bottom = stack.size() -1; 673 final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; 674 // don't walk too far up the tree 675 676 for (int pos = bottom; pos >= top; pos--) { 677 Element el = stack.get(pos); 678 if (!el.tag().namespace().equals(NamespaceHtml)) continue; 679 680 final String elName = el.normalName(); 681 if (inSorted(elName, targetNames)) 682 return true; 683 if (inSorted(elName, baseTypes)) 684 return false; 685 if (extraTypes != null && inSorted(elName, extraTypes)) 686 return false; 687 } 688 //Validate.fail("Should not be reachable"); // would end up false because hitting 'html' at root (basetypes) 689 return false; 690 } 691 692 boolean inScope(String[] targetNames) { 693 return inSpecificScope(targetNames, TagsSearchInScope, null); 694 } 695 696 boolean inScope(String targetName) { 697 return inScope(targetName, null); 698 } 699 700 boolean inScope(String targetName, String[] extras) { 701 return inSpecificScope(targetName, TagsSearchInScope, extras); 702 // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml 703 // todo: in svg namespace: forignOjbect, desc, title 704 } 705 706 boolean inListItemScope(String targetName) { 707 return inScope(targetName, TagSearchList); 708 } 709 710 boolean inButtonScope(String targetName) { 711 return inScope(targetName, TagSearchButton); 712 } 713 714 boolean inTableScope(String targetName) { 715 return inSpecificScope(targetName, TagSearchTableScope, null); 716 } 717 718 boolean inSelectScope(String targetName) { 719 for (int pos = stack.size() -1; pos >= 0; pos--) { 720 Element el = stack.get(pos); 721 String elName = el.normalName(); 722 if (elName.equals(targetName)) 723 return true; 724 if (!inSorted(elName, TagSearchSelectScope)) // all elements except 725 return false; 726 } 727 Validate.fail("Should not be reachable"); 728 return false; 729 } 730 731 /** Tests if there is some element on the stack that is not in the provided set. */ 732 boolean onStackNot(String[] allowedTags) { 733 final int bottom = stack.size() -1; 734 final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; 735 // don't walk too far up the tree 736 737 for (int pos = bottom; pos >= top; pos--) { 738 final String elName = stack.get(pos).normalName(); 739 if (!inSorted(elName, allowedTags)) 740 return true; 741 } 742 return false; 743 } 744 745 void setHeadElement(Element headElement) { 746 this.headElement = headElement; 747 } 748 749 Element getHeadElement() { 750 return headElement; 751 } 752 753 boolean isFosterInserts() { 754 return fosterInserts; 755 } 756 757 void setFosterInserts(boolean fosterInserts) { 758 this.fosterInserts = fosterInserts; 759 } 760 761 @Nullable FormElement getFormElement() { 762 return formElement; 763 } 764 765 void setFormElement(FormElement formElement) { 766 this.formElement = formElement; 767 } 768 769 void resetPendingTableCharacters() { 770 pendingTableCharacters.clear(); 771 } 772 773 List<Token.Character> getPendingTableCharacters() { 774 return pendingTableCharacters; 775 } 776 777 void addPendingTableCharacters(Token.Character c) { 778 // make a clone of the token to maintain its state (as Tokens are otherwise reset) 779 Token.Character clone = c.clone(); 780 pendingTableCharacters.add(clone); 781 } 782 783 /** 784 13.2.6.3 Closing elements that have implied end tags 785 When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, or an rtc element, the UA must pop the current node off the stack of open elements. 786 787 If a step requires the UA to generate implied end tags but lists an element to exclude from the process, then the UA must perform the above steps as if that element was not in the above list. 788 789 When the steps below require the UA to generate all implied end tags thoroughly, then, while the current node is a caption element, a colgroup element, a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, an rtc element, a tbody element, a td element, a tfoot element, a th element, a thead element, or a tr element, the UA must pop the current node off the stack of open elements. 790 791 @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the 792 process, then the UA must perform the above steps as if that element was not in the above list. 793 */ 794 void generateImpliedEndTags(String excludeTag) { 795 while (inSorted(currentElement().normalName(), TagSearchEndTags)) { 796 if (excludeTag != null && currentElementIs(excludeTag)) 797 break; 798 pop(); 799 } 800 } 801 802 void generateImpliedEndTags() { 803 generateImpliedEndTags(false); 804 } 805 806 /** 807 Pops HTML elements off the stack according to the implied end tag rules 808 @param thorough if we are thorough (includes table elements etc) or not 809 */ 810 void generateImpliedEndTags(boolean thorough) { 811 final String[] search = thorough ? TagThoroughSearchEndTags : TagSearchEndTags; 812 while (NamespaceHtml.equals(currentElement().tag().namespace()) 813 && inSorted(currentElement().normalName(), search)) { 814 pop(); 815 } 816 } 817 818 void closeElement(String name) { 819 generateImpliedEndTags(name); 820 if (!name.equals(currentElement().normalName())) error(state()); 821 popStackToClose(name); 822 } 823 824 static boolean isSpecial(Element el) { 825 // todo: mathml's mi, mo, mn 826 // todo: svg's foreigObject, desc, title 827 String name = el.normalName(); 828 return inSorted(name, TagSearchSpecial); 829 } 830 831 Element lastFormattingElement() { 832 return formattingElements.size() > 0 ? formattingElements.get(formattingElements.size()-1) : null; 833 } 834 835 int positionOfElement(Element el){ 836 for (int i = 0; i < formattingElements.size(); i++){ 837 if (el == formattingElements.get(i)) 838 return i; 839 } 840 return -1; 841 } 842 843 Element removeLastFormattingElement() { 844 int size = formattingElements.size(); 845 if (size > 0) 846 return formattingElements.remove(size-1); 847 else 848 return null; 849 } 850 851 // active formatting elements 852 void pushActiveFormattingElements(Element in) { 853 checkActiveFormattingElements(in); 854 formattingElements.add(in); 855 } 856 857 void pushWithBookmark(Element in, int bookmark){ 858 checkActiveFormattingElements(in); 859 // catch any range errors and assume bookmark is incorrect - saves a redundant range check. 860 try { 861 formattingElements.add(bookmark, in); 862 } catch (IndexOutOfBoundsException e) { 863 formattingElements.add(in); 864 } 865 } 866 867 void checkActiveFormattingElements(Element in){ 868 int numSeen = 0; 869 final int size = formattingElements.size() -1; 870 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 871 872 for (int pos = size; pos >= ceil; pos--) { 873 Element el = formattingElements.get(pos); 874 if (el == null) // marker 875 break; 876 877 if (isSameFormattingElement(in, el)) 878 numSeen++; 879 880 if (numSeen == 3) { 881 formattingElements.remove(pos); 882 break; 883 } 884 } 885 } 886 887 private static boolean isSameFormattingElement(Element a, Element b) { 888 // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children 889 return a.normalName().equals(b.normalName()) && 890 // a.namespace().equals(b.namespace()) && 891 a.attributes().equals(b.attributes()); 892 // todo: namespaces 893 } 894 895 void reconstructFormattingElements() { 896 if (stack.size() > maxQueueDepth) 897 return; 898 Element last = lastFormattingElement(); 899 if (last == null || onStack(last)) 900 return; 901 902 Element entry = last; 903 int size = formattingElements.size(); 904 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 905 int pos = size - 1; 906 boolean skip = false; 907 while (true) { 908 if (pos == ceil) { // step 4. if none before, skip to 8 909 skip = true; 910 break; 911 } 912 entry = formattingElements.get(--pos); // step 5. one earlier than entry 913 if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack 914 break; // jump to 8, else continue back to 4 915 } 916 while(true) { 917 if (!skip) // step 7: on later than entry 918 entry = formattingElements.get(++pos); 919 Validate.notNull(entry); // should not occur, as we break at last element 920 921 // 8. create new element from element, 9 insert into current node, onto stack 922 skip = false; // can only skip increment from 4. 923 Element newEl = new Element(tagFor(entry.normalName(), settings), null, entry.attributes().clone()); 924 doInsertElement(newEl, null); 925 926 // 10. replace entry with new entry 927 formattingElements.set(pos, newEl); 928 929 // 11 930 if (pos == size-1) // if not last entry in list, jump to 7 931 break; 932 } 933 } 934 private static final int maxUsedFormattingElements = 12; // limit how many elements get recreated 935 936 void clearFormattingElementsToLastMarker() { 937 while (!formattingElements.isEmpty()) { 938 Element el = removeLastFormattingElement(); 939 if (el == null) 940 break; 941 } 942 } 943 944 void removeFromActiveFormattingElements(Element el) { 945 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 946 Element next = formattingElements.get(pos); 947 if (next == el) { 948 formattingElements.remove(pos); 949 break; 950 } 951 } 952 } 953 954 boolean isInActiveFormattingElements(Element el) { 955 return onStack(formattingElements, el); 956 } 957 958 @Nullable 959 Element getActiveFormattingElement(String nodeName) { 960 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 961 Element next = formattingElements.get(pos); 962 if (next == null) // scope marker 963 break; 964 else if (next.nameIs(nodeName)) 965 return next; 966 } 967 return null; 968 } 969 970 void replaceActiveFormattingElement(Element out, Element in) { 971 replaceInQueue(formattingElements, out, in); 972 } 973 974 void insertMarkerToFormattingElements() { 975 formattingElements.add(null); 976 } 977 978 void insertInFosterParent(Node in) { 979 Element fosterParent; 980 Element lastTable = getFromStack("table"); 981 boolean isLastTableParent = false; 982 if (lastTable != null) { 983 if (lastTable.parent() != null) { 984 fosterParent = lastTable.parent(); 985 isLastTableParent = true; 986 } else 987 fosterParent = aboveOnStack(lastTable); 988 } else { // no table == frag 989 fosterParent = stack.get(0); 990 } 991 992 if (isLastTableParent) { 993 Validate.notNull(lastTable); // last table cannot be null by this point. 994 lastTable.before(in); 995 } 996 else 997 fosterParent.appendChild(in); 998 } 999 1000 // Template Insertion Mode stack 1001 void pushTemplateMode(HtmlTreeBuilderState state) { 1002 tmplInsertMode.add(state); 1003 } 1004 1005 @Nullable HtmlTreeBuilderState popTemplateMode() { 1006 if (tmplInsertMode.size() > 0) { 1007 return tmplInsertMode.remove(tmplInsertMode.size() -1); 1008 } else { 1009 return null; 1010 } 1011 } 1012 1013 int templateModeSize() { 1014 return tmplInsertMode.size(); 1015 } 1016 1017 @Nullable HtmlTreeBuilderState currentTemplateMode() { 1018 return (tmplInsertMode.size() > 0) ? tmplInsertMode.get(tmplInsertMode.size() -1) : null; 1019 } 1020 1021 @Override 1022 public String toString() { 1023 return "TreeBuilder{" + 1024 "currentToken=" + currentToken + 1025 ", state=" + state + 1026 ", currentElement=" + currentElement() + 1027 '}'; 1028 } 1029 1030 @Override protected boolean isContentForTagData(final String normalName) { 1031 return (normalName.equals("script") || normalName.equals("style")); 1032 } 1033}