001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.Normalizer;
005import org.jsoup.internal.StringUtil;
006import org.jsoup.nodes.Attributes;
007import org.jsoup.nodes.CDataNode;
008import org.jsoup.nodes.Comment;
009import org.jsoup.nodes.DataNode;
010import org.jsoup.nodes.Document;
011import org.jsoup.nodes.Element;
012import org.jsoup.nodes.FormElement;
013import org.jsoup.nodes.Node;
014import org.jsoup.nodes.TextNode;
015import org.jspecify.annotations.Nullable;
016
017import java.io.Reader;
018import java.util.ArrayList;
019import java.util.List;
020
021import static org.jsoup.internal.StringUtil.inSorted;
022import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster;
023import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent;
024import static org.jsoup.parser.Parser.*;
025
026/**
027 * HTML Tree Builder; creates a DOM from Tokens.
028 */
029public class HtmlTreeBuilder extends TreeBuilder {
030    // tag searches. must be sorted, used in inSorted. HtmlTreeBuilderTest validates they're sorted.
031    static final String[] TagsSearchInScope = new String[]{ // a particular element in scope
032        "applet", "caption", "html", "marquee", "object", "table", "td", "template", "th"
033    };
034    // math and svg namespaces for particular element in scope
035    static final String[]TagSearchInScopeMath = new String[] {
036        "annotation-xml",  "mi", "mn", "mo", "ms", "mtext"
037    };
038    static final String[]TagSearchInScopeSvg = new String[] {
039        "desc", "foreignObject", "title"
040    };
041
042    static final String[] TagSearchList = new String[]{"ol", "ul"};
043    static final String[] TagSearchButton = new String[]{"button"};
044    static final String[] TagSearchTableScope = new String[]{"html", "table"};
045    static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"};
046    static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc"};
047    static final String[] TagThoroughSearchEndTags = new String[]{"caption", "colgroup", "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"};
048    static final String[] TagSearchSpecial = new String[]{
049        "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", "blockquote", "body", "br",
050        "button", "caption", "center", "col", "colgroup", "dd", "details", "dir", "div", "dl", "dt", "embed",
051        "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6",
052        "head", "header", "hgroup", "hr", "html", "iframe", "img", "input", "keygen", "li", "link", "listing", "main",
053        "marquee", "menu", "meta", "nav", "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext",
054        "pre", "script", "search", "section", "select", "source", "style", "summary", "table", "tbody", "td",
055        "template", "textarea", "tfoot", "th", "thead", "title", "tr", "track", "ul", "wbr", "xmp"};
056    static String[] TagSearchSpecialMath = {"annotation-xml", "mi", "mn", "mo", "ms", "mtext"}; // differs to MathML text integration point; adds annotation-xml
057    static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"};
058    static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"};
059    static final String[] TagFormListed = {
060        "button", "fieldset", "input", "keygen", "object", "output", "select", "textarea"
061    };
062
063    public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages
064
    // Per-parse state. Every field below is (re)assigned in initialiseParse().
    private HtmlTreeBuilderState state; // the current state; process(Token) dispatches to it unless foreign content handling applies
    private HtmlTreeBuilderState originalState; // original / marked state, saved by markInsertionMode()

    private boolean baseUriSetFromDoc; // true once maybeSetBaseUri() has accepted a <base href>; later ones are ignored
    private @Nullable Element headElement; // the current head element
    private @Nullable FormElement formElement; // the current form element; listed form controls get added to it on insert
    private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing
    ArrayList<Element> formattingElements; // active (open) formatting elements
    private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes
    private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out
    private Token.EndTag emptyEnd; // reused empty end tag, emitted when an allowed self-closing start tag is inserted

    private boolean framesetOk; // if ok to go into frameset
    private boolean fosterInserts; // if next inserts should be fostered
    private boolean fragmentParsing; // if parsing a fragment of html
080
081    @Override ParseSettings defaultSettings() {
082        return ParseSettings.htmlDefault;
083    }
084
085    @Override
086    HtmlTreeBuilder newInstance() {
087        return new HtmlTreeBuilder();
088    }
089
090    @Override
091    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
092        super.initialiseParse(input, baseUri, parser);
093
094        // this is a bit mucky. todo - probably just create new parser objects to ensure all reset.
095        state = HtmlTreeBuilderState.Initial;
096        originalState = null;
097        baseUriSetFromDoc = false;
098        headElement = null;
099        formElement = null;
100        contextElement = null;
101        formattingElements = new ArrayList<>();
102        tmplInsertMode = new ArrayList<>();
103        pendingTableCharacters = new ArrayList<>();
104        emptyEnd = new Token.EndTag(this);
105        framesetOk = true;
106        fosterInserts = false;
107        fragmentParsing = false;
108    }
109
110    @Override void initialiseParseFragment(@Nullable Element context) {
111        // context may be null
112        state = HtmlTreeBuilderState.Initial;
113        fragmentParsing = true;
114
115        if (context != null) {
116            final String contextName = context.normalName();
117            contextElement = new Element(tagFor(contextName, contextName, defaultNamespace(), settings), baseUri);
118            if (context.ownerDocument() != null) // quirks setup:
119                doc.quirksMode(context.ownerDocument().quirksMode());
120
121            // initialise the tokeniser state:
122            switch (contextName) {
123                case "script":
124                    tokeniser.transition(TokeniserState.ScriptData);
125                    break;
126                case "plaintext":
127                    tokeniser.transition(TokeniserState.PLAINTEXT);
128                    break;
129                case "template":
130                    tokeniser.transition(TokeniserState.Data);
131                    pushTemplateMode(HtmlTreeBuilderState.InTemplate);
132                    break;
133                default:
134                    Tag tag = contextElement.tag();
135                    TokeniserState textState = tag.textState();
136                    if (textState != null)
137                        tokeniser.transition(textState); // style, xmp, title, textarea, etc; or custom
138                    else
139                        tokeniser.transition(TokeniserState.Data);
140            }
141            doc.appendChild(contextElement);
142            push(contextElement);
143            resetInsertionMode();
144
145            // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated
146            // with form correctly
147            Element formSearch = context;
148            while (formSearch != null) {
149                if (formSearch instanceof FormElement) {
150                    formElement = (FormElement) formSearch;
151                    break;
152                }
153                formSearch = formSearch.parent();
154            }
155        }
156    }
157
158    @Override List<Node> completeParseFragment() {
159        if (contextElement != null) {
160            // depending on context and the input html, content may have been added outside of the root el
161            // e.g. context=p, input=div, the div will have been pushed out.
162            List<Node> nodes = contextElement.siblingNodes();
163            if (!nodes.isEmpty())
164                contextElement.insertChildren(-1, nodes);
165            return contextElement.childNodes();
166        }
167        else
168            return doc.childNodes();
169    }
170
171    @Override
172    protected boolean process(Token token) {
173        HtmlTreeBuilderState dispatch = useCurrentOrForeignInsert(token) ? this.state : ForeignContent;
174        return dispatch.process(token, this);
175    }
176
177    boolean useCurrentOrForeignInsert(Token token) {
178        // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
179        // If the stack of open elements is empty
180        if (stack.isEmpty())
181            return true;
182        final Element el = currentElement();
183        final String ns = el.tag().namespace();
184
185        // If the adjusted current node is an element in the HTML namespace
186        if (NamespaceHtml.equals(ns))
187            return true;
188
189        // If the adjusted current node is a MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark"
190        // If the adjusted current node is a MathML text integration point and the token is a character token
191        if (isMathmlTextIntegration(el)) {
192            if (token.isStartTag()
193                    && !"mglyph".equals(token.asStartTag().normalName)
194                    && !"malignmark".equals(token.asStartTag().normalName))
195                    return true;
196            if (token.isCharacter())
197                    return true;
198        }
199        // If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg"
200        if (Parser.NamespaceMathml.equals(ns)
201            && el.nameIs("annotation-xml")
202            && token.isStartTag()
203            && "svg".equals(token.asStartTag().normalName))
204            return true;
205
206        // If the adjusted current node is an HTML integration point and the token is a start tag
207        // If the adjusted current node is an HTML integration point and the token is a character token
208        if (isHtmlIntegration(el)
209            && (token.isStartTag() || token.isCharacter()))
210            return true;
211
212        // If the token is an end-of-file token
213        return token.isEOF();
214    }
215
216    static boolean isMathmlTextIntegration(Element el) {
217        /*
218        A node is a MathML text integration point if it is one of the following elements:
219        A MathML mi element
220        A MathML mo element
221        A MathML mn element
222        A MathML ms element
223        A MathML mtext element
224         */
225        return (Parser.NamespaceMathml.equals(el.tag().namespace())
226            && StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration));
227    }
228
229    static boolean isHtmlIntegration(Element el) {
230        /*
231        A node is an HTML integration point if it is one of the following elements:
232        A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html"
233        A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml"
234        An SVG foreignObject element
235        An SVG desc element
236        An SVG title element
237         */
238        if (Parser.NamespaceMathml.equals(el.tag().namespace())
239            && el.nameIs("annotation-xml")) {
240            String encoding = Normalizer.normalize(el.attr("encoding"));
241            if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml"))
242                return true;
243        }
244        // note using .tagName for case-sensitive hit here of foreignObject
245        return Parser.NamespaceSvg.equals(el.tag().namespace()) && StringUtil.in(el.tagName(), TagSvgHtmlIntegration);
246    }
247
248    boolean process(Token token, HtmlTreeBuilderState state) {
249        return state.process(token, this);
250    }
251
252    void transition(HtmlTreeBuilderState state) {
253        this.state = state;
254    }
255
256    HtmlTreeBuilderState state() {
257        return state;
258    }
259
260    void markInsertionMode() {
261        originalState = state;
262    }
263
264    HtmlTreeBuilderState originalState() {
265        return originalState;
266    }
267
268    void framesetOk(boolean framesetOk) {
269        this.framesetOk = framesetOk;
270    }
271
272    boolean framesetOk() {
273        return framesetOk;
274    }
275
276    Document getDocument() {
277        return doc;
278    }
279
280    String getBaseUri() {
281        return baseUri;
282    }
283
284    void maybeSetBaseUri(Element base) {
285        if (baseUriSetFromDoc) // only listen to the first <base href> in parse
286            return;
287
288        String href = base.absUrl("href");
289        if (href.length() != 0) { // ignore <base target> etc
290            baseUri = href;
291            baseUriSetFromDoc = true;
292            doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants
293        }
294    }
295
296    boolean isFragmentParsing() {
297        return fragmentParsing;
298    }
299
300    void error(HtmlTreeBuilderState state) {
301        if (parser.getErrors().canAddError())
302            parser.getErrors().add(new ParseError(reader, "Unexpected %s token [%s] when in state [%s]",
303                currentToken.tokenType(), currentToken, state));
304    }
305
306    Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) {
307        // dedupe and normalize the attributes:
308        Attributes attributes = startTag.attributes;
309        if (attributes != null && !attributes.isEmpty()) {
310            if (!forcePreserveCase)
311                settings.normalizeAttributes(attributes);
312            int dupes = attributes.deduplicate(settings);
313            if (dupes > 0) {
314                error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName);
315            }
316        }
317
318        Tag tag = tagFor(startTag.name(), startTag.normalName, namespace,
319            forcePreserveCase ? ParseSettings.preserveCase : settings);
320
321        return (tag.normalName().equals("form")) ?
322            new FormElement(tag, null, attributes) :
323            new Element(tag, null, attributes);
324    }
325
326    /** Inserts an HTML element for the given tag */
327    Element insertElementFor(final Token.StartTag startTag) {
328        Element el = createElementFor(startTag, NamespaceHtml, false);
329        doInsertElement(el);
330
331        // handle self-closing tags. when the spec expects an empty (void) tag, will directly hit insertEmpty, so won't generate this fake end tag.
332        if (startTag.isSelfClosing()) {
333            Tag tag = el.tag();
334            tag.setSeenSelfClose(); // can infer output if in xml syntax
335            if (tag.isKnownTag() && (tag.isEmpty() || tag.isSelfClosing())) {
336                // ok, allow it. effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state
337                tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data
338                tokeniser.emit(emptyEnd.reset().name(el.tagName()));  // ensure we get out of whatever state we are in. emitted for yielded processing
339            } else {
340                // error it, and leave the inserted element on
341                tokeniser.error("Tag [%s] cannot be self-closing; not a void tag", tag.normalName());
342            }
343        }
344
345        return el;
346    }
347
348    /**
349     Inserts a foreign element. Preserves the case of the tag name and of the attributes.
350     */
351    Element insertForeignElementFor(final Token.StartTag startTag, String namespace) {
352        Element el = createElementFor(startTag, namespace, true);
353        doInsertElement(el);
354
355        if (startTag.isSelfClosing()) { // foreign els are OK to self-close
356            el.tag().setSeenSelfClose(); // remember this is self-closing for output
357            pop();
358        }
359
360        return el;
361    }
362
363    Element insertEmptyElementFor(Token.StartTag startTag) {
364        Element el = createElementFor(startTag, NamespaceHtml, false);
365        doInsertElement(el);
366        pop();
367        return el;
368    }
369
370    FormElement insertFormElement(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) {
371        FormElement el = (FormElement) createElementFor(startTag, NamespaceHtml, false);
372
373        if (checkTemplateStack) {
374            if(!onStack("template"))
375                setFormElement(el);
376        } else
377            setFormElement(el);
378
379        doInsertElement(el);
380        if (!onStack) pop();
381        return el;
382    }
383
384    /** Inserts the Element onto the stack. All element inserts must run through this method. Performs any general
385     tests on the Element before insertion.
386     * @param el the Element to insert and make the current element
387     */
388    private void doInsertElement(Element el) {
389        if (formElement != null && el.tag().namespace.equals(NamespaceHtml) && StringUtil.inSorted(el.normalName(), TagFormListed))
390            formElement.addElement(el); // connect form controls to their form element
391
392        // in HTML, the xmlns attribute if set must match what the parser set the tag's namespace to
393        if (parser.getErrors().canAddError() && el.hasAttr("xmlns") && !el.attr("xmlns").equals(el.tag().namespace()))
394            error("Invalid xmlns attribute [%s] on tag [%s]", el.attr("xmlns"), el.tagName());
395
396        if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), InTableFoster))
397            insertInFosterParent(el);
398        else
399            currentElement().appendChild(el);
400
401        push(el);
402    }
403
404    void insertCommentNode(Token.Comment token) {
405        Comment node = new Comment(token.getData());
406        currentElement().appendChild(node);
407        onNodeInserted(node);
408    }
409
410    /** Inserts the provided character token into the current element. */
411    void insertCharacterNode(Token.Character characterToken) {
412        Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack)
413        insertCharacterToElement(characterToken, el);
414    }
415
416    /** Inserts the provided character token into the provided element. */
417    void insertCharacterToElement(Token.Character characterToken, Element el) {
418        final Node node;
419        final String data = characterToken.getData();
420
421        if (characterToken.isCData())
422            node = new CDataNode(data);
423        else if (el.tag().is(Tag.Data))
424            node = new DataNode(data);
425        else
426            node = new TextNode(data);
427        el.appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack.
428        onNodeInserted(node);
429    }
430
431    ArrayList<Element> getStack() {
432        return stack;
433    }
434
435    boolean onStack(Element el) {
436        return onStack(stack, el);
437    }
438
439    /** Checks if there is an HTML element with the given name on the stack. */
440    boolean onStack(String elName) {
441        return getFromStack(elName) != null;
442    }
443
444    private static final int maxQueueDepth = 256; // an arbitrary tension point between real HTML and crafted pain
445    private static boolean onStack(ArrayList<Element> queue, Element element) {
446        final int bottom = queue.size() - 1;
447        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
448        for (int pos = bottom; pos >= upper; pos--) {
449            Element next = queue.get(pos);
450            if (next == element) {
451                return true;
452            }
453        }
454        return false;
455    }
456
457    /** Gets the nearest (lowest) HTML element with the given name from the stack. */
458    @Nullable
459    Element getFromStack(String elName) {
460        final int bottom = stack.size() - 1;
461        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
462        for (int pos = bottom; pos >= upper; pos--) {
463            Element next = stack.get(pos);
464            if (next.elementIs(elName, NamespaceHtml)) {
465                return next;
466            }
467        }
468        return null;
469    }
470
471    boolean removeFromStack(Element el) {
472        for (int pos = stack.size() -1; pos >= 0; pos--) {
473            Element next = stack.get(pos);
474            if (next == el) {
475                stack.remove(pos);
476                onNodeClosed(el);
477                return true;
478            }
479        }
480        return false;
481    }
482
483    /** Pops the stack until the given HTML element is removed. */
484    @Nullable
485    Element popStackToClose(String elName) {
486        for (int pos = stack.size() -1; pos >= 0; pos--) {
487            Element el = pop();
488            if (el.elementIs(elName, NamespaceHtml)) {
489                return el;
490            }
491        }
492        return null;
493    }
494
495    /** Pops the stack until an element with the supplied name is removed, irrespective of namespace. */
496    @Nullable
497    Element popStackToCloseAnyNamespace(String elName) {
498        for (int pos = stack.size() -1; pos >= 0; pos--) {
499            Element el = pop();
500            if (el.nameIs(elName)) {
501                return el;
502            }
503        }
504        return null;
505    }
506
507    /** Pops the stack until one of the given HTML elements is removed. */
508    void popStackToClose(String... elNames) { // elnames is sorted, comes from Constants
509        for (int pos = stack.size() -1; pos >= 0; pos--) {
510            Element el = pop();
511            if (inSorted(el.normalName(), elNames) && NamespaceHtml.equals(el.tag().namespace())) {
512                break;
513            }
514        }
515    }
516
517    void clearStackToTableContext() {
518        clearStackToContext("table", "template");
519    }
520
521    void clearStackToTableBodyContext() {
522        clearStackToContext("tbody", "tfoot", "thead", "template");
523    }
524
525    void clearStackToTableRowContext() {
526        clearStackToContext("tr", "template");
527    }
528
529    /** Removes elements from the stack until one of the supplied HTML elements is removed. */
530    private void clearStackToContext(String... nodeNames) {
531        for (int pos = stack.size() -1; pos >= 0; pos--) {
532            Element next = stack.get(pos);
533            if (NamespaceHtml.equals(next.tag().namespace()) &&
534                (StringUtil.in(next.normalName(), nodeNames) || next.nameIs("html")))
535                break;
536            else
537                pop();
538        }
539    }
540
541    /**
542     Gets the Element immediately above the supplied element on the stack. Which due to adoption, may not necessarily be
543     its parent.
544
545     @param el
546     @return the Element immediately above the supplied element, or null if there is no such element.
547     */
548    @Nullable Element aboveOnStack(Element el) {
549        if (!onStack(el)) return null;
550        for (int pos = stack.size() -1; pos > 0; pos--) {
551            Element next = stack.get(pos);
552            if (next == el) {
553                return stack.get(pos-1);
554            }
555        }
556        return null;
557    }
558
559    void insertOnStackAfter(Element after, Element in) {
560        int i = stack.lastIndexOf(after);
561        Validate.isTrue(i != -1);
562        stack.add(i+1, in);
563    }
564
565    void replaceOnStack(Element out, Element in) {
566        replaceInQueue(stack, out, in);
567    }
568
569    private static void replaceInQueue(ArrayList<Element> queue, Element out, Element in) {
570        int i = queue.lastIndexOf(out);
571        Validate.isTrue(i != -1);
572        queue.set(i, in);
573    }
574
575    /**
576     * Reset the insertion mode, by searching up the stack for an appropriate insertion mode. The stack search depth
577     * is limited to {@link #maxQueueDepth}.
578     * @return true if the insertion mode was actually changed.
579     */
580    boolean resetInsertionMode() {
581        // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode
582        boolean last = false;
583        final int bottom = stack.size() - 1;
584        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
585        final HtmlTreeBuilderState origState = this.state;
586
587        if (stack.size() == 0) { // nothing left of stack, just get to body
588            transition(HtmlTreeBuilderState.InBody);
589        }
590
591        LOOP: for (int pos = bottom; pos >= upper; pos--) {
592            Element node = stack.get(pos);
593            if (pos == upper) {
594                last = true;
595                if (fragmentParsing)
596                    node = contextElement;
597            }
598            String name = node != null ? node.normalName() : "";
599            if (!NamespaceHtml.equals(node.tag().namespace()))
600                continue; // only looking for HTML elements here
601
602            switch (name) {
603                case "select":
604                    transition(HtmlTreeBuilderState.InSelect);
605                    // todo - should loop up (with some limit) and check for table or template hits
606                    break LOOP;
607                case "td":
608                case "th":
609                    if (!last) {
610                        transition(HtmlTreeBuilderState.InCell);
611                        break LOOP;
612                    }
613                    break;
614                case "tr":
615                    transition(HtmlTreeBuilderState.InRow);
616                    break LOOP;
617                case "tbody":
618                case "thead":
619                case "tfoot":
620                    transition(HtmlTreeBuilderState.InTableBody);
621                    break LOOP;
622                case "caption":
623                    transition(HtmlTreeBuilderState.InCaption);
624                    break LOOP;
625                case "colgroup":
626                    transition(HtmlTreeBuilderState.InColumnGroup);
627                    break LOOP;
628                case "table":
629                    transition(HtmlTreeBuilderState.InTable);
630                    break LOOP;
631                case "template":
632                    HtmlTreeBuilderState tmplState = currentTemplateMode();
633                    Validate.notNull(tmplState, "Bug: no template insertion mode on stack!");
634                    transition(tmplState);
635                    break LOOP;
636                case "head":
637                    if (!last) {
638                        transition(HtmlTreeBuilderState.InHead);
639                        break LOOP;
640                    }
641                    break;
642                case "body":
643                    transition(HtmlTreeBuilderState.InBody);
644                    break LOOP;
645                case "frameset":
646                    transition(HtmlTreeBuilderState.InFrameset);
647                    break LOOP;
648                case "html":
649                    transition(headElement == null ? HtmlTreeBuilderState.BeforeHead : HtmlTreeBuilderState.AfterHead);
650                    break LOOP;
651            }
652            if (last) {
653                transition(HtmlTreeBuilderState.InBody);
654                break;
655            }
656        }
657        return state != origState;
658    }
659
660    /** Places the body back onto the stack and moves to InBody, for cases in AfterBody / AfterAfterBody when more content comes */
661    void resetBody() {
662        if (!onStack("body")) {
663            stack.add(doc.body()); // not onNodeInserted, as already seen
664        }
665        transition(HtmlTreeBuilderState.InBody);
666    }
667
668    // todo: tidy up in specific scope methods
669    private final String[] specificScopeTarget = {null};
670
671    private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) {
672        specificScopeTarget[0] = targetName;
673        return inSpecificScope(specificScopeTarget, baseTypes, extraTypes);
674    }
675
676    private boolean inSpecificScope(String[] targetNames, String[] baseTypes, @Nullable String[] extraTypes) {
677        // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope
678        final int bottom = stack.size() -1;
679        final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
680        // don't walk too far up the tree
681        for (int pos = bottom; pos >= top; pos--) {
682            Element el = stack.get(pos);
683            String elName = el.normalName();
684            // namespace checks - arguments provided are always in html ns, with this bolt-on for math and svg:
685            String ns = el.tag().namespace();
686            if (ns.equals(NamespaceHtml)) {
687                if (inSorted(elName, targetNames))
688                    return true;
689                if (inSorted(elName, baseTypes))
690                    return false;
691                if (extraTypes != null && inSorted(elName, extraTypes))
692                    return false;
693            } else if (baseTypes == TagsSearchInScope) {
694                if (ns.equals(NamespaceMathml) && inSorted(elName, TagSearchInScopeMath))
695                    return false;
696                if (ns.equals(NamespaceSvg) && inSorted(elName, TagSearchInScopeSvg))
697                    return false;
698            }
699        }
700        //Validate.fail("Should not be reachable"); // would end up false because hitting 'html' at root (basetypes)
701        return false;
702    }
703
704    boolean inScope(String[] targetNames) {
705        return inSpecificScope(targetNames, TagsSearchInScope, null);
706    }
707
708    boolean inScope(String targetName) {
709        return inScope(targetName, null);
710    }
711
712    boolean inScope(String targetName, String[] extras) {
713        return inSpecificScope(targetName, TagsSearchInScope, extras);
714    }
715
716    boolean inListItemScope(String targetName) {
717        return inScope(targetName, TagSearchList);
718    }
719
720    boolean inButtonScope(String targetName) {
721        return inScope(targetName, TagSearchButton);
722    }
723
724    boolean inTableScope(String targetName) {
725        return inSpecificScope(targetName, TagSearchTableScope, null);
726    }
727
728    boolean inSelectScope(String targetName) {
729        for (int pos = stack.size() -1; pos >= 0; pos--) {
730            Element el = stack.get(pos);
731            String elName = el.normalName();
732            if (elName.equals(targetName))
733                return true;
734            if (!inSorted(elName, TagSearchSelectScope)) // all elements except
735                return false;
736        }
737        return false; // nothing left on stack
738    }
739
740    /** Tests if there is some element on the stack that is not in the provided set. */
741    boolean onStackNot(String[] allowedTags) {
742        final int bottom = stack.size() -1;
743        final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
744        // don't walk too far up the tree
745
746        for (int pos = bottom; pos >= top; pos--) {
747            final String elName = stack.get(pos).normalName();
748            if (!inSorted(elName, allowedTags))
749                return true;
750        }
751        return false;
752    }
753
754    void setHeadElement(Element headElement) {
755        this.headElement = headElement;
756    }
757
758    Element getHeadElement() {
759        return headElement;
760    }
761
762    boolean isFosterInserts() {
763        return fosterInserts;
764    }
765
766    void setFosterInserts(boolean fosterInserts) {
767        this.fosterInserts = fosterInserts;
768    }
769
770    @Nullable FormElement getFormElement() {
771        return formElement;
772    }
773
774    void setFormElement(FormElement formElement) {
775        this.formElement = formElement;
776    }
777
778    void resetPendingTableCharacters() {
779        pendingTableCharacters.clear();
780    }
781
782    List<Token.Character> getPendingTableCharacters() {
783        return pendingTableCharacters;
784    }
785
786    void addPendingTableCharacters(Token.Character c) {
787        // make a copy of the token to maintain its state (as Tokens are otherwise reset)
788        Token.Character copy = new Token.Character(c);
789        pendingTableCharacters.add(copy);
790    }
791
792    /**
793     13.2.6.3 Closing elements that have implied end tags
794     When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, or an rtc element, the UA must pop the current node off the stack of open elements.
795
796     If a step requires the UA to generate implied end tags but lists an element to exclude from the process, then the UA must perform the above steps as if that element was not in the above list.
797
798     When the steps below require the UA to generate all implied end tags thoroughly, then, while the current node is a caption element, a colgroup element, a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, an rtc element, a tbody element, a td element, a tfoot element, a th element, a thead element, or a tr element, the UA must pop the current node off the stack of open elements.
799
800     @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the
801     process, then the UA must perform the above steps as if that element was not in the above list.
802     */
803    void generateImpliedEndTags(String excludeTag) {
804        while (inSorted(currentElement().normalName(), TagSearchEndTags)) {
805            if (excludeTag != null && currentElementIs(excludeTag))
806                break;
807            pop();
808        }
809    }
810
811    void generateImpliedEndTags() {
812        generateImpliedEndTags(false);
813    }
814
815    /**
816     Pops HTML elements off the stack according to the implied end tag rules
817     @param thorough if we are thorough (includes table elements etc) or not
818     */
819    void generateImpliedEndTags(boolean thorough) {
820        final String[] search = thorough ? TagThoroughSearchEndTags : TagSearchEndTags;
821        while (NamespaceHtml.equals(currentElement().tag().namespace())
822            && inSorted(currentElement().normalName(), search)) {
823            pop();
824        }
825    }
826
827    void closeElement(String name) {
828        generateImpliedEndTags(name);
829        if (!name.equals(currentElement().normalName())) error(state());
830        popStackToClose(name);
831    }
832
833    static boolean isSpecial(Element el) {
834        String namespace = el.tag().namespace();
835        String name = el.normalName();
836        switch (namespace) {
837            case NamespaceHtml:
838                return inSorted(name, TagSearchSpecial);
839            case Parser.NamespaceMathml:
840                return inSorted(name, TagSearchSpecialMath);
841            case Parser.NamespaceSvg:
842                return inSorted(name, TagSvgHtmlIntegration);
843            default:
844                return false;
845        }
846    }
847
848    Element lastFormattingElement() {
849        return formattingElements.size() > 0 ? formattingElements.get(formattingElements.size()-1) : null;
850    }
851
852    int positionOfElement(Element el){
853        for (int i = 0; i < formattingElements.size(); i++){
854            if (el == formattingElements.get(i))
855                return i;
856        }
857        return -1;
858    }
859
860    Element removeLastFormattingElement() {
861        int size = formattingElements.size();
862        if (size > 0)
863            return formattingElements.remove(size-1);
864        else
865            return null;
866    }
867
868    // active formatting elements
869    void pushActiveFormattingElements(Element in) {
870        checkActiveFormattingElements(in);
871        formattingElements.add(in);
872    }
873
874    void pushWithBookmark(Element in, int bookmark){
875        checkActiveFormattingElements(in);
876        // catch any range errors and assume bookmark is incorrect - saves a redundant range check.
877        try {
878            formattingElements.add(bookmark, in);
879        } catch (IndexOutOfBoundsException e) {
880            formattingElements.add(in);
881        }
882    }
883
884    void checkActiveFormattingElements(Element in){
885        int numSeen = 0;
886        final int size = formattingElements.size() -1;
887        int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0;
888
889        for (int pos = size; pos >= ceil; pos--) {
890            Element el = formattingElements.get(pos);
891            if (el == null) // marker
892                break;
893
894            if (isSameFormattingElement(in, el))
895                numSeen++;
896
897            if (numSeen == 3) {
898                formattingElements.remove(pos);
899                break;
900            }
901        }
902    }
903
904    private static boolean isSameFormattingElement(Element a, Element b) {
905        // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children
906        return a.normalName().equals(b.normalName()) &&
907                // a.namespace().equals(b.namespace()) &&
908                a.attributes().equals(b.attributes());
909        // todo: namespaces
910    }
911
912    void reconstructFormattingElements() {
913        if (stack.size() > maxQueueDepth)
914            return;
915        Element last = lastFormattingElement();
916        if (last == null || onStack(last))
917            return;
918
919        Element entry = last;
920        int size = formattingElements.size();
921        int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0;
922        int pos = size - 1;
923        boolean skip = false;
924        while (true) {
925            if (pos == ceil) { // step 4. if none before, skip to 8
926                skip = true;
927                break;
928            }
929            entry = formattingElements.get(--pos); // step 5. one earlier than entry
930            if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack
931                break; // jump to 8, else continue back to 4
932        }
933        while(true) {
934            if (!skip) // step 7: on later than entry
935                entry = formattingElements.get(++pos);
936            Validate.notNull(entry); // should not occur, as we break at last element
937
938            // 8. create new element from element, 9 insert into current node, onto stack
939            skip = false; // can only skip increment from 4.
940            Element newEl = new Element(tagFor(entry.nodeName(), entry.normalName(), defaultNamespace(), settings), null, entry.attributes().clone());
941            doInsertElement(newEl);
942
943            // 10. replace entry with new entry
944            formattingElements.set(pos, newEl);
945
946            // 11
947            if (pos == size-1) // if not last entry in list, jump to 7
948                break;
949        }
950    }
951    private static final int maxUsedFormattingElements = 12; // limit how many elements get recreated
952
953    void clearFormattingElementsToLastMarker() {
954        while (!formattingElements.isEmpty()) {
955            Element el = removeLastFormattingElement();
956            if (el == null)
957                break;
958        }
959    }
960
961    void removeFromActiveFormattingElements(Element el) {
962        for (int pos = formattingElements.size() -1; pos >= 0; pos--) {
963            Element next = formattingElements.get(pos);
964            if (next == el) {
965                formattingElements.remove(pos);
966                break;
967            }
968        }
969    }
970
971    boolean isInActiveFormattingElements(Element el) {
972        return onStack(formattingElements, el);
973    }
974
975    @Nullable
976    Element getActiveFormattingElement(String nodeName) {
977        for (int pos = formattingElements.size() -1; pos >= 0; pos--) {
978            Element next = formattingElements.get(pos);
979            if (next == null) // scope marker
980                break;
981            else if (next.nameIs(nodeName))
982                return next;
983        }
984        return null;
985    }
986
987    void replaceActiveFormattingElement(Element out, Element in) {
988        replaceInQueue(formattingElements, out, in);
989    }
990
991    void insertMarkerToFormattingElements() {
992        formattingElements.add(null);
993    }
994
995    void insertInFosterParent(Node in) {
996        Element fosterParent;
997        Element lastTable = getFromStack("table");
998        boolean isLastTableParent = false;
999        if (lastTable != null) {
1000            if (lastTable.parent() != null) {
1001                fosterParent = lastTable.parent();
1002                isLastTableParent = true;
1003            } else
1004                fosterParent = aboveOnStack(lastTable);
1005        } else { // no table == frag
1006            fosterParent = stack.get(0);
1007        }
1008
1009        if (isLastTableParent) {
1010            Validate.notNull(lastTable); // last table cannot be null by this point.
1011            lastTable.before(in);
1012        }
1013        else
1014            fosterParent.appendChild(in);
1015    }
1016
1017    // Template Insertion Mode stack
1018    void pushTemplateMode(HtmlTreeBuilderState state) {
1019        tmplInsertMode.add(state);
1020    }
1021
1022    @Nullable HtmlTreeBuilderState popTemplateMode() {
1023        if (tmplInsertMode.size() > 0) {
1024            return tmplInsertMode.remove(tmplInsertMode.size() -1);
1025        } else {
1026            return null;
1027        }
1028    }
1029
1030    int templateModeSize() {
1031        return tmplInsertMode.size();
1032    }
1033
1034    @Nullable HtmlTreeBuilderState currentTemplateMode() {
1035        return (tmplInsertMode.size() > 0) ? tmplInsertMode.get(tmplInsertMode.size() -1)  : null;
1036    }
1037
1038    @Override
1039    public String toString() {
1040        return "TreeBuilder{" +
1041                "currentToken=" + currentToken +
1042                ", state=" + state +
1043                ", currentElement=" + currentElement() +
1044                '}';
1045    }
1046
1047}