001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.Normalizer;
005import org.jsoup.internal.StringUtil;
006import org.jsoup.nodes.Attributes;
007import org.jsoup.nodes.CDataNode;
008import org.jsoup.nodes.Comment;
009import org.jsoup.nodes.DataNode;
010import org.jsoup.nodes.Document;
011import org.jsoup.nodes.Element;
012import org.jsoup.nodes.FormElement;
013import org.jsoup.nodes.Node;
014import org.jsoup.nodes.TextNode;
015import org.jspecify.annotations.Nullable;
016
017import java.io.Reader;
018import java.util.ArrayList;
019import java.util.List;
020
021import static org.jsoup.internal.StringUtil.inSorted;
022import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster;
023import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent;
024import static org.jsoup.parser.Parser.*;
025
026/**
027 * HTML Tree Builder; creates a DOM from Tokens.
028 */
029public class HtmlTreeBuilder extends TreeBuilder {
030    // tag searches. must be sorted, used in inSorted. HtmlTreeBuilderTest validates they're sorted.
031    static final String[] TagsSearchInScope = new String[]{ // a particular element in scope
032        "applet", "caption", "html", "marquee", "object", "table", "td", "template", "th"
033    };
034    // math and svg namespaces for particular element in scope
035    static final String[]TagSearchInScopeMath = new String[] {
036        "annotation-xml",  "mi", "mn", "mo", "ms", "mtext"
037    };
038    static final String[]TagSearchInScopeSvg = new String[] {
039        "desc", "foreignObject", "title"
040    };
041
042    static final String[] TagSearchList = new String[]{"ol", "ul"};
043    static final String[] TagSearchButton = new String[]{"button"};
044    static final String[] TagSearchTableScope = new String[]{"html", "table"};
045    static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"};
046    static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc"};
047    static final String[] TagThoroughSearchEndTags = new String[]{"caption", "colgroup", "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"};
048    static final String[] TagSearchSpecial = new String[]{
049        "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", "blockquote", "body", "br",
050        "button", "caption", "center", "col", "colgroup", "dd", "details", "dir", "div", "dl", "dt", "embed",
051        "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6",
052        "head", "header", "hgroup", "hr", "html", "iframe", "img", "input", "keygen", "li", "link", "listing", "main",
053        "marquee", "menu", "meta", "nav", "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext",
054        "pre", "script", "search", "section", "select", "source", "style", "summary", "table", "tbody", "td",
055        "template", "textarea", "tfoot", "th", "thead", "title", "tr", "track", "ul", "wbr", "xmp"};
056    static String[] TagSearchSpecialMath = {"annotation-xml", "mi", "mn", "mo", "ms", "mtext"}; // differs to MathML text integration point; adds annotation-xml
057    static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"};
058    static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"};
059
060    public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages
061
    // Per-parse state. Every field below is (re)assigned by initialiseParse() at the start of a parse.
    private HtmlTreeBuilderState state; // the current state
    private HtmlTreeBuilderState originalState; // original / marked state; copied from state by markInsertionMode()

    private boolean baseUriSetFromDoc; // set once maybeSetBaseUri() has applied a base href; later base tags are then ignored
    private @Nullable Element headElement; // the current head element
    private @Nullable FormElement formElement; // the current form element
    private @Nullable Element contextElement; // fragment parse root; name only copy of context. could be null even if fragment parsing
    ArrayList<Element> formattingElements; // active (open) formatting elements
    private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes
    private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out
    private Token.EndTag emptyEnd; // reused empty end tag

    private boolean framesetOk; // if ok to go into frameset
    private boolean fosterInserts; // if next inserts should be fostered
    private boolean fragmentParsing; // if parsing a fragment of html
077
078    @Override ParseSettings defaultSettings() {
079        return ParseSettings.htmlDefault;
080    }
081
082    @Override
083    HtmlTreeBuilder newInstance() {
084        return new HtmlTreeBuilder();
085    }
086
087    @Override
088    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
089        super.initialiseParse(input, baseUri, parser);
090
091        // this is a bit mucky. todo - probably just create new parser objects to ensure all reset.
092        state = HtmlTreeBuilderState.Initial;
093        originalState = null;
094        baseUriSetFromDoc = false;
095        headElement = null;
096        formElement = null;
097        contextElement = null;
098        formattingElements = new ArrayList<>();
099        tmplInsertMode = new ArrayList<>();
100        pendingTableCharacters = new ArrayList<>();
101        emptyEnd = new Token.EndTag(this);
102        framesetOk = true;
103        fosterInserts = false;
104        fragmentParsing = false;
105    }
106
107    @Override void initialiseParseFragment(@Nullable Element context) {
108        // context may be null
109        state = HtmlTreeBuilderState.Initial;
110        fragmentParsing = true;
111
112        if (context != null) {
113            final String contextName = context.normalName();
114            contextElement = new Element(tagFor(contextName, contextName, defaultNamespace(), settings), baseUri);
115            if (context.ownerDocument() != null) // quirks setup:
116                doc.quirksMode(context.ownerDocument().quirksMode());
117
118            // initialise the tokeniser state:
119            switch (contextName) {
120                case "title":
121                case "textarea":
122                    tokeniser.transition(TokeniserState.Rcdata);
123                    break;
124                case "iframe":
125                case "noembed":
126                case "noframes":
127                case "style":
128                case "xmp":
129                    tokeniser.transition(TokeniserState.Rawtext);
130                    break;
131                case "script":
132                    tokeniser.transition(TokeniserState.ScriptData);
133                    break;
134                case "plaintext":
135                    tokeniser.transition(TokeniserState.PLAINTEXT);
136                    break;
137                case "template":
138                    tokeniser.transition(TokeniserState.Data);
139                    pushTemplateMode(HtmlTreeBuilderState.InTemplate);
140                    break;
141                default:
142                    tokeniser.transition(TokeniserState.Data);
143            }
144            doc.appendChild(contextElement);
145            push(contextElement);
146            resetInsertionMode();
147
148            // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated
149            // with form correctly
150            Element formSearch = context;
151            while (formSearch != null) {
152                if (formSearch instanceof FormElement) {
153                    formElement = (FormElement) formSearch;
154                    break;
155                }
156                formSearch = formSearch.parent();
157            }
158        }
159    }
160
161    @Override List<Node> completeParseFragment() {
162        if (contextElement != null) {
163            // depending on context and the input html, content may have been added outside of the root el
164            // e.g. context=p, input=div, the div will have been pushed out.
165            List<Node> nodes = contextElement.siblingNodes();
166            if (!nodes.isEmpty())
167                contextElement.insertChildren(-1, nodes);
168            return contextElement.childNodes();
169        }
170        else
171            return doc.childNodes();
172    }
173
174    @Override
175    protected boolean process(Token token) {
176        HtmlTreeBuilderState dispatch = useCurrentOrForeignInsert(token) ? this.state : ForeignContent;
177        return dispatch.process(token, this);
178    }
179
180    boolean useCurrentOrForeignInsert(Token token) {
181        // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
182        // If the stack of open elements is empty
183        if (stack.isEmpty())
184            return true;
185        final Element el = currentElement();
186        final String ns = el.tag().namespace();
187
188        // If the adjusted current node is an element in the HTML namespace
189        if (NamespaceHtml.equals(ns))
190            return true;
191
192        // If the adjusted current node is a MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark"
193        // If the adjusted current node is a MathML text integration point and the token is a character token
194        if (isMathmlTextIntegration(el)) {
195            if (token.isStartTag()
196                    && !"mglyph".equals(token.asStartTag().normalName)
197                    && !"malignmark".equals(token.asStartTag().normalName))
198                    return true;
199            if (token.isCharacter())
200                    return true;
201        }
202        // If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg"
203        if (Parser.NamespaceMathml.equals(ns)
204            && el.nameIs("annotation-xml")
205            && token.isStartTag()
206            && "svg".equals(token.asStartTag().normalName))
207            return true;
208
209        // If the adjusted current node is an HTML integration point and the token is a start tag
210        // If the adjusted current node is an HTML integration point and the token is a character token
211        if (isHtmlIntegration(el)
212            && (token.isStartTag() || token.isCharacter()))
213            return true;
214
215        // If the token is an end-of-file token
216        return token.isEOF();
217    }
218
219    static boolean isMathmlTextIntegration(Element el) {
220        /*
221        A node is a MathML text integration point if it is one of the following elements:
222        A MathML mi element
223        A MathML mo element
224        A MathML mn element
225        A MathML ms element
226        A MathML mtext element
227         */
228        return (Parser.NamespaceMathml.equals(el.tag().namespace())
229            && StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration));
230    }
231
232    static boolean isHtmlIntegration(Element el) {
233        /*
234        A node is an HTML integration point if it is one of the following elements:
235        A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html"
236        A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml"
237        An SVG foreignObject element
238        An SVG desc element
239        An SVG title element
240         */
241        if (Parser.NamespaceMathml.equals(el.tag().namespace())
242            && el.nameIs("annotation-xml")) {
243            String encoding = Normalizer.normalize(el.attr("encoding"));
244            if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml"))
245                return true;
246        }
247        // note using .tagName for case-sensitive hit here of foreignObject
248        return Parser.NamespaceSvg.equals(el.tag().namespace()) && StringUtil.in(el.tagName(), TagSvgHtmlIntegration);
249    }
250
251    boolean process(Token token, HtmlTreeBuilderState state) {
252        return state.process(token, this);
253    }
254
    /** Makes the supplied state the current state. */
    void transition(HtmlTreeBuilderState state) {
        this.state = state;
    }
258
259    HtmlTreeBuilderState state() {
260        return state;
261    }
262
263    void markInsertionMode() {
264        originalState = state;
265    }
266
267    HtmlTreeBuilderState originalState() {
268        return originalState;
269    }
270
    /** Sets the frameset-ok flag (if ok to go into frameset). */
    void framesetOk(boolean framesetOk) {
        this.framesetOk = framesetOk;
    }
274
275    boolean framesetOk() {
276        return framesetOk;
277    }
278
279    Document getDocument() {
280        return doc;
281    }
282
283    String getBaseUri() {
284        return baseUri;
285    }
286
287    void maybeSetBaseUri(Element base) {
288        if (baseUriSetFromDoc) // only listen to the first <base href> in parse
289            return;
290
291        String href = base.absUrl("href");
292        if (href.length() != 0) { // ignore <base target> etc
293            baseUri = href;
294            baseUriSetFromDoc = true;
295            doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants
296        }
297    }
298
299    boolean isFragmentParsing() {
300        return fragmentParsing;
301    }
302
303    void error(HtmlTreeBuilderState state) {
304        if (parser.getErrors().canAddError())
305            parser.getErrors().add(new ParseError(reader, "Unexpected %s token [%s] when in state [%s]",
306                currentToken.tokenType(), currentToken, state));
307    }
308
309    Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) {
310        // dedupe and normalize the attributes:
311        Attributes attributes = startTag.attributes;
312        if (!forcePreserveCase)
313            attributes = settings.normalizeAttributes(attributes);
314        if (attributes != null && !attributes.isEmpty()) {
315            int dupes = attributes.deduplicate(settings);
316            if (dupes > 0) {
317                error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName);
318            }
319        }
320
321        Tag tag = tagFor(startTag.tagName, startTag.normalName, namespace,
322            forcePreserveCase ? ParseSettings.preserveCase : settings);
323
324        return (tag.normalName().equals("form")) ?
325            new FormElement(tag, null, attributes) :
326            new Element(tag, null, attributes);
327    }
328
329    /** Inserts an HTML element for the given tag) */
330    Element insertElementFor(final Token.StartTag startTag) {
331        Element el = createElementFor(startTag, NamespaceHtml, false);
332        doInsertElement(el, startTag);
333
334        // handle self-closing tags. when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag.
335        if (startTag.isSelfClosing()) {
336            Tag tag = el.tag();
337            if (tag.isKnownTag()) {
338                if (!tag.isEmpty())
339                    tokeniser.error("Tag [%s] cannot be self closing; not a void tag", tag.normalName());
340                // else: ok
341            }
342            else { // unknown tag: remember this is self-closing, for output
343                tag.setSelfClosing();
344            }
345
346            // effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state
347            tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data
348            tokeniser.emit(emptyEnd.reset().name(el.tagName()));  // ensure we get out of whatever state we are in. emitted for yielded processing
349        }
350
351        return el;
352    }
353
354    /**
355     Inserts a foreign element. Preserves the case of the tag name and of the attributes.
356     */
357    Element insertForeignElementFor(final Token.StartTag startTag, String namespace) {
358        Element el = createElementFor(startTag, namespace, true);
359        doInsertElement(el, startTag);
360
361        if (startTag.isSelfClosing()) {
362            el.tag().setSelfClosing(); // remember this is self-closing for output
363            pop();
364        }
365
366        return el;
367    }
368
369    Element insertEmptyElementFor(Token.StartTag startTag) {
370        Element el = createElementFor(startTag, NamespaceHtml, false);
371        doInsertElement(el, startTag);
372        pop();
373        return el;
374    }
375
376    FormElement insertFormElement(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) {
377        FormElement el = (FormElement) createElementFor(startTag, NamespaceHtml, false);
378
379        if (checkTemplateStack) {
380            if(!onStack("template"))
381                setFormElement(el);
382        } else
383            setFormElement(el);
384
385        doInsertElement(el, startTag);
386        if (!onStack) pop();
387        return el;
388    }
389
390    /** Inserts the Element onto the stack. All element inserts must run through this method. Performs any general
391     tests on the Element before insertion.
392     * @param el the Element to insert and make the current element
393     * @param token the token this element was parsed from. If null, uses a zero-width current token as intrinsic insert
394     */
395    private void doInsertElement(Element el, @Nullable Token token) {
396        if (el.tag().isFormListed() && formElement != null)
397            formElement.addElement(el); // connect form controls to their form element
398
399        // in HTML, the xmlns attribute if set must match what the parser set the tag's namespace to
400        if (parser.getErrors().canAddError() && el.hasAttr("xmlns") && !el.attr("xmlns").equals(el.tag().namespace()))
401            error("Invalid xmlns attribute [%s] on tag [%s]", el.attr("xmlns"), el.tagName());
402
403        if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), InTableFoster))
404            insertInFosterParent(el);
405        else
406            currentElement().appendChild(el);
407
408        push(el);
409    }
410
411    void insertCommentNode(Token.Comment token) {
412        Comment node = new Comment(token.getData());
413        currentElement().appendChild(node);
414        onNodeInserted(node);
415    }
416
417    /** Inserts the provided character token into the current element. */
418    void insertCharacterNode(Token.Character characterToken) {
419        Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack)
420        insertCharacterToElement(characterToken, el);
421    }
422
423    /** Inserts the provided character token into the provided element. */
424    void insertCharacterToElement(Token.Character characterToken, Element el) {
425        final Node node;
426        final String tagName = el.normalName();
427        final String data = characterToken.getData();
428
429        if (characterToken.isCData())
430            node = new CDataNode(data);
431        else if (isContentForTagData(tagName))
432            node = new DataNode(data);
433        else
434            node = new TextNode(data);
435        el.appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack.
436        onNodeInserted(node);
437    }
438
439    ArrayList<Element> getStack() {
440        return stack;
441    }
442
443    boolean onStack(Element el) {
444        return onStack(stack, el);
445    }
446
447    /** Checks if there is an HTML element with the given name on the stack. */
448    boolean onStack(String elName) {
449        return getFromStack(elName) != null;
450    }
451
452    private static final int maxQueueDepth = 256; // an arbitrary tension point between real HTML and crafted pain
453    private static boolean onStack(ArrayList<Element> queue, Element element) {
454        final int bottom = queue.size() - 1;
455        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
456        for (int pos = bottom; pos >= upper; pos--) {
457            Element next = queue.get(pos);
458            if (next == element) {
459                return true;
460            }
461        }
462        return false;
463    }
464
465    /** Gets the nearest (lowest) HTML element with the given name from the stack. */
466    @Nullable
467    Element getFromStack(String elName) {
468        final int bottom = stack.size() - 1;
469        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
470        for (int pos = bottom; pos >= upper; pos--) {
471            Element next = stack.get(pos);
472            if (next.elementIs(elName, NamespaceHtml)) {
473                return next;
474            }
475        }
476        return null;
477    }
478
479    boolean removeFromStack(Element el) {
480        for (int pos = stack.size() -1; pos >= 0; pos--) {
481            Element next = stack.get(pos);
482            if (next == el) {
483                stack.remove(pos);
484                onNodeClosed(el);
485                return true;
486            }
487        }
488        return false;
489    }
490
491    /** Pops the stack until the given HTML element is removed. */
492    @Nullable
493    Element popStackToClose(String elName) {
494        for (int pos = stack.size() -1; pos >= 0; pos--) {
495            Element el = pop();
496            if (el.elementIs(elName, NamespaceHtml)) {
497                return el;
498            }
499        }
500        return null;
501    }
502
503    /** Pops the stack until an element with the supplied name is removed, irrespective of namespace. */
504    @Nullable
505    Element popStackToCloseAnyNamespace(String elName) {
506        for (int pos = stack.size() -1; pos >= 0; pos--) {
507            Element el = pop();
508            if (el.nameIs(elName)) {
509                return el;
510            }
511        }
512        return null;
513    }
514
515    /** Pops the stack until one of the given HTML elements is removed. */
516    void popStackToClose(String... elNames) { // elnames is sorted, comes from Constants
517        for (int pos = stack.size() -1; pos >= 0; pos--) {
518            Element el = pop();
519            if (inSorted(el.normalName(), elNames) && NamespaceHtml.equals(el.tag().namespace())) {
520                break;
521            }
522        }
523    }
524
525    void clearStackToTableContext() {
526        clearStackToContext("table", "template");
527    }
528
529    void clearStackToTableBodyContext() {
530        clearStackToContext("tbody", "tfoot", "thead", "template");
531    }
532
533    void clearStackToTableRowContext() {
534        clearStackToContext("tr", "template");
535    }
536
537    /** Removes elements from the stack until one of the supplied HTML elements is removed. */
538    private void clearStackToContext(String... nodeNames) {
539        for (int pos = stack.size() -1; pos >= 0; pos--) {
540            Element next = stack.get(pos);
541            if (NamespaceHtml.equals(next.tag().namespace()) &&
542                (StringUtil.in(next.normalName(), nodeNames) || next.nameIs("html")))
543                break;
544            else
545                pop();
546        }
547    }
548
549    /**
550     Gets the Element immediately above the supplied element on the stack. Which due to adoption, may not necessarily be
551     its parent.
552
553     @param el
554     @return the Element immediately above the supplied element, or null if there is no such element.
555     */
556    @Nullable Element aboveOnStack(Element el) {
557        assert onStack(el);
558        for (int pos = stack.size() -1; pos >= 0; pos--) {
559            Element next = stack.get(pos);
560            if (next == el) {
561                return stack.get(pos-1);
562            }
563        }
564        return null;
565    }
566
567    void insertOnStackAfter(Element after, Element in) {
568        int i = stack.lastIndexOf(after);
569        Validate.isTrue(i != -1);
570        stack.add(i+1, in);
571    }
572
573    void replaceOnStack(Element out, Element in) {
574        replaceInQueue(stack, out, in);
575    }
576
577    private static void replaceInQueue(ArrayList<Element> queue, Element out, Element in) {
578        int i = queue.lastIndexOf(out);
579        Validate.isTrue(i != -1);
580        queue.set(i, in);
581    }
582
583    /**
584     * Reset the insertion mode, by searching up the stack for an appropriate insertion mode. The stack search depth
585     * is limited to {@link #maxQueueDepth}.
586     * @return true if the insertion mode was actually changed.
587     */
588    boolean resetInsertionMode() {
589        // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode
590        boolean last = false;
591        final int bottom = stack.size() - 1;
592        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
593        final HtmlTreeBuilderState origState = this.state;
594
595        if (stack.size() == 0) { // nothing left of stack, just get to body
596            transition(HtmlTreeBuilderState.InBody);
597        }
598
599        LOOP: for (int pos = bottom; pos >= upper; pos--) {
600            Element node = stack.get(pos);
601            if (pos == upper) {
602                last = true;
603                if (fragmentParsing)
604                    node = contextElement;
605            }
606            String name = node != null ? node.normalName() : "";
607            if (!NamespaceHtml.equals(node.tag().namespace()))
608                continue; // only looking for HTML elements here
609
610            switch (name) {
611                case "select":
612                    transition(HtmlTreeBuilderState.InSelect);
613                    // todo - should loop up (with some limit) and check for table or template hits
614                    break LOOP;
615                case "td":
616                case "th":
617                    if (!last) {
618                        transition(HtmlTreeBuilderState.InCell);
619                        break LOOP;
620                    }
621                    break;
622                case "tr":
623                    transition(HtmlTreeBuilderState.InRow);
624                    break LOOP;
625                case "tbody":
626                case "thead":
627                case "tfoot":
628                    transition(HtmlTreeBuilderState.InTableBody);
629                    break LOOP;
630                case "caption":
631                    transition(HtmlTreeBuilderState.InCaption);
632                    break LOOP;
633                case "colgroup":
634                    transition(HtmlTreeBuilderState.InColumnGroup);
635                    break LOOP;
636                case "table":
637                    transition(HtmlTreeBuilderState.InTable);
638                    break LOOP;
639                case "template":
640                    HtmlTreeBuilderState tmplState = currentTemplateMode();
641                    Validate.notNull(tmplState, "Bug: no template insertion mode on stack!");
642                    transition(tmplState);
643                    break LOOP;
644                case "head":
645                    if (!last) {
646                        transition(HtmlTreeBuilderState.InHead);
647                        break LOOP;
648                    }
649                    break;
650                case "body":
651                    transition(HtmlTreeBuilderState.InBody);
652                    break LOOP;
653                case "frameset":
654                    transition(HtmlTreeBuilderState.InFrameset);
655                    break LOOP;
656                case "html":
657                    transition(headElement == null ? HtmlTreeBuilderState.BeforeHead : HtmlTreeBuilderState.AfterHead);
658                    break LOOP;
659            }
660            if (last) {
661                transition(HtmlTreeBuilderState.InBody);
662                break;
663            }
664        }
665        return state != origState;
666    }
667
668    /** Places the body back onto the stack and moves to InBody, for cases in AfterBody / AfterAfterBody when more content comes */
669    void resetBody() {
670        if (!onStack("body")) {
671            stack.add(doc.body()); // not onNodeInserted, as already seen
672        }
673        transition(HtmlTreeBuilderState.InBody);
674    }
675
676    // todo: tidy up in specific scope methods
677    private final String[] specificScopeTarget = {null};
678
679    private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) {
680        specificScopeTarget[0] = targetName;
681        return inSpecificScope(specificScopeTarget, baseTypes, extraTypes);
682    }
683
684    private boolean inSpecificScope(String[] targetNames, String[] baseTypes, @Nullable String[] extraTypes) {
685        // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope
686        final int bottom = stack.size() -1;
687        final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
688        // don't walk too far up the tree
689        for (int pos = bottom; pos >= top; pos--) {
690            Element el = stack.get(pos);
691            String elName = el.normalName();
692            // namespace checks - arguments provided are always in html ns, with this bolt-on for math and svg:
693            String ns = el.tag().namespace();
694            if (ns.equals(NamespaceHtml)) {
695                if (inSorted(elName, targetNames))
696                    return true;
697                if (inSorted(elName, baseTypes))
698                    return false;
699                if (extraTypes != null && inSorted(elName, extraTypes))
700                    return false;
701            } else if (baseTypes == TagsSearchInScope) {
702                if (ns.equals(NamespaceMathml) && inSorted(elName, TagSearchInScopeMath))
703                    return false;
704                if (ns.equals(NamespaceSvg) && inSorted(elName, TagSearchInScopeSvg))
705                    return false;
706            }
707        }
708        //Validate.fail("Should not be reachable"); // would end up false because hitting 'html' at root (basetypes)
709        return false;
710    }
711
712    boolean inScope(String[] targetNames) {
713        return inSpecificScope(targetNames, TagsSearchInScope, null);
714    }
715
716    boolean inScope(String targetName) {
717        return inScope(targetName, null);
718    }
719
720    boolean inScope(String targetName, String[] extras) {
721        return inSpecificScope(targetName, TagsSearchInScope, extras);
722        // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml
723        // todo: in svg namespace: forignOjbect, desc, title
724    }
725
726    boolean inListItemScope(String targetName) {
727        return inScope(targetName, TagSearchList);
728    }
729
730    boolean inButtonScope(String targetName) {
731        return inScope(targetName, TagSearchButton);
732    }
733
734    boolean inTableScope(String targetName) {
735        return inSpecificScope(targetName, TagSearchTableScope, null);
736    }
737
738    boolean inSelectScope(String targetName) {
739        for (int pos = stack.size() -1; pos >= 0; pos--) {
740            Element el = stack.get(pos);
741            String elName = el.normalName();
742            if (elName.equals(targetName))
743                return true;
744            if (!inSorted(elName, TagSearchSelectScope)) // all elements except
745                return false;
746        }
747        Validate.fail("Should not be reachable");
748        return false;
749    }
750
751    /** Tests if there is some element on the stack that is not in the provided set. */
752    boolean onStackNot(String[] allowedTags) {
753        final int bottom = stack.size() -1;
754        final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
755        // don't walk too far up the tree
756
757        for (int pos = bottom; pos >= top; pos--) {
758            final String elName = stack.get(pos).normalName();
759            if (!inSorted(elName, allowedTags))
760                return true;
761        }
762        return false;
763    }
764
765    void setHeadElement(Element headElement) {
766        this.headElement = headElement;
767    }
768
769    Element getHeadElement() {
770        return headElement;
771    }
772
773    boolean isFosterInserts() {
774        return fosterInserts;
775    }
776
777    void setFosterInserts(boolean fosterInserts) {
778        this.fosterInserts = fosterInserts;
779    }
780
781    @Nullable FormElement getFormElement() {
782        return formElement;
783    }
784
785    void setFormElement(FormElement formElement) {
786        this.formElement = formElement;
787    }
788
789    void resetPendingTableCharacters() {
790        pendingTableCharacters.clear();
791    }
792
793    List<Token.Character> getPendingTableCharacters() {
794        return pendingTableCharacters;
795    }
796
797    void addPendingTableCharacters(Token.Character c) {
798        // make a clone of the token to maintain its state (as Tokens are otherwise reset)
799        Token.Character clone = c.clone();
800        pendingTableCharacters.add(clone);
801    }
802
803    /**
804     13.2.6.3 Closing elements that have implied end tags
805     When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, or an rtc element, the UA must pop the current node off the stack of open elements.
806
807     If a step requires the UA to generate implied end tags but lists an element to exclude from the process, then the UA must perform the above steps as if that element was not in the above list.
808
809     When the steps below require the UA to generate all implied end tags thoroughly, then, while the current node is a caption element, a colgroup element, a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, an rtc element, a tbody element, a td element, a tfoot element, a th element, a thead element, or a tr element, the UA must pop the current node off the stack of open elements.
810
811     @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the
812     process, then the UA must perform the above steps as if that element was not in the above list.
813     */
814    void generateImpliedEndTags(String excludeTag) {
815        while (inSorted(currentElement().normalName(), TagSearchEndTags)) {
816            if (excludeTag != null && currentElementIs(excludeTag))
817                break;
818            pop();
819        }
820    }
821
822    void generateImpliedEndTags() {
823        generateImpliedEndTags(false);
824    }
825
826    /**
827     Pops HTML elements off the stack according to the implied end tag rules
828     @param thorough if we are thorough (includes table elements etc) or not
829     */
830    void generateImpliedEndTags(boolean thorough) {
831        final String[] search = thorough ? TagThoroughSearchEndTags : TagSearchEndTags;
832        while (NamespaceHtml.equals(currentElement().tag().namespace())
833            && inSorted(currentElement().normalName(), search)) {
834            pop();
835        }
836    }
837
838    void closeElement(String name) {
839        generateImpliedEndTags(name);
840        if (!name.equals(currentElement().normalName())) error(state());
841        popStackToClose(name);
842    }
843
844    static boolean isSpecial(Element el) {
845        String namespace = el.tag().namespace();
846        String name = el.normalName();
847        switch (namespace) {
848            case NamespaceHtml:
849                return inSorted(name, TagSearchSpecial);
850            case Parser.NamespaceMathml:
851                return inSorted(name, TagSearchSpecialMath);
852            case Parser.NamespaceSvg:
853                return inSorted(name, TagSvgHtmlIntegration);
854            default:
855                return false;
856        }
857    }
858
859    Element lastFormattingElement() {
860        return formattingElements.size() > 0 ? formattingElements.get(formattingElements.size()-1) : null;
861    }
862
863    int positionOfElement(Element el){
864        for (int i = 0; i < formattingElements.size(); i++){
865            if (el == formattingElements.get(i))
866                return i;
867        }
868        return -1;
869    }
870
871    Element removeLastFormattingElement() {
872        int size = formattingElements.size();
873        if (size > 0)
874            return formattingElements.remove(size-1);
875        else
876            return null;
877    }
878
879    // active formatting elements
880    void pushActiveFormattingElements(Element in) {
881        checkActiveFormattingElements(in);
882        formattingElements.add(in);
883    }
884
885    void pushWithBookmark(Element in, int bookmark){
886        checkActiveFormattingElements(in);
887        // catch any range errors and assume bookmark is incorrect - saves a redundant range check.
888        try {
889            formattingElements.add(bookmark, in);
890        } catch (IndexOutOfBoundsException e) {
891            formattingElements.add(in);
892        }
893    }
894
895    void checkActiveFormattingElements(Element in){
896        int numSeen = 0;
897        final int size = formattingElements.size() -1;
898        int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0;
899
900        for (int pos = size; pos >= ceil; pos--) {
901            Element el = formattingElements.get(pos);
902            if (el == null) // marker
903                break;
904
905            if (isSameFormattingElement(in, el))
906                numSeen++;
907
908            if (numSeen == 3) {
909                formattingElements.remove(pos);
910                break;
911            }
912        }
913    }
914
915    private static boolean isSameFormattingElement(Element a, Element b) {
916        // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children
917        return a.normalName().equals(b.normalName()) &&
918                // a.namespace().equals(b.namespace()) &&
919                a.attributes().equals(b.attributes());
920        // todo: namespaces
921    }
922
    /**
     Reconstructs the active formatting elements: entries at the tail of the formatting elements list that are not
     on the stack are re-created as new elements, inserted, and swapped into the list in place of the originals.
     <p>Does nothing if the stack is deeper than maxQueueDepth, or if the last formatting entry is null (an empty
     list or a marker) or is already on the stack.</p>
     <p>The step numbers in the comments below presumably refer to the HTML specification's "reconstruct the
     active formatting elements" algorithm.</p>
     */
    void reconstructFormattingElements() {
        if (stack.size() > maxQueueDepth)
            return;
        Element last = lastFormattingElement();
        if (last == null || onStack(last))
            return;

        Element entry = last;
        int size = formattingElements.size();
        // ceil is the lowest index the rewind may reach, which bounds how many entries get recreated
        int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0;
        int pos = size - 1;
        boolean skip = false;
        // rewind: walk back from the last entry until a marker, an entry that is on the stack, or ceil is hit
        while (true) {
            if (pos == ceil) { // step 4. if none before, skip to 8
                skip = true;
                break;
            }
            entry = formattingElements.get(--pos); // step 5. one earlier than entry
            if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack
                break; // jump to 8, else continue back to 4
        }
        // advance: walk forward again to the end of the list, recreating each entry along the way
        while(true) {
            if (!skip) // step 7: one later than entry
                entry = formattingElements.get(++pos);
            Validate.notNull(entry); // should not occur, as we break at last element

            // 8. create new element from element, 9 insert into current node, onto stack
            skip = false; // can only skip increment from 4.
            // the new element gets a clone of the entry's attributes, rather than sharing them
            Element newEl = new Element(tagFor(entry.nodeName(), entry.normalName(), defaultNamespace(), settings), null, entry.attributes().clone());
            doInsertElement(newEl, null);

            // 10. replace entry with new entry
            formattingElements.set(pos, newEl);

            // 11
            if (pos == size-1) // if not last entry in list, jump to 7
                break;
        }
    }
    // also bounds how far back checkActiveFormattingElements scans
    private static final int maxUsedFormattingElements = 12; // limit how many elements get recreated
963
964    void clearFormattingElementsToLastMarker() {
965        while (!formattingElements.isEmpty()) {
966            Element el = removeLastFormattingElement();
967            if (el == null)
968                break;
969        }
970    }
971
972    void removeFromActiveFormattingElements(Element el) {
973        for (int pos = formattingElements.size() -1; pos >= 0; pos--) {
974            Element next = formattingElements.get(pos);
975            if (next == el) {
976                formattingElements.remove(pos);
977                break;
978            }
979        }
980    }
981
982    boolean isInActiveFormattingElements(Element el) {
983        return onStack(formattingElements, el);
984    }
985
986    @Nullable
987    Element getActiveFormattingElement(String nodeName) {
988        for (int pos = formattingElements.size() -1; pos >= 0; pos--) {
989            Element next = formattingElements.get(pos);
990            if (next == null) // scope marker
991                break;
992            else if (next.nameIs(nodeName))
993                return next;
994        }
995        return null;
996    }
997
998    void replaceActiveFormattingElement(Element out, Element in) {
999        replaceInQueue(formattingElements, out, in);
1000    }
1001
1002    void insertMarkerToFormattingElements() {
1003        formattingElements.add(null);
1004    }
1005
1006    void insertInFosterParent(Node in) {
1007        Element fosterParent;
1008        Element lastTable = getFromStack("table");
1009        boolean isLastTableParent = false;
1010        if (lastTable != null) {
1011            if (lastTable.parent() != null) {
1012                fosterParent = lastTable.parent();
1013                isLastTableParent = true;
1014            } else
1015                fosterParent = aboveOnStack(lastTable);
1016        } else { // no table == frag
1017            fosterParent = stack.get(0);
1018        }
1019
1020        if (isLastTableParent) {
1021            Validate.notNull(lastTable); // last table cannot be null by this point.
1022            lastTable.before(in);
1023        }
1024        else
1025            fosterParent.appendChild(in);
1026    }
1027
1028    // Template Insertion Mode stack
1029    void pushTemplateMode(HtmlTreeBuilderState state) {
1030        tmplInsertMode.add(state);
1031    }
1032
1033    @Nullable HtmlTreeBuilderState popTemplateMode() {
1034        if (tmplInsertMode.size() > 0) {
1035            return tmplInsertMode.remove(tmplInsertMode.size() -1);
1036        } else {
1037            return null;
1038        }
1039    }
1040
1041    int templateModeSize() {
1042        return tmplInsertMode.size();
1043    }
1044
1045    @Nullable HtmlTreeBuilderState currentTemplateMode() {
1046        return (tmplInsertMode.size() > 0) ? tmplInsertMode.get(tmplInsertMode.size() -1)  : null;
1047    }
1048
1049    @Override
1050    public String toString() {
1051        return "TreeBuilder{" +
1052                "currentToken=" + currentToken +
1053                ", state=" + state +
1054                ", currentElement=" + currentElement() +
1055                '}';
1056    }
1057
1058    @Override protected boolean isContentForTagData(final String normalName) {
1059        return (normalName.equals("script") || normalName.equals("style"));
1060    }
1061}