001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.SharedConstants;
005import org.jspecify.annotations.Nullable;
006
007import java.util.HashMap;
008import java.util.Map;
009import java.util.Objects;
010import java.util.function.Consumer;
011
012import static org.jsoup.parser.Parser.NamespaceHtml;
013import static org.jsoup.parser.Parser.NamespaceMathml;
014import static org.jsoup.parser.Parser.NamespaceSvg;
015
016/**
017 A TagSet controls the {@link Tag} configuration for a Document's parse, and its serialization. It contains the initial
018 defaults, and after the parse, any additionally discovered tags.
019
020 @see Parser#tagSet(TagSet)
021 @since 1.20.1
022 */
023public class TagSet {
024    static final TagSet HtmlTagSet = initHtmlDefault();
025
026    final Map<String, Map<String, Tag>> tags = new HashMap<>(); // namespace -> tag name -> Tag
027    final @Nullable TagSet source; // source to pull tags from on demand
028
029    /**
030     Returns a mutable copy of the default HTML tag set.
031     */
032    public static TagSet Html() {
033        return new TagSet(HtmlTagSet);
034    }
035
036    public TagSet() {
037        source = null;
038    }
039
040    public TagSet(TagSet original) {
041        this.source = original;
042    }
043
044    /**
045     Insert a tag into this TagSet. If the tag already exists, it is replaced.
046     <p>Tags explicitly added like this are considered to be known tags (vs those that are dynamically created via
047     .valueOf() if not already in the set.</p>
048
049     @param tag the tag to add
050     @return this TagSet
051     */
052    public TagSet add(Tag tag) {
053        tag.set(Tag.Known);
054        doAdd(tag);
055        return this;
056    }
057
058    /** Adds the tag, but does not set defined. Used in .valueOf */
059    private void doAdd(Tag tag) {
060        tags.computeIfAbsent(tag.namespace, ns -> new HashMap<>())
061            .put(tag.tagName, tag);
062    }
063
064    /**
065     Get an existing Tag from this TagSet by tagName and namespace. The tag name is not normalized, to support mixed
066     instances.
067
068     @param tagName the case-sensitive tag name
069     @param namespace the namespace
070     @return the tag, or null if not found
071     */
072    public @Nullable Tag get(String tagName, String namespace) {
073        Validate.notNull(tagName);
074        Validate.notNull(namespace);
075
076        // get from our tags
077        Map<String, Tag> nsTags = tags.get(namespace);
078        if (nsTags != null) {
079            Tag tag = nsTags.get(tagName);
080            if (tag != null) {
081                return tag;
082            }
083        }
084
085        // not found; clone on demand from source if exists
086        if (source != null) {
087            Tag tag = source.get(tagName, namespace);
088            if (tag != null) {
089                Tag copy = tag.clone();
090                doAdd(copy);
091                return copy;
092            }
093        }
094
095        return null;
096    }
097
098    /** Tag.valueOf with the normalName via the token.normalName, to save redundant lower-casing passes. */
099    Tag valueOf(String tagName, String normalName, String namespace, boolean preserveTagCase) {
100        Validate.notNull(tagName);
101        Validate.notNull(namespace);
102        tagName = tagName.trim();
103        Validate.notEmpty(tagName);
104        Tag tag = get(tagName, namespace);
105        if (tag != null) return tag;
106
107        // not found by tagName, try by normal
108        tagName = preserveTagCase ? tagName : normalName;
109        tag = get(normalName, namespace);
110        if (tag != null) {
111            if (preserveTagCase && !tagName.equals(normalName)) {
112                tag = tag.clone(); // copy so that the name update doesn't reset all instances
113                tag.tagName = tagName;
114                doAdd(tag);
115            }
116            return tag;
117        }
118
119        // not defined: return a new one
120        tag = new Tag(tagName, normalName, namespace);
121        doAdd(tag);
122
123        return tag;
124    }
125
126    /**
127     Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag.
128     <p>New tags will be added to this TagSet.</p>
129
130     @param tagName Name of tag, e.g. "p".
131     @param namespace the namespace for the tag.
132     @param settings used to control tag name sensitivity
133     @return The tag, either defined or new generic.
134     */
135    public Tag valueOf(String tagName, String namespace, ParseSettings settings) {
136        return valueOf(tagName, ParseSettings.normalName(tagName), namespace, settings.preserveTagCase());
137    }
138
139    /**
140     Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag.
141     <p>New tags will be added to this TagSet.</p>
142
143     @param tagName Name of tag, e.g. "p". <b>Case-sensitive</b>.
144     @param namespace the namespace for the tag.
145     @return The tag, either defined or new generic.
146     @see #valueOf(String tagName, String namespace, ParseSettings settings)
147     */
148    public Tag valueOf(String tagName, String namespace) {
149        return valueOf(tagName, namespace, ParseSettings.preserveCase);
150    }
151
152    @Override
153    public boolean equals(Object o) {
154        if (!(o instanceof TagSet)) return false;
155        TagSet tagSet = (TagSet) o;
156        return Objects.equals(tags, tagSet.tags);
157    }
158
159    @Override
160    public int hashCode() {
161        return Objects.hashCode(tags);
162    }
163
164    // Default HTML initialization
165
166    /**
167     Initialize the default HTML tag set.
168     */
169    static TagSet initHtmlDefault() {
170        String[] blockTags = {
171            "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame",
172            "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5",
173            "h6", "br", "button",
174            "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
175            "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
176            "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main",
177            "center", "template",
178            "dir", "applet", "marquee", "listing", // deprecated but still known / special handling
179            "#root" // the outer Document
180        };
181        String[] inlineTags = {
182            "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd",
183            "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "wbr", "map",
184            "q",
185            "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "optgroup",
186            "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
187            "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track",
188            "data", "bdi", "s", "strike", "nobr",
189            "rb", // deprecated but still known / special handling
190        };
191        String[] inlineContainers = { // can only contain inline; aka phrasing content
192            "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style",
193            "ins", "del", "s", "button"
194        };
195        String[] voidTags = {
196            "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",
197            "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track"
198        };
199        String[] preserveWhitespaceTags = {
200            "pre", "plaintext", "title", "textarea", "script"
201        };
202        String[] rcdataTags = { "title", "textarea" };
203        String[] dataTags = { "iframe", "noembed", "noframes", "script", "style", "xmp" };
204        String[] formSubmitTags = SharedConstants.FormSubmitTags;
205        String[] blockMathTags = {"math"};
206        String[] inlineMathTags = {"mi", "mo", "msup", "mn", "mtext"};
207        String[] blockSvgTags = {"svg", "femerge", "femergenode"}; // note these are LC versions, but actually preserve case
208        String[] inlineSvgTags = {"text"};
209
210        return new TagSet()
211            .setupTags(NamespaceHtml, blockTags, tag -> tag.set(Tag.Block))
212            .setupTags(NamespaceHtml, inlineTags, tag -> tag.set(0))
213            .setupTags(NamespaceHtml, inlineContainers, tag -> tag.set(Tag.InlineContainer))
214            .setupTags(NamespaceHtml, voidTags, tag -> tag.set(Tag.Void))
215            .setupTags(NamespaceHtml, preserveWhitespaceTags, tag -> tag.set(Tag.PreserveWhitespace))
216            .setupTags(NamespaceHtml, rcdataTags, tag -> tag.set(Tag.RcData))
217            .setupTags(NamespaceHtml, dataTags, tag -> tag.set(Tag.Data))
218            .setupTags(NamespaceHtml, formSubmitTags, tag -> tag.set(Tag.FormSubmittable))
219            .setupTags(NamespaceMathml, blockMathTags, tag -> tag.set(Tag.Block))
220            .setupTags(NamespaceMathml, inlineMathTags, tag -> tag.set(0))
221            .setupTags(NamespaceSvg, blockSvgTags, tag -> tag.set(Tag.Block))
222            .setupTags(NamespaceSvg, inlineSvgTags, tag -> tag.set(0))
223            ;
224    }
225
226    private TagSet setupTags(String namespace, String[] tagNames, Consumer<Tag> tagModifier) {
227        for (String tagName : tagNames) {
228            Tag tag = get(tagName, namespace);
229            if (tag == null) {
230                tag = new Tag(tagName, tagName, namespace); // normal name is already normal here
231                tag.options = 0; // clear defaults
232                add(tag);
233            }
234            tagModifier.accept(tag);
235        }
236        return this;
237    }
238}