001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.SharedConstants;
005import org.jspecify.annotations.Nullable;
006
007import java.util.ArrayList;
008import java.util.HashMap;
009import java.util.Map;
010import java.util.Objects;
011import java.util.function.Consumer;
012
013import static org.jsoup.parser.Parser.NamespaceHtml;
014import static org.jsoup.parser.Parser.NamespaceMathml;
015import static org.jsoup.parser.Parser.NamespaceSvg;
016
017/**
018 A TagSet controls the {@link Tag} configuration for a Document's parse, and its serialization. It contains the initial
019 defaults, and after the parse, any additionally discovered tags.
020
021 @see Parser#tagSet(TagSet)
022 @since 1.20.1
023 */
024public class TagSet {
025    static final TagSet HtmlTagSet = initHtmlDefault();
026
027    private final Map<String, Map<String, Tag>> tags = new HashMap<>(); // namespace -> tag name -> Tag
028    private final @Nullable TagSet source; // source to pull tags from on demand
029    private @Nullable ArrayList<Consumer<Tag>> customizers; // optional onNewTag tag customizer
030
031    /**
032     Returns a mutable copy of the default HTML tag set.
033     */
034    public static TagSet Html() {
035        return new TagSet(HtmlTagSet);
036    }
037
038    public TagSet() {
039        source = null;
040    }
041
042    public TagSet(TagSet original) {
043        this.source = original;
044    }
045
046    /**
047     Insert a tag into this TagSet. If the tag already exists, it is replaced.
048     <p>Tags explicitly added like this are considered to be known tags (vs those that are dynamically created via
049     .valueOf() if not already in the set.</p>
050
051     @param tag the tag to add
052     @return this TagSet
053     */
054    public TagSet add(Tag tag) {
055        tag.set(Tag.Known);
056        doAdd(tag);
057        return this;
058    }
059
060    /** Adds the tag, but does not set defined. Used in .valueOf */
061    private void doAdd(Tag tag) {
062        if (customizers != null) {
063            for (Consumer<Tag> customizer : customizers) {
064                customizer.accept(tag);
065            }
066        }
067
068        tags.computeIfAbsent(tag.namespace, ns -> new HashMap<>())
069            .put(tag.tagName, tag);
070    }
071
072    /**
073     Get an existing Tag from this TagSet by tagName and namespace. The tag name is not normalized, to support mixed
074     instances.
075
076     @param tagName the case-sensitive tag name
077     @param namespace the namespace
078     @return the tag, or null if not found
079     */
080    public @Nullable Tag get(String tagName, String namespace) {
081        Validate.notNull(tagName);
082        Validate.notNull(namespace);
083
084        // get from our tags
085        Map<String, Tag> nsTags = tags.get(namespace);
086        if (nsTags != null) {
087            Tag tag = nsTags.get(tagName);
088            if (tag != null) {
089                return tag;
090            }
091        }
092
093        // not found; clone on demand from source if exists
094        if (source != null) {
095            Tag tag = source.get(tagName, namespace);
096            if (tag != null) {
097                Tag copy = tag.clone();
098                doAdd(copy);
099                return copy;
100            }
101        }
102
103        return null;
104    }
105
106    /**
107     Tag.valueOf with the normalName via the token.normalName, to save redundant lower-casing passes.
108     Provide a null normalName unless we already have one; will be normalized if required from tagName.
109     */
110    Tag valueOf(String tagName, @Nullable String normalName, String namespace, boolean preserveTagCase) {
111        Validate.notNull(tagName);
112        Validate.notNull(namespace);
113        tagName = tagName.trim();
114        Validate.notEmpty(tagName);
115        Tag tag = get(tagName, namespace);
116        if (tag != null) return tag;
117
118        // not found by tagName, try by normal
119        if (normalName == null) normalName = ParseSettings.normalName(tagName);
120        tagName = preserveTagCase ? tagName : normalName;
121        tag = get(normalName, namespace);
122        if (tag != null) {
123            if (preserveTagCase && !tagName.equals(normalName)) {
124                tag = tag.clone(); // copy so that the name update doesn't reset all instances
125                tag.tagName = tagName;
126                doAdd(tag);
127            }
128            return tag;
129        }
130
131        // not defined: return a new one
132        tag = new Tag(tagName, normalName, namespace);
133        doAdd(tag);
134
135        return tag;
136    }
137
138    /**
139     Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag.
140     <p>New tags will be added to this TagSet.</p>
141
142     @param tagName Name of tag, e.g. "p".
143     @param namespace the namespace for the tag.
144     @param settings used to control tag name sensitivity
145     @return The tag, either defined or new generic.
146     */
147    public Tag valueOf(String tagName, String namespace, ParseSettings settings) {
148        return valueOf(tagName, null, namespace, settings.preserveTagCase());
149    }
150
151    /**
152     Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag.
153     <p>New tags will be added to this TagSet.</p>
154
155     @param tagName Name of tag, e.g. "p". <b>Case-sensitive</b>.
156     @param namespace the namespace for the tag.
157     @return The tag, either defined or new generic.
158     @see #valueOf(String tagName, String namespace, ParseSettings settings)
159     */
160    public Tag valueOf(String tagName, String namespace) {
161        return valueOf(tagName, namespace, ParseSettings.preserveCase);
162    }
163
164    /**
165     Register a callback to customize each {@link Tag} as it's added to this TagSet.
166     <p>Customizers are invoked once per Tag, when they are added (explicitly or via the valueOf methods).</p>
167
168     <p>For example, to allow all unknown tags to be self-closing during when parsing as HTML:</p>
169     <pre><code>
170     Parser parser = Parser.htmlParser();
171     parser.tagSet().onNewTag(tag -> {
172     if (!tag.isKnownTag())
173        tag.set(Tag.SelfClose);
174     });
175
176     Document doc = Jsoup.parse(html, parser);
177     </code></pre>
178
179     @param customizer a {@code Consumer<Tag>} that will be called for each newly added or cloned Tag; callers can
180     inspect and modify the Tag's state (e.g. set options)
181     @return this TagSet, to allow method chaining
182     @since 1.21.0
183     */
184    public TagSet onNewTag(Consumer<Tag> customizer) {
185        Validate.notNull(customizer);
186        if (customizers == null)
187            customizers = new ArrayList<>();
188        customizers.add(customizer);
189        return this;
190    }
191
192    @Override
193    public boolean equals(Object o) {
194        if (!(o instanceof TagSet)) return false;
195        TagSet tagSet = (TagSet) o;
196        return Objects.equals(tags, tagSet.tags);
197    }
198
199    @Override
200    public int hashCode() {
201        return Objects.hashCode(tags);
202    }
203
204    // Default HTML initialization
205
206    /**
207     Initialize the default HTML tag set.
208     */
209    static TagSet initHtmlDefault() {
210        String[] blockTags = {
211            "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame",
212            "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5",
213            "h6", "br", "button",
214            "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
215            "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
216            "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main",
217            "center", "template",
218            "dir", "applet", "marquee", "listing", // deprecated but still known / special handling
219            "#root" // the outer Document
220        };
221        String[] inlineTags = {
222            "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd",
223            "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "wbr", "map",
224            "q",
225            "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "optgroup",
226            "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
227            "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track",
228            "data", "bdi", "s", "strike", "nobr",
229            "rb", // deprecated but still known / special handling
230        };
231        String[] inlineContainers = { // can only contain inline; aka phrasing content
232            "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style",
233            "ins", "del", "s", "button"
234        };
235        String[] voidTags = {
236            "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",
237            "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track"
238        };
239        String[] preserveWhitespaceTags = {
240            "pre", "plaintext", "title", "textarea", "script"
241        };
242        String[] rcdataTags = { "title", "textarea" };
243        String[] dataTags = { "iframe", "noembed", "noframes", "script", "style", "xmp" };
244        String[] formSubmitTags = SharedConstants.FormSubmitTags;
245        String[] blockMathTags = {"math"};
246        String[] inlineMathTags = {"mi", "mo", "msup", "mn", "mtext"};
247        String[] blockSvgTags = {"svg", "femerge", "femergenode"}; // note these are LC versions, but actually preserve case
248        String[] inlineSvgTags = {"text"};
249        String[] dataSvgTags = {"script"};
250
251        return new TagSet()
252            .setupTags(NamespaceHtml, blockTags, tag -> tag.set(Tag.Block))
253            .setupTags(NamespaceHtml, inlineTags, tag -> tag.set(0))
254            .setupTags(NamespaceHtml, inlineContainers, tag -> tag.set(Tag.InlineContainer))
255            .setupTags(NamespaceHtml, voidTags, tag -> tag.set(Tag.Void))
256            .setupTags(NamespaceHtml, preserveWhitespaceTags, tag -> tag.set(Tag.PreserveWhitespace))
257            .setupTags(NamespaceHtml, rcdataTags, tag -> tag.set(Tag.RcData))
258            .setupTags(NamespaceHtml, dataTags, tag -> tag.set(Tag.Data))
259            .setupTags(NamespaceHtml, formSubmitTags, tag -> tag.set(Tag.FormSubmittable))
260            .setupTags(NamespaceMathml, blockMathTags, tag -> tag.set(Tag.Block))
261            .setupTags(NamespaceMathml, inlineMathTags, tag -> tag.set(0))
262            .setupTags(NamespaceSvg, blockSvgTags, tag -> tag.set(Tag.Block))
263            .setupTags(NamespaceSvg, inlineSvgTags, tag -> tag.set(0))
264            .setupTags(NamespaceSvg, dataSvgTags, tag -> tag.set(Tag.Data))
265            ;
266    }
267
268    private TagSet setupTags(String namespace, String[] tagNames, Consumer<Tag> tagModifier) {
269        for (String tagName : tagNames) {
270            Tag tag = get(tagName, namespace);
271            if (tag == null) {
272                tag = new Tag(tagName, tagName, namespace); // normal name is already normal here
273                tag.options = 0; // clear defaults
274                add(tag);
275            }
276            tagModifier.accept(tag);
277        }
278        return this;
279    }
280}