001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.SharedConstants;
005import org.jspecify.annotations.Nullable;
006
007import java.util.ArrayList;
008import java.util.HashMap;
009import java.util.Map;
010import java.util.Objects;
011import java.util.function.Consumer;
012
013import static org.jsoup.parser.Parser.NamespaceHtml;
014import static org.jsoup.parser.Parser.NamespaceMathml;
015import static org.jsoup.parser.Parser.NamespaceSvg;
016
017/**
018 A TagSet controls the {@link Tag} configuration for a Document's parse, and its serialization. It contains the initial
019 defaults, and after the parse, any additionally discovered tags.
020
021 @see Parser#tagSet(TagSet)
022 @since 1.20.1
023 */
024public class TagSet {
025    static final TagSet HtmlTagSet = initHtmlDefault();
026
027    private final Map<String, Map<String, Tag>> tags = new HashMap<>(); // namespace -> tag name -> Tag
028    private final @Nullable TagSet source; // source to pull tags from on demand
029    private @Nullable ArrayList<Consumer<Tag>> customizers; // optional onNewTag tag customizer
030
031    /**
032     Returns a mutable copy of the default HTML tag set.
033     */
034    public static TagSet Html() {
035        return new TagSet(HtmlTagSet);
036    }
037
038    public TagSet() {
039        source = null;
040    }
041
042    public TagSet(TagSet original) {
043        this.source = original;
044    }
045
046    /**
047     Insert a tag into this TagSet. If the tag already exists, it is replaced.
048     <p>Tags explicitly added like this are considered to be known tags (vs those that are dynamically created via
049     .valueOf() if not already in the set.</p>
050
051     @param tag the tag to add
052     @return this TagSet
053     */
054    public TagSet add(Tag tag) {
055        tag.set(Tag.Known);
056        doAdd(tag);
057        return this;
058    }
059
060    /** Adds the tag, but does not set defined. Used in .valueOf */
061    private void doAdd(Tag tag) {
062        if (customizers != null) {
063            for (Consumer<Tag> customizer : customizers) {
064                customizer.accept(tag);
065            }
066        }
067
068        tags.computeIfAbsent(tag.namespace, ns -> new HashMap<>())
069            .put(tag.tagName, tag);
070    }
071
072    /**
073     Get an existing Tag from this TagSet by tagName and namespace. The tag name is not normalized, to support mixed
074     instances.
075
076     @param tagName the case-sensitive tag name
077     @param namespace the namespace
078     @return the tag, or null if not found
079     */
080    public @Nullable Tag get(String tagName, String namespace) {
081        Validate.notNull(tagName);
082        Validate.notNull(namespace);
083
084        // get from our tags
085        Map<String, Tag> nsTags = tags.get(namespace);
086        if (nsTags != null) {
087            Tag tag = nsTags.get(tagName);
088            if (tag != null) {
089                return tag;
090            }
091        }
092
093        // not found; clone on demand from source if exists
094        if (source != null) {
095            Tag tag = source.get(tagName, namespace);
096            if (tag != null) {
097                Tag copy = tag.clone();
098                doAdd(copy);
099                return copy;
100            }
101        }
102
103        return null;
104    }
105
106    /** Tag.valueOf with the normalName via the token.normalName, to save redundant lower-casing passes. */
107    Tag valueOf(String tagName, String normalName, String namespace, boolean preserveTagCase) {
108        Validate.notNull(tagName);
109        Validate.notNull(namespace);
110        tagName = tagName.trim();
111        Validate.notEmpty(tagName);
112        Tag tag = get(tagName, namespace);
113        if (tag != null) return tag;
114
115        // not found by tagName, try by normal
116        tagName = preserveTagCase ? tagName : normalName;
117        tag = get(normalName, namespace);
118        if (tag != null) {
119            if (preserveTagCase && !tagName.equals(normalName)) {
120                tag = tag.clone(); // copy so that the name update doesn't reset all instances
121                tag.tagName = tagName;
122                doAdd(tag);
123            }
124            return tag;
125        }
126
127        // not defined: return a new one
128        tag = new Tag(tagName, normalName, namespace);
129        doAdd(tag);
130
131        return tag;
132    }
133
134    /**
135     Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag.
136     <p>New tags will be added to this TagSet.</p>
137
138     @param tagName Name of tag, e.g. "p".
139     @param namespace the namespace for the tag.
140     @param settings used to control tag name sensitivity
141     @return The tag, either defined or new generic.
142     */
143    public Tag valueOf(String tagName, String namespace, ParseSettings settings) {
144        return valueOf(tagName, ParseSettings.normalName(tagName), namespace, settings.preserveTagCase());
145    }
146
147    /**
148     Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag.
149     <p>New tags will be added to this TagSet.</p>
150
151     @param tagName Name of tag, e.g. "p". <b>Case-sensitive</b>.
152     @param namespace the namespace for the tag.
153     @return The tag, either defined or new generic.
154     @see #valueOf(String tagName, String namespace, ParseSettings settings)
155     */
156    public Tag valueOf(String tagName, String namespace) {
157        return valueOf(tagName, namespace, ParseSettings.preserveCase);
158    }
159
160    /**
161     Register a callback to customize each {@link Tag} as it's added to this TagSet.
162     <p>Customizers are invoked once per Tag, when they are added (explicitly or via the valueOf methods).</p>
163
164     <p>For example, to allow all unknown tags to be self-closing during when parsing as HTML:</p>
165     <pre><code>
166     Parser parser = Parser.htmlParser();
167     parser.tagSet().onNewTag(tag -> {
168     if (!tag.isKnownTag())
169        tag.set(Tag.SelfClose);
170     });
171
172     Document doc = Jsoup.parse(html, parser);
173     </code></pre>
174
175     @param customizer a {@code Consumer<Tag>} that will be called for each newly added or cloned Tag; callers can
176     inspect and modify the Tag's state (e.g. set options)
177     @return this TagSet, to allow method chaining
178     @since 1.21.0
179     */
180    public TagSet onNewTag(Consumer<Tag> customizer) {
181        Validate.notNull(customizer);
182        if (customizers == null)
183            customizers = new ArrayList<>();
184        customizers.add(customizer);
185        return this;
186    }
187
188    @Override
189    public boolean equals(Object o) {
190        if (!(o instanceof TagSet)) return false;
191        TagSet tagSet = (TagSet) o;
192        return Objects.equals(tags, tagSet.tags);
193    }
194
195    @Override
196    public int hashCode() {
197        return Objects.hashCode(tags);
198    }
199
200    // Default HTML initialization
201
202    /**
203     Initialize the default HTML tag set.
204     */
205    static TagSet initHtmlDefault() {
206        String[] blockTags = {
207            "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame",
208            "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5",
209            "h6", "br", "button",
210            "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
211            "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
212            "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main",
213            "center", "template",
214            "dir", "applet", "marquee", "listing", // deprecated but still known / special handling
215            "#root" // the outer Document
216        };
217        String[] inlineTags = {
218            "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd",
219            "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "wbr", "map",
220            "q",
221            "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "optgroup",
222            "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
223            "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track",
224            "data", "bdi", "s", "strike", "nobr",
225            "rb", // deprecated but still known / special handling
226        };
227        String[] inlineContainers = { // can only contain inline; aka phrasing content
228            "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style",
229            "ins", "del", "s", "button"
230        };
231        String[] voidTags = {
232            "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",
233            "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track"
234        };
235        String[] preserveWhitespaceTags = {
236            "pre", "plaintext", "title", "textarea", "script"
237        };
238        String[] rcdataTags = { "title", "textarea" };
239        String[] dataTags = { "iframe", "noembed", "noframes", "script", "style", "xmp" };
240        String[] formSubmitTags = SharedConstants.FormSubmitTags;
241        String[] blockMathTags = {"math"};
242        String[] inlineMathTags = {"mi", "mo", "msup", "mn", "mtext"};
243        String[] blockSvgTags = {"svg", "femerge", "femergenode"}; // note these are LC versions, but actually preserve case
244        String[] inlineSvgTags = {"text"};
245        String[] dataSvgTags = {"script"};
246
247        return new TagSet()
248            .setupTags(NamespaceHtml, blockTags, tag -> tag.set(Tag.Block))
249            .setupTags(NamespaceHtml, inlineTags, tag -> tag.set(0))
250            .setupTags(NamespaceHtml, inlineContainers, tag -> tag.set(Tag.InlineContainer))
251            .setupTags(NamespaceHtml, voidTags, tag -> tag.set(Tag.Void))
252            .setupTags(NamespaceHtml, preserveWhitespaceTags, tag -> tag.set(Tag.PreserveWhitespace))
253            .setupTags(NamespaceHtml, rcdataTags, tag -> tag.set(Tag.RcData))
254            .setupTags(NamespaceHtml, dataTags, tag -> tag.set(Tag.Data))
255            .setupTags(NamespaceHtml, formSubmitTags, tag -> tag.set(Tag.FormSubmittable))
256            .setupTags(NamespaceMathml, blockMathTags, tag -> tag.set(Tag.Block))
257            .setupTags(NamespaceMathml, inlineMathTags, tag -> tag.set(0))
258            .setupTags(NamespaceSvg, blockSvgTags, tag -> tag.set(Tag.Block))
259            .setupTags(NamespaceSvg, inlineSvgTags, tag -> tag.set(0))
260            .setupTags(NamespaceSvg, dataSvgTags, tag -> tag.set(Tag.Data))
261            ;
262    }
263
264    private TagSet setupTags(String namespace, String[] tagNames, Consumer<Tag> tagModifier) {
265        for (String tagName : tagNames) {
266            Tag tag = get(tagName, namespace);
267            if (tag == null) {
268                tag = new Tag(tagName, tagName, namespace); // normal name is already normal here
269                tag.options = 0; // clear defaults
270                add(tag);
271            }
272            tagModifier.accept(tag);
273        }
274        return this;
275    }
276}