001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.SharedConstants; 005import org.jspecify.annotations.Nullable; 006 007import java.util.ArrayList; 008import java.util.HashMap; 009import java.util.Map; 010import java.util.Objects; 011import java.util.function.Consumer; 012 013import static org.jsoup.parser.Parser.NamespaceHtml; 014import static org.jsoup.parser.Parser.NamespaceMathml; 015import static org.jsoup.parser.Parser.NamespaceSvg; 016 017/** 018 A TagSet controls the {@link Tag} configuration for a Document's parse, and its serialization. It contains the initial 019 defaults, and after the parse, any additionally discovered tags. 020 021 @see Parser#tagSet(TagSet) 022 @since 1.20.1 023 */ 024public class TagSet { 025 static final TagSet HtmlTagSet = initHtmlDefault(); 026 027 private final Map<String, Map<String, Tag>> tags = new HashMap<>(); // namespace -> tag name -> Tag 028 private final @Nullable TagSet source; // source to pull tags from on demand 029 private @Nullable ArrayList<Consumer<Tag>> customizers; // optional onNewTag tag customizer 030 031 /** 032 Returns a mutable copy of the default HTML tag set. 033 */ 034 public static TagSet Html() { 035 return new TagSet(HtmlTagSet); 036 } 037 038 public TagSet() { 039 source = null; 040 } 041 042 public TagSet(TagSet original) { 043 this.source = original; 044 } 045 046 /** 047 Insert a tag into this TagSet. If the tag already exists, it is replaced. 048 <p>Tags explicitly added like this are considered to be known tags (vs those that are dynamically created via 049 .valueOf() if not already in the set.</p> 050 051 @param tag the tag to add 052 @return this TagSet 053 */ 054 public TagSet add(Tag tag) { 055 tag.set(Tag.Known); 056 doAdd(tag); 057 return this; 058 } 059 060 /** Adds the tag, but does not set defined. Used in .valueOf */ 061 private void doAdd(Tag tag) { 062 if (customizers != null) { 063 for (Consumer<Tag> customizer : customizers) { 064 customizer.accept(tag); 065 } 066 } 067 068 tags.computeIfAbsent(tag.namespace, ns -> new HashMap<>()) 069 .put(tag.tagName, tag); 070 } 071 072 /** 073 Get an existing Tag from this TagSet by tagName and namespace. The tag name is not normalized, to support mixed 074 instances. 075 076 @param tagName the case-sensitive tag name 077 @param namespace the namespace 078 @return the tag, or null if not found 079 */ 080 public @Nullable Tag get(String tagName, String namespace) { 081 Validate.notNull(tagName); 082 Validate.notNull(namespace); 083 084 // get from our tags 085 Map<String, Tag> nsTags = tags.get(namespace); 086 if (nsTags != null) { 087 Tag tag = nsTags.get(tagName); 088 if (tag != null) { 089 return tag; 090 } 091 } 092 093 // not found; clone on demand from source if exists 094 if (source != null) { 095 Tag tag = source.get(tagName, namespace); 096 if (tag != null) { 097 Tag copy = tag.clone(); 098 doAdd(copy); 099 return copy; 100 } 101 } 102 103 return null; 104 } 105 106 /** Tag.valueOf with the normalName via the token.normalName, to save redundant lower-casing passes. */ 107 Tag valueOf(String tagName, String normalName, String namespace, boolean preserveTagCase) { 108 Validate.notNull(tagName); 109 Validate.notNull(namespace); 110 tagName = tagName.trim(); 111 Validate.notEmpty(tagName); 112 Tag tag = get(tagName, namespace); 113 if (tag != null) return tag; 114 115 // not found by tagName, try by normal 116 tagName = preserveTagCase ? tagName : normalName; 117 tag = get(normalName, namespace); 118 if (tag != null) { 119 if (preserveTagCase && !tagName.equals(normalName)) { 120 tag = tag.clone(); // copy so that the name update doesn't reset all instances 121 tag.tagName = tagName; 122 doAdd(tag); 123 } 124 return tag; 125 } 126 127 // not defined: return a new one 128 tag = new Tag(tagName, normalName, namespace); 129 doAdd(tag); 130 131 return tag; 132 } 133 134 /** 135 Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag. 136 <p>New tags will be added to this TagSet.</p> 137 138 @param tagName Name of tag, e.g. "p". 139 @param namespace the namespace for the tag. 140 @param settings used to control tag name sensitivity 141 @return The tag, either defined or new generic. 142 */ 143 public Tag valueOf(String tagName, String namespace, ParseSettings settings) { 144 return valueOf(tagName, ParseSettings.normalName(tagName), namespace, settings.preserveTagCase()); 145 } 146 147 /** 148 Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag. 149 <p>New tags will be added to this TagSet.</p> 150 151 @param tagName Name of tag, e.g. "p". <b>Case-sensitive</b>. 152 @param namespace the namespace for the tag. 153 @return The tag, either defined or new generic. 154 @see #valueOf(String tagName, String namespace, ParseSettings settings) 155 */ 156 public Tag valueOf(String tagName, String namespace) { 157 return valueOf(tagName, namespace, ParseSettings.preserveCase); 158 } 159 160 /** 161 Register a callback to customize each {@link Tag} as it's added to this TagSet. 162 <p>Customizers are invoked once per Tag, when they are added (explicitly or via the valueOf methods).</p> 163 164 <p>For example, to allow all unknown tags to be self-closing during when parsing as HTML:</p> 165 <pre><code> 166 Parser parser = Parser.htmlParser(); 167 parser.tagSet().onNewTag(tag -> { 168 if (!tag.isKnownTag()) 169 tag.set(Tag.SelfClose); 170 }); 171 172 Document doc = Jsoup.parse(html, parser); 173 </code></pre> 174 175 @param customizer a {@code Consumer<Tag>} that will be called for each newly added or cloned Tag; callers can 176 inspect and modify the Tag's state (e.g. set options) 177 @return this TagSet, to allow method chaining 178 @since 1.21.0 179 */ 180 public TagSet onNewTag(Consumer<Tag> customizer) { 181 Validate.notNull(customizer); 182 if (customizers == null) 183 customizers = new ArrayList<>(); 184 customizers.add(customizer); 185 return this; 186 } 187 188 @Override 189 public boolean equals(Object o) { 190 if (!(o instanceof TagSet)) return false; 191 TagSet tagSet = (TagSet) o; 192 return Objects.equals(tags, tagSet.tags); 193 } 194 195 @Override 196 public int hashCode() { 197 return Objects.hashCode(tags); 198 } 199 200 // Default HTML initialization 201 202 /** 203 Initialize the default HTML tag set. 204 */ 205 static TagSet initHtmlDefault() { 206 String[] blockTags = { 207 "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame", 208 "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", 209 "h6", "br", "button", 210 "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins", 211 "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th", 212 "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main", 213 "center", "template", 214 "dir", "applet", "marquee", "listing", // deprecated but still known / special handling 215 "#root" // the outer Document 216 }; 217 String[] inlineTags = { 218 "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd", 219 "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "wbr", "map", 220 "q", 221 "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "optgroup", 222 "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", 223 "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track", 224 "data", "bdi", "s", "strike", "nobr", 225 "rb", // deprecated but still known / special handling 226 }; 227 String[] inlineContainers = { // can only contain inline; aka phrasing content 228 "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style", 229 "ins", "del", "s", "button" 230 }; 231 String[] voidTags = { 232 "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", 233 "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track" 234 }; 235 String[] preserveWhitespaceTags = { 236 "pre", "plaintext", "title", "textarea", "script" 237 }; 238 String[] rcdataTags = { "title", "textarea" }; 239 String[] dataTags = { "iframe", "noembed", "noframes", "script", "style", "xmp" }; 240 String[] formSubmitTags = SharedConstants.FormSubmitTags; 241 String[] blockMathTags = {"math"}; 242 String[] inlineMathTags = {"mi", "mo", "msup", "mn", "mtext"}; 243 String[] blockSvgTags = {"svg", "femerge", "femergenode"}; // note these are LC versions, but actually preserve case 244 String[] inlineSvgTags = {"text"}; 245 String[] dataSvgTags = {"script"}; 246 247 return new TagSet() 248 .setupTags(NamespaceHtml, blockTags, tag -> tag.set(Tag.Block)) 249 .setupTags(NamespaceHtml, inlineTags, tag -> tag.set(0)) 250 .setupTags(NamespaceHtml, inlineContainers, tag -> tag.set(Tag.InlineContainer)) 251 .setupTags(NamespaceHtml, voidTags, tag -> tag.set(Tag.Void)) 252 .setupTags(NamespaceHtml, preserveWhitespaceTags, tag -> tag.set(Tag.PreserveWhitespace)) 253 .setupTags(NamespaceHtml, rcdataTags, tag -> tag.set(Tag.RcData)) 254 .setupTags(NamespaceHtml, dataTags, tag -> tag.set(Tag.Data)) 255 .setupTags(NamespaceHtml, formSubmitTags, tag -> tag.set(Tag.FormSubmittable)) 256 .setupTags(NamespaceMathml, blockMathTags, tag -> tag.set(Tag.Block)) 257 .setupTags(NamespaceMathml, inlineMathTags, tag -> tag.set(0)) 258 .setupTags(NamespaceSvg, blockSvgTags, tag -> tag.set(Tag.Block)) 259 .setupTags(NamespaceSvg, inlineSvgTags, tag -> tag.set(0)) 260 .setupTags(NamespaceSvg, dataSvgTags, tag -> tag.set(Tag.Data)) 261 ; 262 } 263 264 private TagSet setupTags(String namespace, String[] tagNames, Consumer<Tag> tagModifier) { 265 for (String tagName : tagNames) { 266 Tag tag = get(tagName, namespace); 267 if (tag == null) { 268 tag = new Tag(tagName, tagName, namespace); // normal name is already normal here 269 tag.options = 0; // clear defaults 270 add(tag); 271 } 272 tagModifier.accept(tag); 273 } 274 return this; 275 } 276}