001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.SharedConstants; 005import org.jspecify.annotations.Nullable; 006 007import java.util.ArrayList; 008import java.util.HashMap; 009import java.util.Map; 010import java.util.Objects; 011import java.util.function.Consumer; 012 013import static org.jsoup.parser.Parser.NamespaceHtml; 014import static org.jsoup.parser.Parser.NamespaceMathml; 015import static org.jsoup.parser.Parser.NamespaceSvg; 016 017/** 018 A TagSet controls the {@link Tag} configuration for a Document's parse, and its serialization. It contains the initial 019 defaults, and after the parse, any additionally discovered tags. 020 021 @see Parser#tagSet(TagSet) 022 @since 1.20.1 023 */ 024public class TagSet { 025 static final TagSet HtmlTagSet = initHtmlDefault(); 026 027 private final Map<String, Map<String, Tag>> tags = new HashMap<>(); // namespace -> tag name -> Tag 028 private final @Nullable TagSet source; // source to pull tags from on demand 029 private @Nullable ArrayList<Consumer<Tag>> customizers; // optional onNewTag tag customizer 030 031 /** 032 Returns a mutable copy of the default HTML tag set. 033 */ 034 public static TagSet Html() { 035 return new TagSet(HtmlTagSet); 036 } 037 038 public TagSet() { 039 source = null; 040 } 041 042 public TagSet(TagSet original) { 043 this.source = original; 044 } 045 046 /** 047 Insert a tag into this TagSet. If the tag already exists, it is replaced. 048 <p>Tags explicitly added like this are considered to be known tags (vs those that are dynamically created via 049 .valueOf() if not already in the set.</p> 050 051 @param tag the tag to add 052 @return this TagSet 053 */ 054 public TagSet add(Tag tag) { 055 tag.set(Tag.Known); 056 doAdd(tag); 057 return this; 058 } 059 060 /** Adds the tag, but does not set defined. Used in .valueOf */ 061 private void doAdd(Tag tag) { 062 if (customizers != null) { 063 for (Consumer<Tag> customizer : customizers) { 064 customizer.accept(tag); 065 } 066 } 067 068 tags.computeIfAbsent(tag.namespace, ns -> new HashMap<>()) 069 .put(tag.tagName, tag); 070 } 071 072 /** 073 Get an existing Tag from this TagSet by tagName and namespace. The tag name is not normalized, to support mixed 074 instances. 075 076 @param tagName the case-sensitive tag name 077 @param namespace the namespace 078 @return the tag, or null if not found 079 */ 080 public @Nullable Tag get(String tagName, String namespace) { 081 Validate.notNull(tagName); 082 Validate.notNull(namespace); 083 084 // get from our tags 085 Map<String, Tag> nsTags = tags.get(namespace); 086 if (nsTags != null) { 087 Tag tag = nsTags.get(tagName); 088 if (tag != null) { 089 return tag; 090 } 091 } 092 093 // not found; clone on demand from source if exists 094 if (source != null) { 095 Tag tag = source.get(tagName, namespace); 096 if (tag != null) { 097 Tag copy = tag.clone(); 098 doAdd(copy); 099 return copy; 100 } 101 } 102 103 return null; 104 } 105 106 /** 107 Tag.valueOf with the normalName via the token.normalName, to save redundant lower-casing passes. 108 Provide a null normalName unless we already have one; will be normalized if required from tagName. 109 */ 110 Tag valueOf(String tagName, @Nullable String normalName, String namespace, boolean preserveTagCase) { 111 Validate.notNull(tagName); 112 Validate.notNull(namespace); 113 tagName = tagName.trim(); 114 Validate.notEmpty(tagName); 115 Tag tag = get(tagName, namespace); 116 if (tag != null) return tag; 117 118 // not found by tagName, try by normal 119 if (normalName == null) normalName = ParseSettings.normalName(tagName); 120 tagName = preserveTagCase ? tagName : normalName; 121 tag = get(normalName, namespace); 122 if (tag != null) { 123 if (preserveTagCase && !tagName.equals(normalName)) { 124 tag = tag.clone(); // copy so that the name update doesn't reset all instances 125 tag.tagName = tagName; 126 doAdd(tag); 127 } 128 return tag; 129 } 130 131 // not defined: return a new one 132 tag = new Tag(tagName, normalName, namespace); 133 doAdd(tag); 134 135 return tag; 136 } 137 138 /** 139 Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag. 140 <p>New tags will be added to this TagSet.</p> 141 142 @param tagName Name of tag, e.g. "p". 143 @param namespace the namespace for the tag. 144 @param settings used to control tag name sensitivity 145 @return The tag, either defined or new generic. 146 */ 147 public Tag valueOf(String tagName, String namespace, ParseSettings settings) { 148 return valueOf(tagName, null, namespace, settings.preserveTagCase()); 149 } 150 151 /** 152 Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag. 153 <p>New tags will be added to this TagSet.</p> 154 155 @param tagName Name of tag, e.g. "p". <b>Case-sensitive</b>. 156 @param namespace the namespace for the tag. 157 @return The tag, either defined or new generic. 158 @see #valueOf(String tagName, String namespace, ParseSettings settings) 159 */ 160 public Tag valueOf(String tagName, String namespace) { 161 return valueOf(tagName, namespace, ParseSettings.preserveCase); 162 } 163 164 /** 165 Register a callback to customize each {@link Tag} as it's added to this TagSet. 166 <p>Customizers are invoked once per Tag, when they are added (explicitly or via the valueOf methods).</p> 167 168 <p>For example, to allow all unknown tags to be self-closing during when parsing as HTML:</p> 169 <pre><code> 170 Parser parser = Parser.htmlParser(); 171 parser.tagSet().onNewTag(tag -> { 172 if (!tag.isKnownTag()) 173 tag.set(Tag.SelfClose); 174 }); 175 176 Document doc = Jsoup.parse(html, parser); 177 </code></pre> 178 179 @param customizer a {@code Consumer<Tag>} that will be called for each newly added or cloned Tag; callers can 180 inspect and modify the Tag's state (e.g. set options) 181 @return this TagSet, to allow method chaining 182 @since 1.21.0 183 */ 184 public TagSet onNewTag(Consumer<Tag> customizer) { 185 Validate.notNull(customizer); 186 if (customizers == null) 187 customizers = new ArrayList<>(); 188 customizers.add(customizer); 189 return this; 190 } 191 192 @Override 193 public boolean equals(Object o) { 194 if (!(o instanceof TagSet)) return false; 195 TagSet tagSet = (TagSet) o; 196 return Objects.equals(tags, tagSet.tags); 197 } 198 199 @Override 200 public int hashCode() { 201 return Objects.hashCode(tags); 202 } 203 204 // Default HTML initialization 205 206 /** 207 Initialize the default HTML tag set. 208 */ 209 static TagSet initHtmlDefault() { 210 String[] blockTags = { 211 "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame", 212 "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", 213 "h6", "br", "button", 214 "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins", 215 "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th", 216 "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main", 217 "center", "template", 218 "dir", "applet", "marquee", "listing", // deprecated but still known / special handling 219 "#root" // the outer Document 220 }; 221 String[] inlineTags = { 222 "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd", 223 "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "wbr", "map", 224 "q", 225 "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "optgroup", 226 "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", 227 "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track", 228 "data", "bdi", "s", "strike", "nobr", 229 "rb", // deprecated but still known / special handling 230 }; 231 String[] inlineContainers = { // can only contain inline; aka phrasing content 232 "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style", 233 "ins", "del", "s", "button" 234 }; 235 String[] voidTags = { 236 "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", 237 "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track" 238 }; 239 String[] preserveWhitespaceTags = { 240 "pre", "plaintext", "title", "textarea", "script" 241 }; 242 String[] rcdataTags = { "title", "textarea" }; 243 String[] dataTags = { "iframe", "noembed", "noframes", "script", "style", "xmp" }; 244 String[] formSubmitTags = SharedConstants.FormSubmitTags; 245 String[] blockMathTags = {"math"}; 246 String[] inlineMathTags = {"mi", "mo", "msup", "mn", "mtext"}; 247 String[] blockSvgTags = {"svg", "femerge", "femergenode"}; // note these are LC versions, but actually preserve case 248 String[] inlineSvgTags = {"text"}; 249 String[] dataSvgTags = {"script"}; 250 251 return new TagSet() 252 .setupTags(NamespaceHtml, blockTags, tag -> tag.set(Tag.Block)) 253 .setupTags(NamespaceHtml, inlineTags, tag -> tag.set(0)) 254 .setupTags(NamespaceHtml, inlineContainers, tag -> tag.set(Tag.InlineContainer)) 255 .setupTags(NamespaceHtml, voidTags, tag -> tag.set(Tag.Void)) 256 .setupTags(NamespaceHtml, preserveWhitespaceTags, tag -> tag.set(Tag.PreserveWhitespace)) 257 .setupTags(NamespaceHtml, rcdataTags, tag -> tag.set(Tag.RcData)) 258 .setupTags(NamespaceHtml, dataTags, tag -> tag.set(Tag.Data)) 259 .setupTags(NamespaceHtml, formSubmitTags, tag -> tag.set(Tag.FormSubmittable)) 260 .setupTags(NamespaceMathml, blockMathTags, tag -> tag.set(Tag.Block)) 261 .setupTags(NamespaceMathml, inlineMathTags, tag -> tag.set(0)) 262 .setupTags(NamespaceSvg, blockSvgTags, tag -> tag.set(Tag.Block)) 263 .setupTags(NamespaceSvg, inlineSvgTags, tag -> tag.set(0)) 264 .setupTags(NamespaceSvg, dataSvgTags, tag -> tag.set(Tag.Data)) 265 ; 266 } 267 268 private TagSet setupTags(String namespace, String[] tagNames, Consumer<Tag> tagModifier) { 269 for (String tagName : tagNames) { 270 Tag tag = get(tagName, namespace); 271 if (tag == null) { 272 tag = new Tag(tagName, tagName, namespace); // normal name is already normal here 273 tag.options = 0; // clear defaults 274 add(tag); 275 } 276 tagModifier.accept(tag); 277 } 278 return this; 279 } 280}