001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.SharedConstants; 005import org.jspecify.annotations.Nullable; 006 007import java.util.HashMap; 008import java.util.Map; 009import java.util.Objects; 010import java.util.function.Consumer; 011 012import static org.jsoup.parser.Parser.NamespaceHtml; 013import static org.jsoup.parser.Parser.NamespaceMathml; 014import static org.jsoup.parser.Parser.NamespaceSvg; 015 016/** 017 A TagSet controls the {@link Tag} configuration for a Document's parse, and its serialization. It contains the initial 018 defaults, and after the parse, any additionally discovered tags. 019 020 @see Parser#tagSet(TagSet) 021 @since 1.20.1 022 */ 023public class TagSet { 024 static final TagSet HtmlTagSet = initHtmlDefault(); 025 026 final Map<String, Map<String, Tag>> tags = new HashMap<>(); // namespace -> tag name -> Tag 027 final @Nullable TagSet source; // source to pull tags from on demand 028 029 /** 030 Returns a mutable copy of the default HTML tag set. 031 */ 032 public static TagSet Html() { 033 return new TagSet(HtmlTagSet); 034 } 035 036 public TagSet() { 037 source = null; 038 } 039 040 public TagSet(TagSet original) { 041 this.source = original; 042 } 043 044 /** 045 Insert a tag into this TagSet. If the tag already exists, it is replaced. 046 <p>Tags explicitly added like this are considered to be known tags (vs those that are dynamically created via 047 .valueOf() if not already in the set.</p> 048 049 @param tag the tag to add 050 @return this TagSet 051 */ 052 public TagSet add(Tag tag) { 053 tag.set(Tag.Known); 054 doAdd(tag); 055 return this; 056 } 057 058 /** Adds the tag, but does not set defined. Used in .valueOf */ 059 private void doAdd(Tag tag) { 060 tags.computeIfAbsent(tag.namespace, ns -> new HashMap<>()) 061 .put(tag.tagName, tag); 062 } 063 064 /** 065 Get an existing Tag from this TagSet by tagName and namespace. The tag name is not normalized, to support mixed 066 instances. 067 068 @param tagName the case-sensitive tag name 069 @param namespace the namespace 070 @return the tag, or null if not found 071 */ 072 public @Nullable Tag get(String tagName, String namespace) { 073 Validate.notNull(tagName); 074 Validate.notNull(namespace); 075 076 // get from our tags 077 Map<String, Tag> nsTags = tags.get(namespace); 078 if (nsTags != null) { 079 Tag tag = nsTags.get(tagName); 080 if (tag != null) { 081 return tag; 082 } 083 } 084 085 // not found; clone on demand from source if exists 086 if (source != null) { 087 Tag tag = source.get(tagName, namespace); 088 if (tag != null) { 089 Tag copy = tag.clone(); 090 doAdd(copy); 091 return copy; 092 } 093 } 094 095 return null; 096 } 097 098 /** Tag.valueOf with the normalName via the token.normalName, to save redundant lower-casing passes. */ 099 Tag valueOf(String tagName, String normalName, String namespace, boolean preserveTagCase) { 100 Validate.notNull(tagName); 101 Validate.notNull(namespace); 102 tagName = tagName.trim(); 103 Validate.notEmpty(tagName); 104 Tag tag = get(tagName, namespace); 105 if (tag != null) return tag; 106 107 // not found by tagName, try by normal 108 tagName = preserveTagCase ? tagName : normalName; 109 tag = get(normalName, namespace); 110 if (tag != null) { 111 if (preserveTagCase && !tagName.equals(normalName)) { 112 tag = tag.clone(); // copy so that the name update doesn't reset all instances 113 tag.tagName = tagName; 114 doAdd(tag); 115 } 116 return tag; 117 } 118 119 // not defined: return a new one 120 tag = new Tag(tagName, normalName, namespace); 121 doAdd(tag); 122 123 return tag; 124 } 125 126 /** 127 Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag. 128 <p>New tags will be added to this TagSet.</p> 129 130 @param tagName Name of tag, e.g. "p". 131 @param namespace the namespace for the tag. 132 @param settings used to control tag name sensitivity 133 @return The tag, either defined or new generic. 134 */ 135 public Tag valueOf(String tagName, String namespace, ParseSettings settings) { 136 return valueOf(tagName, ParseSettings.normalName(tagName), namespace, settings.preserveTagCase()); 137 } 138 139 /** 140 Get a Tag by name from this TagSet. If not previously defined (unknown), returns a new tag. 141 <p>New tags will be added to this TagSet.</p> 142 143 @param tagName Name of tag, e.g. "p". <b>Case-sensitive</b>. 144 @param namespace the namespace for the tag. 145 @return The tag, either defined or new generic. 146 @see #valueOf(String tagName, String namespace, ParseSettings settings) 147 */ 148 public Tag valueOf(String tagName, String namespace) { 149 return valueOf(tagName, namespace, ParseSettings.preserveCase); 150 } 151 152 @Override 153 public boolean equals(Object o) { 154 if (!(o instanceof TagSet)) return false; 155 TagSet tagSet = (TagSet) o; 156 return Objects.equals(tags, tagSet.tags); 157 } 158 159 @Override 160 public int hashCode() { 161 return Objects.hashCode(tags); 162 } 163 164 // Default HTML initialization 165 166 /** 167 Initialize the default HTML tag set. 168 */ 169 static TagSet initHtmlDefault() { 170 String[] blockTags = { 171 "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame", 172 "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", 173 "h6", "br", "button", 174 "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins", 175 "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th", 176 "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main", 177 "center", "template", 178 "dir", "applet", "marquee", "listing", // deprecated but still known / special handling 179 "#root" // the outer Document 180 }; 181 String[] inlineTags = { 182 "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd", 183 "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "wbr", "map", 184 "q", 185 "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "optgroup", 186 "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", 187 "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track", 188 "data", "bdi", "s", "strike", "nobr", 189 "rb", // deprecated but still known / special handling 190 }; 191 String[] inlineContainers = { // can only contain inline; aka phrasing content 192 "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style", 193 "ins", "del", "s", "button" 194 }; 195 String[] voidTags = { 196 "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", 197 "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track" 198 }; 199 String[] preserveWhitespaceTags = { 200 "pre", "plaintext", "title", "textarea", "script" 201 }; 202 String[] rcdataTags = { "title", "textarea" }; 203 String[] dataTags = { "iframe", "noembed", "noframes", "script", "style", "xmp" }; 204 String[] formSubmitTags = SharedConstants.FormSubmitTags; 205 String[] blockMathTags = {"math"}; 206 String[] inlineMathTags = {"mi", "mo", "msup", "mn", "mtext"}; 207 String[] blockSvgTags = {"svg", "femerge", "femergenode"}; // note these are LC versions, but actually preserve case 208 String[] inlineSvgTags = {"text"}; 209 210 return new TagSet() 211 .setupTags(NamespaceHtml, blockTags, tag -> tag.set(Tag.Block)) 212 .setupTags(NamespaceHtml, inlineTags, tag -> tag.set(0)) 213 .setupTags(NamespaceHtml, inlineContainers, tag -> tag.set(Tag.InlineContainer)) 214 .setupTags(NamespaceHtml, voidTags, tag -> tag.set(Tag.Void)) 215 .setupTags(NamespaceHtml, preserveWhitespaceTags, tag -> tag.set(Tag.PreserveWhitespace)) 216 .setupTags(NamespaceHtml, rcdataTags, tag -> tag.set(Tag.RcData)) 217 .setupTags(NamespaceHtml, dataTags, tag -> tag.set(Tag.Data)) 218 .setupTags(NamespaceHtml, formSubmitTags, tag -> tag.set(Tag.FormSubmittable)) 219 .setupTags(NamespaceMathml, blockMathTags, tag -> tag.set(Tag.Block)) 220 .setupTags(NamespaceMathml, inlineMathTags, tag -> tag.set(0)) 221 .setupTags(NamespaceSvg, blockSvgTags, tag -> tag.set(Tag.Block)) 222 .setupTags(NamespaceSvg, inlineSvgTags, tag -> tag.set(0)) 223 ; 224 } 225 226 private TagSet setupTags(String namespace, String[] tagNames, Consumer<Tag> tagModifier) { 227 for (String tagName : tagNames) { 228 Tag tag = get(tagName, namespace); 229 if (tag == null) { 230 tag = new Tag(tagName, tagName, namespace); // normal name is already normal here 231 tag.options = 0; // clear defaults 232 add(tag); 233 } 234 tagModifier.accept(tag); 235 } 236 return this; 237 } 238}