package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.SharedConstants;

import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.function.Consumer;

/**
 * Tag capabilities.
 * <p>
 * A tag carries a name (in the case it was supplied or normalized to), a normal name, a namespace, and a set of
 * boolean capability flags (block / inline, empty, self closing, whitespace preservation, form association).
 * Pre-defined tags are held in a static registry that is populated once by this class's static initialisers.
 * </p>
 *
 * @author Jonathan Hedley, jonathan@hedley.net
 */
public class Tag implements Cloneable {
    private static final Map<String, Tag> Tags = new HashMap<>(); // map of known tags

    private String tagName;
    private final String normalName; // always the lower case version of this tag, regardless of case preservation mode
    private String namespace;
    private boolean isBlock = true; // block
    private boolean formatAsBlock = true; // should be formatted as a block
    private boolean empty = false; // can hold nothing; e.g. img
    private boolean selfClosing = false; // can self close (<foo />). used for unknown tags that self close, without forcing them as empty.
    private boolean preserveWhitespace = false; // for pre, textarea, script etc
    private boolean formList = false; // a control that appears in forms: input, textarea, output etc
    private boolean formSubmit = false; // a control that can be submitted in a form: input etc

    private Tag(String tagName, String normalName, String namespace) {
        this.tagName = tagName;
        this.normalName = normalName;
        this.namespace = namespace;
    }

    /**
     * Get this tag's name.
     *
     * @return the tag's name
     */
    public String getName() {
        return tagName;
    }

    /**
     * Get this tag's normalized (lowercased) name.
     * @return the tag's normal name.
     */
    public String normalName() {
        return normalName;
    }

    /**
     * Get this tag's namespace.
     *
     * @return the namespace this tag was created in or registered with
     */
    public String namespace() {
        return namespace;
    }

    /**
     * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything.
     * <p>
     * Pre-defined tags (p, div etc) will be ==, but unknown tags are not registered and will only .equals().
     * </p>
     *
     * @param tagName Name of tag, e.g. "p". Case-insensitive.
     * @param namespace the namespace for the tag.
     * @param settings used to control tag name sensitivity
     * @return The tag, either defined or new generic.
     */
    public static Tag valueOf(String tagName, String namespace, ParseSettings settings) {
        return valueOf(tagName, ParseSettings.normalName(tagName), namespace, settings);
    }

    /**
     * Tag.valueOf with the normalName via the token.normalName, to save redundant lower-casing passes.
     * <p>
     * Resolution order: an exact match on the (trimmed) tag name in the requested namespace; then a match on the
     * normal name in the requested namespace; otherwise a new, unregistered, inline tag.
     * </p>
     *
     * @param tagName the tag name as supplied; must not be null, nor empty once trimmed
     * @param normalName the pre-computed normal name of {@code tagName}; used as is (it is not trimmed here)
     * @param namespace the namespace for the tag; must not be null
     * @param settings decides whether the returned tag keeps the case of {@code tagName}
     * @return The tag, either defined or new generic.
     */
    static Tag valueOf(String tagName, String normalName, String namespace, ParseSettings settings) {
        Validate.notNull(tagName);
        tagName = tagName.trim();
        Validate.notEmpty(tagName);
        Validate.notNull(namespace);

        // fast path: the name as given is a registered tag in the requested namespace
        Tag tag = Tags.get(tagName);
        if (tag != null && tag.namespace.equals(namespace))
            return tag;

        // the name the returned tag will carry
        tagName = settings.preserveTagCase() ? tagName : normalName;
        tag = Tags.get(normalName);
        if (tag != null && tag.namespace.equals(namespace)) {
            if (settings.preserveTagCase() && !tagName.equals(normalName)) {
                tag = tag.clone(); // get a new version vs the static one, so name update doesn't reset all
                tag.tagName = tagName;
            }
            return tag;
        }

        // not defined: create default; go anywhere, do anything! (incl be inside a <p>)
        tag = new Tag(tagName, normalName, namespace);
        tag.isBlock = false;

        return tag;
    }


    /**
     * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything.
     * <p>
     * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals().
     * </p>
     * <p>
     * Delegates to {@link #valueOf(String, String, ParseSettings)} with the HTML namespace and
     * {@code ParseSettings.preserveCase}.
     * </p>
     *
     * @param tagName Name of tag, e.g. "p". <b>Case sensitive</b>.
     * @return The tag, either defined or new generic.
     * @see #valueOf(String tagName, String namespace, ParseSettings settings)
     */
    public static Tag valueOf(String tagName) {
        return valueOf(tagName, Parser.NamespaceHtml, ParseSettings.preserveCase);
    }

    /**
     * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything.
     * <p>
     * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals().
     * </p>
     * <p>
     * Delegates to {@link #valueOf(String, String, ParseSettings)} with the HTML namespace.
     * </p>
     *
     * @param tagName Name of tag, e.g. "p". Whether the case of the name is kept is decided by {@code settings}.
     * @param settings used to control tag name sensitivity
     * @return The tag, either defined or new generic.
     * @see #valueOf(String tagName, String namespace, ParseSettings settings)
     */
    public static Tag valueOf(String tagName, ParseSettings settings) {
        return valueOf(tagName, Parser.NamespaceHtml, settings);
    }

    /**
     * Gets if this is a block tag.
     *
     * @return if block tag
     */
    public boolean isBlock() {
        return isBlock;
    }

    /**
     * Gets if this tag should be formatted as a block (or as inline)
     *
     * @return if should be formatted as block or inline
     */
    public boolean formatAsBlock() {
        return formatAsBlock;
    }

    /**
     * Gets if this tag is an inline tag.
     *
     * @return if this tag is an inline tag.
     */
    public boolean isInline() {
        return !isBlock;
    }

    /**
     * Get if this is an empty tag
     *
     * @return if this is an empty tag
     */
    public boolean isEmpty() {
        return empty;
    }

    /**
     * Get if this tag is self-closing. Empty tags are always reported as self-closing.
     *
     * @return if this tag should be output as self-closing.
     */
    public boolean isSelfClosing() {
        return empty || selfClosing;
    }

    /**
     * Get if this is a pre-defined tag, or was auto created on parsing.
     * <p>
     * The check is made on this tag's name exactly as held (see {@link #getName()}); the namespace is not
     * considered.
     * </p>
     *
     * @return if a known tag
     */
    public boolean isKnownTag() {
        return Tags.containsKey(tagName);
    }

    /**
     * Check if this tagname is a known tag.
     *
     * @param tagName name of tag
     * @return if known HTML tag
     */
    public static boolean isKnownTag(String tagName) {
        return Tags.containsKey(tagName);
    }

    /**
     * Get if this tag should preserve whitespace within child text nodes.
     *
     * @return if preserve whitespace
     */
    public boolean preserveWhitespace() {
        return preserveWhitespace;
    }

    /**
     * Get if this tag represents a control associated with a form. E.g. input, textarea, output
     * @return if associated with a form
     */
    public boolean isFormListed() {
        return formList;
    }

    /**
     * Get if this tag represents an element that should be submitted with a form. E.g. input, option
     * @return if submittable with a form
     */
    public boolean isFormSubmittable() {
        return formSubmit;
    }

    /**
     * Marks this tag as self closing, without making it an empty tag.
     *
     * @return this tag, for chaining
     */
    Tag setSelfClosing() {
        selfClosing = true;
        return this;
    }

    /**
     * Two tags are equal when they have the same name (case as held), the same namespace, and the same
     * capability flags. The namespace is part of the comparison so that same-named tags from different
     * namespaces are kept distinct.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof Tag)) return false;

        Tag tag = (Tag) o;

        if (!tagName.equals(tag.tagName)) return false;
        if (!namespace.equals(tag.namespace)) return false;
        if (empty != tag.empty) return false;
        if (formatAsBlock != tag.formatAsBlock) return false;
        if (isBlock != tag.isBlock) return false;
        if (preserveWhitespace != tag.preserveWhitespace) return false;
        if (selfClosing != tag.selfClosing) return false;
        if (formList != tag.formList) return false;
        return formSubmit == tag.formSubmit;
    }

    /** Hashes exactly the fields compared in {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        return Objects.hash(tagName, namespace, isBlock, formatAsBlock, empty, selfClosing, preserveWhitespace,
            formList, formSubmit);
    }

    /**
     * @return this tag's name, as per {@link #getName()}
     */
    @Override
    public String toString() {
        return tagName;
    }

    /**
     * Creates a field-for-field copy of this tag. Used by {@code valueOf} so that a case-preserved name can be
     * set without modifying the registered instance.
     */
    @Override
    protected Tag clone() {
        try {
            return (Tag) super.clone();
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException(e);
        }
    }

    // internal static initialisers:
    // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other sources
    private static final String[] blockTags = {
            "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame",
            "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6",
            "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
            "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
            "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main",
            "svg", "math", "center",
            "dir", "applet", "marquee", "listing" // deprecated but still known / special handling
    };
    private static final String[] inlineTags = {
            "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd",
            "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "br", "wbr", "map", "q",
            "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "optgroup",
            "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
            "summary", "command", "device", "basefont", "bgsound", "menuitem",
            "data", "bdi", "s", "strike", "nobr",
            "rb", // deprecated but still known / special handling
            "text", // in SVG NS
            "mi", "mo", "msup", "mn", "mtext" // in MathML NS, to ensure inline
    };
    private static final String[] emptyTags = {
            "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",
            "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track"
    };
    // todo - rework this to format contents as inline; and update html emitter in Element. Same output, just neater.
    private static final String[] formatAsInlineTags = {
            "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style",
            "ins", "del", "s", "button"
    };
    private static final String[] preserveWhitespaceTags = {
            "pre", "plaintext", "title", "textarea"
            // script is not here as it is a data node, which always preserve whitespace
    };
    // todo: I think we just need submit tags, and can scrub listed
    private static final String[] formListedTags = {
            "button", "fieldset", "input", "keygen", "object", "output", "select", "textarea"
    };
    private static final String[] formSubmitTags = SharedConstants.FormSubmitTags;

    // namespace -> names of the registered tags that belong to it (anything not listed stays in the HTML namespace)
    private static final Map<String, String[]> namespaces = new HashMap<>();
    static {
        namespaces.put(Parser.NamespaceMathml, new String[]{"math", "mi", "mo", "msup", "mn", "mtext"});
        namespaces.put(Parser.NamespaceSvg, new String[]{"svg", "text"});
        // We don't need absolute coverage here as other cases will be inferred by the HtmlTreeBuilder
    }

    /**
     * Applies a modifier to each named tag in the registry, first creating and registering the tag (in the HTML
     * namespace, with its normal name equal to its name) if it is not yet present.
     *
     * @param tagNames names of the tags to modify
     * @param tagModifier the change to apply to each tag
     */
    private static void setupTags(String[] tagNames, Consumer<Tag> tagModifier) {
        for (String tagName : tagNames) {
            Tag tag = Tags.get(tagName);
            if (tag == null) {
                tag = new Tag(tagName, tagName, Parser.NamespaceHtml);
                Tags.put(tag.tagName, tag);
            }
            tagModifier.accept(tag);
        }
    }

    // Populates the registry. Order matters: later passes refine flags set by earlier ones (e.g. a block tag that is
    // also listed in formatAsInlineTags ends up with isBlock = true and formatAsBlock = false).
    static {
        setupTags(blockTags, tag -> {
            tag.isBlock = true;
            tag.formatAsBlock = true;
        });

        setupTags(inlineTags, tag -> {
            tag.isBlock = false;
            tag.formatAsBlock = false;
        });

        setupTags(emptyTags, tag -> tag.empty = true);
        setupTags(formatAsInlineTags, tag -> tag.formatAsBlock = false);
        setupTags(preserveWhitespaceTags, tag -> tag.preserveWhitespace = true);
        setupTags(formListedTags, tag -> tag.formList = true);
        setupTags(formSubmitTags, tag -> tag.formSubmit = true);
        for (Map.Entry<String, String[]> ns : namespaces.entrySet()) {
            setupTags(ns.getValue(), tag -> tag.namespace = ns.getKey());
        }
    }
}