package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.SharedConstants;

import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.function.Consumer;

/**
 * Tag capabilities.
 * <p>
 * A tag is identified by its name and namespace, and carries a set of flags (block / inline, empty, self closing,
 * whitespace preservation, form association) that describe how elements using it behave. Pre-defined tags are held in
 * a static registry that is populated once, when this class is initialised.
 * </p>
 *
 * @author Jonathan Hedley, jonathan@hedley.net
 */
public class Tag implements Cloneable {
    private static final Map<String, Tag> Tags = new HashMap<>(); // map of known tags

    private String tagName;
    private final String normalName; // always the lower case version of this tag, regardless of case preservation mode
    private String namespace;
    private boolean isBlock = true; // block
    private boolean formatAsBlock = true; // should be formatted as a block
    private boolean empty = false; // can hold nothing; e.g. img
    private boolean selfClosing = false; // can self close (<foo />). used for unknown tags that self close, without forcing them as empty.
    private boolean preserveWhitespace = false; // for pre, textarea, script etc
    private boolean formList = false; // a control that appears in forms: input, textarea, output etc
    private boolean formSubmit = false; // a control that can be submitted in a form: input etc

    private Tag(String tagName, String namespace) {
        this.tagName = tagName;
        normalName = Normalizer.lowerCase(tagName);
        this.namespace = namespace;
    }

    /**
     * Get this tag's name.
     *
     * @return the tag's name
     */
    public String getName() {
        return tagName;
    }

    /**
     * Get this tag's normalized (lowercased) name.
     * @return the tag's normal name.
     */
    public String normalName() {
        return normalName;
    }

    /**
     * Get this tag's namespace.
     *
     * @return the namespace this tag was created in or assigned to
     */
    public String namespace() {
        return namespace;
    }

    /**
     * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything.
     * <p>
     * Pre-defined tags (p, div etc) will be ==, but unknown tags are not registered and will only .equals(). When the
     * settings preserve tag case and the supplied name differs from its lower-case form, a copy of the pre-defined
     * tag carrying the supplied name is returned instead of the registered instance.
     * </p>
     *
     * @param tagName Name of tag, e.g. "p". Case handling is governed by {@code settings}.
     * @param namespace the namespace for the tag.
     * @param settings used to control tag name sensitivity
     * @return The tag, either defined or new generic.
     */
    public static Tag valueOf(String tagName, String namespace, ParseSettings settings) {
        Validate.notEmpty(tagName);
        Validate.notNull(namespace);

        // fast path: the name exactly as supplied is a registered tag in the requested namespace
        Tag tag = Tags.get(tagName);
        if (tag != null && tag.namespace.equals(namespace))
            return tag;

        tagName = settings.normalizeTag(tagName); // the name we'll use
        Validate.notEmpty(tagName); // validated again, as this is now the normalized name
        String normalName = Normalizer.lowerCase(tagName); // the lower-case name to get tag settings off
        tag = Tags.get(normalName);
        if (tag != null && tag.namespace.equals(namespace)) {
            if (settings.preserveTagCase() && !tagName.equals(normalName)) {
                tag = tag.clone(); // get a new version vs the static one, so name update doesn't reset all
                tag.tagName = tagName;
            }
            return tag;
        }

        // not defined: create default; go anywhere, do anything! (incl be inside a <p>)
        tag = new Tag(tagName, namespace);
        tag.isBlock = false;

        return tag;
    }

    /**
     * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything.
     * <p>
     * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals().
     * </p>
     *
     * @param tagName Name of tag, e.g. "p". <b>Case sensitive</b>.
     * @return The tag, either defined or new generic.
     * @see #valueOf(String tagName, String namespace, ParseSettings settings)
     */
    public static Tag valueOf(String tagName) {
        return valueOf(tagName, Parser.NamespaceHtml, ParseSettings.preserveCase);
    }

    /**
     * Get a Tag by name, in the HTML namespace. If not previously defined (unknown), returns a new generic tag, that
     * can do anything.
     * <p>
     * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals().
     * </p>
     *
     * @param tagName Name of tag, e.g. "p". Case handling is governed by {@code settings}.
     * @param settings used to control tag name sensitivity
     * @return The tag, either defined or new generic.
     * @see #valueOf(String tagName, String namespace, ParseSettings settings)
     */
    public static Tag valueOf(String tagName, ParseSettings settings) {
        return valueOf(tagName, Parser.NamespaceHtml, settings);
    }

    /**
     * Gets if this is a block tag.
     *
     * @return if block tag
     */
    public boolean isBlock() {
        return isBlock;
    }

    /**
     * Gets if this tag should be formatted as a block (or as inline)
     *
     * @return if should be formatted as block or inline
     */
    public boolean formatAsBlock() {
        return formatAsBlock;
    }

    /**
     * Gets if this tag is an inline tag.
     *
     * @return if this tag is an inline tag.
     */
    public boolean isInline() {
        return !isBlock;
    }

    /**
     * Get if this is an empty tag
     *
     * @return if this is an empty tag
     */
    public boolean isEmpty() {
        return empty;
    }

    /**
     * Get if this tag is self-closing. Empty tags are always reported as self-closing.
     *
     * @return if this tag should be output as self-closing.
     */
    public boolean isSelfClosing() {
        return empty || selfClosing;
    }

    /**
     * Get if this is a pre-defined tag, or was auto created on parsing.
     * <p>
     * The registry is consulted using this tag's name exactly as held by {@link #getName()}.
     * </p>
     *
     * @return if a known tag
     */
    public boolean isKnownTag() {
        // NOTE(review): a case-preserved copy (e.g. "DIV") is looked up under its preserved name rather than its
        // normal name, so it is presumably not reported as known - confirm that this is intended.
        return Tags.containsKey(tagName);
    }

    /**
     * Check if this tagname is a known tag.
     *
     * @param tagName name of tag
     * @return if known HTML tag
     */
    public static boolean isKnownTag(String tagName) {
        return Tags.containsKey(tagName);
    }

    /**
     * Get if this tag should preserve whitespace within child text nodes.
     *
     * @return if preserve whitespace
     */
    public boolean preserveWhitespace() {
        return preserveWhitespace;
    }

    /**
     * Get if this tag represents a control associated with a form. E.g. input, textarea, output
     * @return if associated with a form
     */
    public boolean isFormListed() {
        return formList;
    }

    /**
     * Get if this tag represents an element that should be submitted with a form. E.g. input, option
     * @return if submittable with a form
     */
    public boolean isFormSubmittable() {
        return formSubmit;
    }

    /**
     * Marks this tag as able to self close.
     *
     * @return this tag, for chaining
     */
    Tag setSelfClosing() {
        selfClosing = true;
        return this;
    }

    /**
     * Two tags are equal when they have the same name (case-sensitive), the same namespace, and the same capability
     * flags. The namespace takes part in the comparison because it is part of a tag's identity: {@code valueOf} only
     * returns a registered tag when the requested namespace matches.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof Tag)) return false;

        Tag tag = (Tag) o;

        if (!tagName.equals(tag.tagName)) return false;
        if (!Objects.equals(namespace, tag.namespace)) return false;
        if (empty != tag.empty) return false;
        if (formatAsBlock != tag.formatAsBlock) return false;
        if (isBlock != tag.isBlock) return false;
        if (preserveWhitespace != tag.preserveWhitespace) return false;
        if (selfClosing != tag.selfClosing) return false;
        if (formList != tag.formList) return false;
        return formSubmit == tag.formSubmit;
    }

    /**
     * Hashes the same fields that {@link #equals(Object)} compares, so that equal tags produce equal hash codes.
     */
    @Override
    public int hashCode() {
        return Objects.hash(tagName, namespace, isBlock, formatAsBlock, empty, selfClosing, preserveWhitespace,
            formList, formSubmit);
    }

    @Override
    public String toString() {
        return tagName;
    }

    /**
     * Creates a field-for-field copy of this tag. Used to derive a case-preserved variant of a registered tag without
     * modifying the shared registered instance.
     */
    @Override
    protected Tag clone() {
        try {
            return (Tag) super.clone();
        } catch (CloneNotSupportedException e) {
            throw new RuntimeException(e);
        }
    }

    // internal static initialisers:
    // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other sources
    private static final String[] blockTags = {
        "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame",
        "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6",
        "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
        "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
        "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main",
        "svg", "math", "center",
        "dir", "applet", "marquee", "listing" // deprecated but still known / special handling
    };
    private static final String[] inlineTags = {
        "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd",
        "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "br", "wbr", "map", "q",
        "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "optgroup",
        "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
        "summary", "command", "device", "basefont", "bgsound", "menuitem",
        "data", "bdi", "s", "strike", "nobr",
        "rb", // deprecated but still known / special handling
        "text", // in SVG NS
        "mi", "mo", "msup", "mn", "mtext" // in MathML NS, to ensure inline
    };
    private static final String[] emptyTags = {
        "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",
        "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track"
    };
    // todo - rework this to format contents as inline; and update html emitter in Element. Same output, just neater.
    private static final String[] formatAsInlineTags = {
        "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style",
        "ins", "del", "s", "button"
    };
    private static final String[] preserveWhitespaceTags = {
        "pre", "plaintext", "title", "textarea"
        // script is not here as it is a data node, which always preserve whitespace
    };
    // todo: I think we just need submit tags, and can scrub listed
    private static final String[] formListedTags = {
        "button", "fieldset", "input", "keygen", "object", "output", "select", "textarea"
    };
    private static final String[] formSubmitTags = SharedConstants.FormSubmitTags;

    // namespace -> names of the registered tags that belong to it, rather than to the HTML namespace
    private static final Map<String, String[]> namespaces = new HashMap<>();
    static {
        namespaces.put(Parser.NamespaceMathml, new String[]{"math", "mi", "mo", "msup", "mn", "mtext"});
        namespaces.put(Parser.NamespaceSvg, new String[]{"svg", "text"});
        // We don't need absolute coverage here as other cases will be inferred by the HtmlTreeBuilder
    }

    /**
     * Applies a modifier to the registered tag for each supplied name. A name that is not yet registered is first
     * created in the HTML namespace and added to the registry.
     *
     * @param tagNames names of the tags to modify
     * @param tagModifier the change to apply to each tag
     */
    private static void setupTags(String[] tagNames, Consumer<Tag> tagModifier) {
        for (String tagName : tagNames) {
            Tag tag = Tags.get(tagName);
            if (tag == null) {
                tag = new Tag(tagName, Parser.NamespaceHtml);
                Tags.put(tag.tagName, tag);
            }
            tagModifier.accept(tag);
        }
    }

    static {
        setupTags(blockTags, tag -> {
            tag.isBlock = true;
            tag.formatAsBlock = true;
        });

        setupTags(inlineTags, tag -> {
            tag.isBlock = false;
            tag.formatAsBlock = false;
        });

        // the remaining groups refine the flags of tags registered above (or register any names not yet seen)
        setupTags(emptyTags, tag -> tag.empty = true);
        setupTags(formatAsInlineTags, tag -> tag.formatAsBlock = false);
        setupTags(preserveWhitespaceTags, tag -> tag.preserveWhitespace = true);
        setupTags(formListedTags, tag -> tag.formList = true);
        setupTags(formSubmitTags, tag -> tag.formSubmit = true);
        for (Map.Entry<String, String[]> ns : namespaces.entrySet()) {
            setupTags(ns.getValue(), tag -> tag.namespace = ns.getKey());
        }
    }
}