001package org.jsoup.nodes; 002 003import org.jsoup.SerializationException; 004import org.jsoup.helper.DataUtil; 005import org.jsoup.internal.StringUtil; 006import org.jsoup.helper.Validate; 007import org.jsoup.nodes.Document.OutputSettings; 008import org.jsoup.parser.CharacterReader; 009import org.jsoup.parser.Parser; 010 011import java.io.IOException; 012import java.nio.charset.Charset; 013import java.nio.charset.CharsetEncoder; 014import java.util.ArrayList; 015import java.util.Arrays; 016import java.util.Collections; 017import java.util.HashMap; 018 019import static org.jsoup.nodes.Document.OutputSettings.*; 020import static org.jsoup.nodes.Entities.EscapeMode.base; 021import static org.jsoup.nodes.Entities.EscapeMode.extended; 022 023/** 024 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C 025 * HTML named character references</a>. 026 */ 027public class Entities { 028 // constants for escape options: 029 static final int ForText = 0x1; 030 static final int ForAttribute = 0x2; 031 static final int Normalise = 0x4; 032 static final int TrimLeading = 0x8; 033 static final int TrimTrailing = 0x10; 034 035 private static final int empty = -1; 036 private static final String emptyName = ""; 037 static final int codepointRadix = 36; 038 private static final char[] codeDelims = {',', ';'}; 039 private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references 040 041 private static final int BaseCount = 106; 042 private static final ArrayList<String> baseSorted = new ArrayList<>(BaseCount); // names sorted longest first, for prefix matching 043 044 public enum EscapeMode { 045 /** 046 * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. 047 */ 048 xhtml(EntitiesData.xmlPoints, 4), 049 /** 050 * Default HTML output entities. 051 */ 052 base(EntitiesData.basePoints, 106), 053 /** 054 * Complete HTML entities. 055 */ 056 extended(EntitiesData.fullPoints, 2125); 057 058 static { 059 // sort the base names by length, for prefix matching 060 Collections.addAll(baseSorted, base.nameKeys); 061 baseSorted.sort((a, b) -> b.length() - a.length()); 062 } 063 064 // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities. 065 private String[] nameKeys; 066 private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints. 067 068 // table of codepoints to named entities. 069 private int[] codeKeys; // we don't support multicodepoints to single named value currently 070 private String[] nameVals; 071 072 EscapeMode(String file, int size) { 073 load(this, file, size); 074 } 075 076 int codepointForName(final String name) { 077 int index = Arrays.binarySearch(nameKeys, name); 078 return index >= 0 ? codeVals[index] : empty; 079 } 080 081 String nameForCodepoint(final int codepoint) { 082 final int index = Arrays.binarySearch(codeKeys, codepoint); 083 if (index >= 0) { 084 // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower 085 // (and binary search for same item with multi results is undefined 086 return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ? 087 nameVals[index + 1] : nameVals[index]; 088 } 089 return emptyName; 090 } 091 } 092 093 private Entities() { 094 } 095 096 /** 097 * Check if the input is a known named entity 098 * 099 * @param name the possible entity name (e.g. "lt" or "amp") 100 * @return true if a known named entity 101 */ 102 public static boolean isNamedEntity(final String name) { 103 return extended.codepointForName(name) != empty; 104 } 105 106 /** 107 * Check if the input is a known named entity in the base entity set. 108 * 109 * @param name the possible entity name (e.g. "lt" or "amp") 110 * @return true if a known named entity in the base set 111 * @see #isNamedEntity(String) 112 */ 113 public static boolean isBaseNamedEntity(final String name) { 114 return base.codepointForName(name) != empty; 115 } 116 117 /** 118 * Get the character(s) represented by the named entity 119 * 120 * @param name entity (e.g. "lt" or "amp") 121 * @return the string value of the character(s) represented by this entity, or "" if not defined 122 */ 123 public static String getByName(String name) { 124 String val = multipoints.get(name); 125 if (val != null) 126 return val; 127 int codepoint = extended.codepointForName(name); 128 if (codepoint != empty) 129 return new String(new int[]{codepoint}, 0, 1); 130 return emptyName; 131 } 132 133 public static int codepointsForName(final String name, final int[] codepoints) { 134 String val = multipoints.get(name); 135 if (val != null) { 136 codepoints[0] = val.codePointAt(0); 137 codepoints[1] = val.codePointAt(1); 138 return 2; 139 } 140 int codepoint = extended.codepointForName(name); 141 if (codepoint != empty) { 142 codepoints[0] = codepoint; 143 return 1; 144 } 145 return 0; 146 } 147 148 /** 149 Finds the longest base named entity that is a prefix of the input. That is, input "notit" would return "not". 150 151 @return longest entity name that is a prefix of the input, or "" if no entity matches 152 */ 153 public static String findPrefix(String input) { 154 for (String name : baseSorted) { 155 if (input.startsWith(name)) return name; 156 } 157 return emptyName; 158 // if perf critical, could look at using a Trie vs a scan 159 } 160 161 /** 162 HTML escape an input string. That is, {@code <} is returned as {@code <}. The escaped string is suitable for use 163 both in attributes and in text data. 164 @param data the un-escaped string to escape 165 @param out the output settings to use. This configures the character set escaped against (that is, if a 166 character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML 167 settings. 168 @return the escaped string 169 */ 170 public static String escape(String data, OutputSettings out) { 171 return escapeString(data, out.escapeMode(), out.syntax(), out.charset()); 172 } 173 174 /** 175 HTML escape an input string, using the default settings (UTF-8, base entities, HTML syntax). That is, {@code <} is 176 returned as {@code <}. The escaped string is suitable for use both in attributes and in text data. 177 @param data the un-escaped string to escape 178 @return the escaped string 179 @see #escape(String, OutputSettings) 180 */ 181 public static String escape(String data) { 182 return escapeString(data, base, Syntax.html, DataUtil.UTF_8); 183 } 184 185 private static String escapeString(String data, EscapeMode escapeMode, Syntax syntax, Charset charset) { 186 if (data == null) 187 return ""; 188 StringBuilder accum = StringUtil.borrowBuilder(); 189 try { 190 doEscape(data, accum, escapeMode, syntax, charset, ForText | ForAttribute); 191 } catch (IOException e) { 192 throw new SerializationException(e); // doesn't happen 193 } 194 return StringUtil.releaseBuilder(accum); 195 } 196 197 198 static void escape(Appendable accum, String data, OutputSettings out, int options) throws IOException { 199 doEscape(data, accum, out.escapeMode(), out.syntax(), out.charset(), options); 200 } 201 202 private static void doEscape(String data, Appendable accum, EscapeMode mode, Syntax syntax, Charset charset, int options) throws IOException { 203 final CoreCharset coreCharset = CoreCharset.byName(charset.name()); 204 final CharsetEncoder fallback = encoderFor(charset); 205 final int length = data.length(); 206 207 int codePoint; 208 boolean lastWasWhite = false; 209 boolean reachedNonWhite = false; 210 boolean skipped = false; 211 for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) { 212 codePoint = data.codePointAt(offset); 213 214 if ((options & Normalise) != 0) { 215 if (StringUtil.isWhitespace(codePoint)) { 216 if ((options & TrimLeading) != 0 && !reachedNonWhite) continue; 217 if (lastWasWhite) continue; 218 if ((options & TrimTrailing) != 0) { 219 skipped = true; 220 continue; 221 } 222 accum.append(' '); 223 lastWasWhite = true; 224 continue; 225 } else { 226 lastWasWhite = false; 227 reachedNonWhite = true; 228 if (skipped) { 229 accum.append(' '); // wasn't the end, so need to place a normalized space 230 skipped = false; 231 } 232 } 233 } 234 appendEscaped(codePoint, accum, options, mode, syntax, coreCharset, fallback); 235 } 236 } 237 238 private static void appendEscaped(int codePoint, Appendable accum, int options, EscapeMode escapeMode, 239 Syntax syntax, CoreCharset coreCharset, CharsetEncoder fallback) throws IOException { 240 241 // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]): 242 final char c = (char) codePoint; 243 if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 244 // html specific and required escapes: 245 switch (c) { 246 case '&': 247 accum.append("&"); 248 break; 249 case 0xA0: 250 appendNbsp(accum, escapeMode); 251 break; 252 case '<': 253 // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val 254 appendLt(accum, options, escapeMode, syntax); 255 break; 256 case '>': 257 if ((options & ForText) != 0) accum.append(">"); 258 else accum.append(c); 259 break; 260 case '"': 261 if ((options & ForAttribute) != 0) accum.append("""); 262 else accum.append(c); 263 break; 264 case '\'': 265 // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape. 266 appendApos(accum, options, escapeMode); 267 break; 268 // we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets 269 case 0x9: 270 case 0xA: 271 case 0xD: 272 accum.append(c); 273 break; 274 default: 275 if (c < 0x20 || !canEncode(coreCharset, c, fallback)) appendEncoded(accum, escapeMode, codePoint); 276 else accum.append(c); 277 } 278 } else { 279 if (canEncode(coreCharset, c, fallback)) { 280 // reads into charBuf - we go through these steps to avoid GC objects as much as possible (would be a new String and a new char[2] for each character) 281 char[] chars = charBuf.get(); 282 int len = Character.toChars(codePoint, chars, 0); 283 if (accum instanceof StringBuilder) // true unless the user supplied their own 284 ((StringBuilder) accum).append(chars, 0, len); 285 else 286 accum.append(new String(chars, 0, len)); 287 } else { 288 appendEncoded(accum, escapeMode, codePoint); 289 } 290 } 291 } 292 293 private static final ThreadLocal<char[]> charBuf = ThreadLocal.withInitial(() -> new char[2]); 294 295 private static void appendNbsp(Appendable accum, EscapeMode escapeMode) throws IOException { 296 if (escapeMode != EscapeMode.xhtml) accum.append(" "); 297 else accum.append(" "); 298 } 299 300 private static void appendLt(Appendable accum, int options, EscapeMode escapeMode, Syntax syntax) throws IOException { 301 if ((options & ForText) != 0 || escapeMode == EscapeMode.xhtml || syntax == Syntax.xml) accum.append("<"); 302 else accum.append('<'); // no need to escape < when in an HTML attribute 303 } 304 305 private static void appendApos(Appendable accum, int options, EscapeMode escapeMode) throws IOException { 306 if ((options & ForAttribute) != 0 && (options & ForText) != 0) { 307 if (escapeMode == EscapeMode.xhtml) accum.append("'"); 308 else accum.append("'"); 309 } else { 310 accum.append('\''); 311 } 312 } 313 314 private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException { 315 final String name = escapeMode.nameForCodepoint(codePoint); 316 if (!emptyName.equals(name)) // ok for identity check 317 accum.append('&').append(name).append(';'); 318 else 319 accum.append("&#x").append(Integer.toHexString(codePoint)).append(';'); 320 } 321 322 /** 323 * Un-escape an HTML escaped string. That is, {@code <} is returned as {@code <}. 324 * 325 * @param string the HTML string to un-escape 326 * @return the unescaped string 327 */ 328 public static String unescape(String string) { 329 return unescape(string, false); 330 } 331 332 /** 333 * Unescape the input string. 334 * 335 * @param string to un-HTML-escape 336 * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) 337 * @return unescaped string 338 */ 339 static String unescape(String string, boolean strict) { 340 return Parser.unescapeEntities(string, strict); 341 } 342 343 /* 344 * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean. 345 * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF, 346 * performance may be bad. We can add more encoders for common character sets that are impacted by performance 347 * issues on Android if required. 348 * 349 * Benchmarks: * 350 * OLD toHtml() impl v New (fastpath) in millis 351 * Wiki: 1895, 16 352 * CNN: 6378, 55 353 * Alterslash: 3013, 28 354 * Jsoup: 167, 2 355 */ 356 private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) { 357 // todo add more charset tests if impacted by Android's bad perf in canEncode 358 switch (charset) { 359 case ascii: 360 return c < 0x80; 361 case utf: 362 return !(c >= Character.MIN_SURROGATE && c < (Character.MAX_SURROGATE + 1)); // !Character.isSurrogate(c); but not in Android 10 desugar 363 default: 364 return fallback.canEncode(c); 365 } 366 } 367 368 enum CoreCharset { 369 ascii, utf, fallback; 370 371 static CoreCharset byName(final String name) { 372 if (name.equals("US-ASCII")) 373 return ascii; 374 if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al 375 return utf; 376 return fallback; 377 } 378 } 379 380 // cache the last used fallback encoder to save recreating on every use 381 private static final ThreadLocal<CharsetEncoder> LocalEncoder = new ThreadLocal<>(); 382 private static CharsetEncoder encoderFor(Charset charset) { 383 CharsetEncoder encoder = LocalEncoder.get(); 384 if (encoder == null || !encoder.charset().equals(charset)) { 385 encoder = charset.newEncoder(); 386 LocalEncoder.set(encoder); 387 } 388 return encoder; 389 } 390 391 private static void load(EscapeMode e, String pointsData, int size) { 392 e.nameKeys = new String[size]; 393 e.codeVals = new int[size]; 394 e.codeKeys = new int[size]; 395 e.nameVals = new String[size]; 396 397 int i = 0; 398 CharacterReader reader = new CharacterReader(pointsData); 399 try { 400 while (!reader.isEmpty()) { 401 // NotNestedLessLess=10913,824;1887& 402 403 final String name = reader.consumeTo('='); 404 reader.advance(); 405 final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix); 406 final char codeDelim = reader.current(); 407 reader.advance(); 408 final int cp2; 409 if (codeDelim == ',') { 410 cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix); 411 reader.advance(); 412 } else { 413 cp2 = empty; 414 } 415 final String indexS = reader.consumeTo('&'); 416 final int index = Integer.parseInt(indexS, codepointRadix); 417 reader.advance(); 418 419 e.nameKeys[i] = name; 420 e.codeVals[i] = cp1; 421 e.codeKeys[index] = cp1; 422 e.nameVals[index] = name; 423 424 if (cp2 != empty) { 425 multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2)); 426 } 427 i++; 428 } 429 430 Validate.isTrue(i == size, "Unexpected count of entities loaded"); 431 } finally { 432 reader.close(); 433 } 434 } 435}