001package org.jsoup.nodes; 002 003import org.jsoup.SerializationException; 004import org.jsoup.helper.DataUtil; 005import org.jsoup.internal.StringUtil; 006import org.jsoup.helper.Validate; 007import org.jsoup.nodes.Document.OutputSettings; 008import org.jsoup.parser.CharacterReader; 009import org.jsoup.parser.Parser; 010 011import java.io.IOException; 012import java.nio.charset.Charset; 013import java.nio.charset.CharsetEncoder; 014import java.util.ArrayList; 015import java.util.Arrays; 016import java.util.Collections; 017import java.util.HashMap; 018 019import static org.jsoup.nodes.Document.OutputSettings.*; 020import static org.jsoup.nodes.Entities.EscapeMode.base; 021import static org.jsoup.nodes.Entities.EscapeMode.extended; 022 023/** 024 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C 025 * HTML named character references</a>. 026 */ 027public class Entities { 028 // constants for escape options: 029 static final int ForText = 0x1; 030 static final int ForAttribute = 0x2; 031 static final int Normalise = 0x4; 032 static final int TrimLeading = 0x8; 033 static final int TrimTrailing = 0x10; 034 035 private static final int empty = -1; 036 private static final String emptyName = ""; 037 static final int codepointRadix = 36; 038 private static final char[] codeDelims = {',', ';'}; 039 private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references 040 041 private static final int BaseCount = 106; 042 private static final ArrayList<String> baseSorted = new ArrayList<>(BaseCount); // names sorted longest first, for prefix matching 043 044 public enum EscapeMode { 045 /** 046 * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. 047 */ 048 xhtml(EntitiesData.xmlPoints, 4), 049 /** 050 * Default HTML output entities. 051 */ 052 base(EntitiesData.basePoints, 106), 053 /** 054 * Complete HTML entities. 055 */ 056 extended(EntitiesData.fullPoints, 2125); 057 058 static { 059 // sort the base names by length, for prefix matching 060 Collections.addAll(baseSorted, base.nameKeys); 061 baseSorted.sort((a, b) -> b.length() - a.length()); 062 } 063 064 // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities. 065 private String[] nameKeys; 066 private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints. 067 068 // table of codepoints to named entities. 069 private int[] codeKeys; // we don't support multicodepoints to single named value currently 070 private String[] nameVals; 071 072 EscapeMode(String file, int size) { 073 load(this, file, size); 074 } 075 076 int codepointForName(final String name) { 077 int index = Arrays.binarySearch(nameKeys, name); 078 return index >= 0 ? codeVals[index] : empty; 079 } 080 081 String nameForCodepoint(final int codepoint) { 082 final int index = Arrays.binarySearch(codeKeys, codepoint); 083 if (index >= 0) { 084 // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower 085 // (and binary search for same item with multi results is undefined 086 return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ? 087 nameVals[index + 1] : nameVals[index]; 088 } 089 return emptyName; 090 } 091 } 092 093 private Entities() { 094 } 095 096 /** 097 * Check if the input is a known named entity 098 * 099 * @param name the possible entity name (e.g. "lt" or "amp") 100 * @return true if a known named entity 101 */ 102 public static boolean isNamedEntity(final String name) { 103 return extended.codepointForName(name) != empty; 104 } 105 106 /** 107 * Check if the input is a known named entity in the base entity set. 108 * 109 * @param name the possible entity name (e.g. "lt" or "amp") 110 * @return true if a known named entity in the base set 111 * @see #isNamedEntity(String) 112 */ 113 public static boolean isBaseNamedEntity(final String name) { 114 return base.codepointForName(name) != empty; 115 } 116 117 /** 118 * Get the character(s) represented by the named entity 119 * 120 * @param name entity (e.g. "lt" or "amp") 121 * @return the string value of the character(s) represented by this entity, or "" if not defined 122 */ 123 public static String getByName(String name) { 124 String val = multipoints.get(name); 125 if (val != null) 126 return val; 127 int codepoint = extended.codepointForName(name); 128 if (codepoint != empty) 129 return new String(new int[]{codepoint}, 0, 1); 130 return emptyName; 131 } 132 133 public static int codepointsForName(final String name, final int[] codepoints) { 134 String val = multipoints.get(name); 135 if (val != null) { 136 codepoints[0] = val.codePointAt(0); 137 codepoints[1] = val.codePointAt(1); 138 return 2; 139 } 140 int codepoint = extended.codepointForName(name); 141 if (codepoint != empty) { 142 codepoints[0] = codepoint; 143 return 1; 144 } 145 return 0; 146 } 147 148 /** 149 Finds the longest base named entity that is a prefix of the input. That is, input "notit" would return "not". 150 151 @return longest entity name that is a prefix of the input, or "" if no entity matches 152 */ 153 public static String findPrefix(String input) { 154 for (String name : baseSorted) { 155 if (input.startsWith(name)) return name; 156 } 157 return emptyName; 158 // if perf critical, could look at using a Trie vs a scan 159 } 160 161 /** 162 HTML escape an input string. That is, {@code <} is returned as {@code <}. The escaped string is suitable for use 163 both in attributes and in text data. 164 @param data the un-escaped string to escape 165 @param out the output settings to use. This configures the character set escaped against (that is, if a 166 character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML 167 settings. 168 @return the escaped string 169 */ 170 public static String escape(String data, OutputSettings out) { 171 return escapeString(data, out.escapeMode(), out.syntax(), out.charset()); 172 } 173 174 /** 175 HTML escape an input string, using the default settings (UTF-8, base entities, HTML syntax). That is, {@code <} is 176 returned as {@code <}. The escaped string is suitable for use both in attributes and in text data. 177 @param data the un-escaped string to escape 178 @return the escaped string 179 @see #escape(String, OutputSettings) 180 */ 181 public static String escape(String data) { 182 return escapeString(data, base, Syntax.html, DataUtil.UTF_8); 183 } 184 185 private static String escapeString(String data, EscapeMode escapeMode, Syntax syntax, Charset charset) { 186 if (data == null) 187 return ""; 188 StringBuilder accum = StringUtil.borrowBuilder(); 189 try { 190 doEscape(data, accum, escapeMode, syntax, charset, ForText | ForAttribute); 191 } catch (IOException e) { 192 throw new SerializationException(e); // doesn't happen 193 } 194 return StringUtil.releaseBuilder(accum); 195 } 196 197 198 static void escape(Appendable accum, String data, OutputSettings out, int options) throws IOException { 199 doEscape(data, accum, out.escapeMode(), out.syntax(), out.charset(), options); 200 } 201 202 private static void doEscape(String data, Appendable accum, EscapeMode mode, Syntax syntax, Charset charset, int options) throws IOException { 203 final CoreCharset coreCharset = CoreCharset.byName(charset.name()); 204 final CharsetEncoder fallback = encoderFor(charset); 205 final int length = data.length(); 206 207 int codePoint; 208 boolean lastWasWhite = false; 209 boolean reachedNonWhite = false; 210 boolean skipped = false; 211 for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) { 212 codePoint = data.codePointAt(offset); 213 214 if ((options & Normalise) != 0) { 215 if (StringUtil.isWhitespace(codePoint)) { 216 if ((options & TrimLeading) != 0 && !reachedNonWhite) continue; 217 if (lastWasWhite) continue; 218 if ((options & TrimTrailing) != 0) { 219 skipped = true; 220 continue; 221 } 222 accum.append(' '); 223 lastWasWhite = true; 224 continue; 225 } else { 226 lastWasWhite = false; 227 reachedNonWhite = true; 228 if (skipped) { 229 accum.append(' '); // wasn't the end, so need to place a normalized space 230 skipped = false; 231 } 232 } 233 } 234 appendEscaped(codePoint, accum, options, mode, syntax, coreCharset, fallback); 235 } 236 } 237 238 private static void appendEscaped(int codePoint, Appendable accum, int options, EscapeMode escapeMode, 239 Syntax syntax, CoreCharset coreCharset, CharsetEncoder fallback) throws IOException { 240 // specific character range for xml 1.0; drop (not encode) if so 241 if (EscapeMode.xhtml == escapeMode && !isValidXmlChar(codePoint)) { 242 return; 243 } 244 245 // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]): 246 final char c = (char) codePoint; 247 if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 248 // html specific and required escapes: 249 switch (c) { 250 case '&': 251 accum.append("&"); 252 break; 253 case 0xA0: 254 appendNbsp(accum, escapeMode); 255 break; 256 case '<': 257 // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val 258 appendLt(accum, options, escapeMode, syntax); 259 break; 260 case '>': 261 if ((options & ForText) != 0) accum.append(">"); 262 else accum.append(c); 263 break; 264 case '"': 265 if ((options & ForAttribute) != 0) accum.append("""); 266 else accum.append(c); 267 break; 268 case '\'': 269 // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape. 270 appendApos(accum, options, escapeMode); 271 break; 272 // we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets 273 case 0x9: 274 case 0xA: 275 case 0xD: 276 accum.append(c); 277 break; 278 default: 279 if (c < 0x20 || !canEncode(coreCharset, c, fallback)) appendEncoded(accum, escapeMode, codePoint); 280 else accum.append(c); 281 } 282 } else { 283 if (canEncode(coreCharset, c, fallback)) { 284 // reads into charBuf - we go through these steps to avoid GC objects as much as possible (would be a new String and a new char[2] for each character) 285 char[] chars = charBuf.get(); 286 int len = Character.toChars(codePoint, chars, 0); 287 if (accum instanceof StringBuilder) // true unless the user supplied their own 288 ((StringBuilder) accum).append(chars, 0, len); 289 else 290 accum.append(new String(chars, 0, len)); 291 } else { 292 appendEncoded(accum, escapeMode, codePoint); 293 } 294 } 295 } 296 297 private static final ThreadLocal<char[]> charBuf = ThreadLocal.withInitial(() -> new char[2]); 298 299 private static void appendNbsp(Appendable accum, EscapeMode escapeMode) throws IOException { 300 if (escapeMode != EscapeMode.xhtml) accum.append(" "); 301 else accum.append(" "); 302 } 303 304 private static void appendLt(Appendable accum, int options, EscapeMode escapeMode, Syntax syntax) throws IOException { 305 if ((options & ForText) != 0 || escapeMode == EscapeMode.xhtml || syntax == Syntax.xml) accum.append("<"); 306 else accum.append('<'); // no need to escape < when in an HTML attribute 307 } 308 309 private static void appendApos(Appendable accum, int options, EscapeMode escapeMode) throws IOException { 310 if ((options & ForAttribute) != 0 && (options & ForText) != 0) { 311 if (escapeMode == EscapeMode.xhtml) accum.append("'"); 312 else accum.append("'"); 313 } else { 314 accum.append('\''); 315 } 316 } 317 318 private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException { 319 final String name = escapeMode.nameForCodepoint(codePoint); 320 if (!emptyName.equals(name)) // ok for identity check 321 accum.append('&').append(name).append(';'); 322 else 323 accum.append("&#x").append(Integer.toHexString(codePoint)).append(';'); 324 } 325 326 /** 327 * Un-escape an HTML escaped string. That is, {@code <} is returned as {@code <}. 328 * 329 * @param string the HTML string to un-escape 330 * @return the unescaped string 331 */ 332 public static String unescape(String string) { 333 return unescape(string, false); 334 } 335 336 /** 337 * Unescape the input string. 338 * 339 * @param string to un-HTML-escape 340 * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) 341 * @return unescaped string 342 */ 343 static String unescape(String string, boolean strict) { 344 return Parser.unescapeEntities(string, strict); 345 } 346 347 /* 348 * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean. 349 * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF, 350 * performance may be bad. We can add more encoders for common character sets that are impacted by performance 351 * issues on Android if required. 352 * 353 * Benchmarks: * 354 * OLD toHtml() impl v New (fastpath) in millis 355 * Wiki: 1895, 16 356 * CNN: 6378, 55 357 * Alterslash: 3013, 28 358 * Jsoup: 167, 2 359 */ 360 private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) { 361 // todo add more charset tests if impacted by Android's bad perf in canEncode 362 switch (charset) { 363 case ascii: 364 return c < 0x80; 365 case utf: 366 return !(c >= Character.MIN_SURROGATE && c < (Character.MAX_SURROGATE + 1)); // !Character.isSurrogate(c); but not in Android 10 desugar 367 default: 368 return fallback.canEncode(c); 369 } 370 } 371 372 private static boolean isValidXmlChar(int codePoint) { 373 // https://www.w3.org/TR/2006/REC-xml-20060816/Overview.html#charsets 374 // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. 375 return (codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD || (codePoint >= 0x20 && codePoint <= 0xD7FF) 376 || (codePoint >= 0xE000 && codePoint <= 0xFFFD) || (codePoint >= 0x10000 && codePoint <= 0x10FFFF)); 377 } 378 379 enum CoreCharset { 380 ascii, utf, fallback; 381 382 static CoreCharset byName(final String name) { 383 if (name.equals("US-ASCII")) 384 return ascii; 385 if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al 386 return utf; 387 return fallback; 388 } 389 } 390 391 // cache the last used fallback encoder to save recreating on every use 392 private static final ThreadLocal<CharsetEncoder> LocalEncoder = new ThreadLocal<>(); 393 private static CharsetEncoder encoderFor(Charset charset) { 394 CharsetEncoder encoder = LocalEncoder.get(); 395 if (encoder == null || !encoder.charset().equals(charset)) { 396 encoder = charset.newEncoder(); 397 LocalEncoder.set(encoder); 398 } 399 return encoder; 400 } 401 402 private static void load(EscapeMode e, String pointsData, int size) { 403 e.nameKeys = new String[size]; 404 e.codeVals = new int[size]; 405 e.codeKeys = new int[size]; 406 e.nameVals = new String[size]; 407 408 int i = 0; 409 CharacterReader reader = new CharacterReader(pointsData); 410 try { 411 while (!reader.isEmpty()) { 412 // NotNestedLessLess=10913,824;1887& 413 414 final String name = reader.consumeTo('='); 415 reader.advance(); 416 final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix); 417 final char codeDelim = reader.current(); 418 reader.advance(); 419 final int cp2; 420 if (codeDelim == ',') { 421 cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix); 422 reader.advance(); 423 } else { 424 cp2 = empty; 425 } 426 final String indexS = reader.consumeTo('&'); 427 final int index = Integer.parseInt(indexS, codepointRadix); 428 reader.advance(); 429 430 e.nameKeys[i] = name; 431 e.codeVals[i] = cp1; 432 e.codeKeys[index] = cp1; 433 e.nameVals[index] = name; 434 435 if (cp2 != empty) { 436 multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2)); 437 } 438 i++; 439 } 440 441 Validate.isTrue(i == size, "Unexpected count of entities loaded"); 442 } finally { 443 reader.close(); 444 } 445 } 446}