001package org.jsoup.nodes; 002 003import org.jsoup.SerializationException; 004import org.jsoup.internal.StringUtil; 005import org.jsoup.helper.Validate; 006import org.jsoup.nodes.Document.OutputSettings; 007import org.jsoup.parser.CharacterReader; 008import org.jsoup.parser.Parser; 009import org.jspecify.annotations.Nullable; 010 011import java.io.IOException; 012import java.nio.charset.CharsetEncoder; 013import java.util.Arrays; 014import java.util.HashMap; 015 016import static org.jsoup.nodes.Document.OutputSettings.*; 017import static org.jsoup.nodes.Entities.EscapeMode.base; 018import static org.jsoup.nodes.Entities.EscapeMode.extended; 019 020/** 021 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C 022 * HTML named character references</a>. 023 */ 024public class Entities { 025 private static final int empty = -1; 026 private static final String emptyName = ""; 027 static final int codepointRadix = 36; 028 private static final char[] codeDelims = {',', ';'}; 029 private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references 030 031 public enum EscapeMode { 032 /** 033 * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only. 034 */ 035 xhtml(EntitiesData.xmlPoints, 4), 036 /** 037 * Default HTML output entities. 038 */ 039 base(EntitiesData.basePoints, 106), 040 /** 041 * Complete HTML entities. 042 */ 043 extended(EntitiesData.fullPoints, 2125); 044 045 // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities. 046 private String[] nameKeys; 047 private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints. 048 049 // table of codepoints to named entities. 050 private int[] codeKeys; // we don't support multicodepoints to single named value currently 051 private String[] nameVals; 052 053 EscapeMode(String file, int size) { 054 load(this, file, size); 055 } 056 057 int codepointForName(final String name) { 058 int index = Arrays.binarySearch(nameKeys, name); 059 return index >= 0 ? codeVals[index] : empty; 060 } 061 062 String nameForCodepoint(final int codepoint) { 063 final int index = Arrays.binarySearch(codeKeys, codepoint); 064 if (index >= 0) { 065 // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower 066 // (and binary search for same item with multi results is undefined 067 return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ? 068 nameVals[index + 1] : nameVals[index]; 069 } 070 return emptyName; 071 } 072 073 private int size() { 074 return nameKeys.length; 075 } 076 } 077 078 private Entities() { 079 } 080 081 /** 082 * Check if the input is a known named entity 083 * 084 * @param name the possible entity name (e.g. "lt" or "amp") 085 * @return true if a known named entity 086 */ 087 public static boolean isNamedEntity(final String name) { 088 return extended.codepointForName(name) != empty; 089 } 090 091 /** 092 * Check if the input is a known named entity in the base entity set. 093 * 094 * @param name the possible entity name (e.g. "lt" or "amp") 095 * @return true if a known named entity in the base set 096 * @see #isNamedEntity(String) 097 */ 098 public static boolean isBaseNamedEntity(final String name) { 099 return base.codepointForName(name) != empty; 100 } 101 102 /** 103 * Get the character(s) represented by the named entity 104 * 105 * @param name entity (e.g. "lt" or "amp") 106 * @return the string value of the character(s) represented by this entity, or "" if not defined 107 */ 108 public static String getByName(String name) { 109 String val = multipoints.get(name); 110 if (val != null) 111 return val; 112 int codepoint = extended.codepointForName(name); 113 if (codepoint != empty) 114 return new String(new int[]{codepoint}, 0, 1); 115 return emptyName; 116 } 117 118 public static int codepointsForName(final String name, final int[] codepoints) { 119 String val = multipoints.get(name); 120 if (val != null) { 121 codepoints[0] = val.codePointAt(0); 122 codepoints[1] = val.codePointAt(1); 123 return 2; 124 } 125 int codepoint = extended.codepointForName(name); 126 if (codepoint != empty) { 127 codepoints[0] = codepoint; 128 return 1; 129 } 130 return 0; 131 } 132 133 /** 134 HTML escape an input string. That is, {@code <} is returned as {@code <}. The escaped string is suitable for use 135 both in attributes and in text data. 136 @param string the un-escaped string to escape 137 @param out the output settings to use. This configures the character set escaped against (that is, if a 138 character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML 139 settings. 140 @return the escaped string 141 */ 142 public static String escape(String string, OutputSettings out) { 143 if (string == null) 144 return ""; 145 StringBuilder accum = StringUtil.borrowBuilder(); 146 try { 147 escape(accum, string, out, true, true, false, false, false); // for text and for attribute; preserve whitespaces 148 } catch (IOException e) { 149 throw new SerializationException(e); // doesn't happen 150 } 151 return StringUtil.releaseBuilder(accum); 152 } 153 154 /** 155 * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as 156 * {@code <}. The escaped string is suitable for use both in attributes and in text data. 157 * 158 * @param string the un-escaped string to escape 159 * @return the escaped string 160 * @see #escape(String, OutputSettings) 161 */ 162 public static String escape(String string) { 163 if (DefaultOutput == null) 164 DefaultOutput = new OutputSettings(); 165 return escape(string, DefaultOutput); 166 } 167 private static @Nullable OutputSettings DefaultOutput; // lazy-init, to break circular dependency with OutputSettings 168 169 // this method does a lot, but other breakups cause rescanning and stringbuilder generations 170 static void escape(Appendable accum, String string, OutputSettings out, 171 boolean forText, boolean forAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException { 172 173 boolean lastWasWhite = false; 174 boolean reachedNonWhite = false; 175 final EscapeMode escapeMode = out.escapeMode(); 176 final CharsetEncoder encoder = out.encoder(); 177 final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder() 178 final int length = string.length(); 179 180 int codePoint; 181 boolean skipped = false; 182 for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) { 183 codePoint = string.codePointAt(offset); 184 185 if (normaliseWhite) { 186 if (StringUtil.isWhitespace(codePoint)) { 187 if (stripLeadingWhite && !reachedNonWhite) continue; 188 if (lastWasWhite) continue; 189 if (trimTrailing) { 190 skipped = true; 191 continue; 192 } 193 accum.append(' '); 194 lastWasWhite = true; 195 continue; 196 } else { 197 lastWasWhite = false; 198 reachedNonWhite = true; 199 if (skipped) { 200 accum.append(' '); // wasn't the end, so need to place a normalized space 201 skipped = false; 202 } 203 } 204 } 205 // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]): 206 if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 207 final char c = (char) codePoint; 208 // html specific and required escapes: 209 switch (c) { 210 case '&': 211 accum.append("&"); 212 break; 213 case 0xA0: 214 if (escapeMode != EscapeMode.xhtml) 215 accum.append(" "); 216 else 217 accum.append(" "); 218 break; 219 case '<': 220 // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val 221 if (forText || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml) 222 accum.append("<"); 223 else 224 accum.append(c); 225 break; 226 case '>': 227 if (forText) 228 accum.append(">"); 229 else 230 accum.append(c); 231 break; 232 case '"': 233 if (forAttribute) 234 accum.append("""); 235 else 236 accum.append(c); 237 break; 238 case '\'': 239 if (forAttribute && forText) { // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape. 240 if (escapeMode == EscapeMode.xhtml) 241 accum.append("'"); 242 else 243 accum.append("'"); 244 } 245 else 246 accum.append(c); 247 break; 248 // we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets 249 case 0x9: 250 case 0xA: 251 case 0xD: 252 accum.append(c); 253 break; 254 default: 255 if (c < 0x20 || !canEncode(coreCharset, c, encoder)) 256 appendEncoded(accum, escapeMode, codePoint); 257 else 258 accum.append(c); 259 } 260 } else { 261 final String c = new String(Character.toChars(codePoint)); 262 if (encoder.canEncode(c)) // uses fallback encoder for simplicity 263 accum.append(c); 264 else 265 appendEncoded(accum, escapeMode, codePoint); 266 } 267 } 268 } 269 270 private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException { 271 final String name = escapeMode.nameForCodepoint(codePoint); 272 if (!emptyName.equals(name)) // ok for identity check 273 accum.append('&').append(name).append(';'); 274 else 275 accum.append("&#x").append(Integer.toHexString(codePoint)).append(';'); 276 } 277 278 /** 279 * Un-escape an HTML escaped string. That is, {@code <} is returned as {@code <}. 280 * 281 * @param string the HTML string to un-escape 282 * @return the unescaped string 283 */ 284 public static String unescape(String string) { 285 return unescape(string, false); 286 } 287 288 /** 289 * Unescape the input string. 290 * 291 * @param string to un-HTML-escape 292 * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional) 293 * @return unescaped string 294 */ 295 static String unescape(String string, boolean strict) { 296 return Parser.unescapeEntities(string, strict); 297 } 298 299 /* 300 * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean. 301 * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF, 302 * performance may be bad. We can add more encoders for common character sets that are impacted by performance 303 * issues on Android if required. 304 * 305 * Benchmarks: * 306 * OLD toHtml() impl v New (fastpath) in millis 307 * Wiki: 1895, 16 308 * CNN: 6378, 55 309 * Alterslash: 3013, 28 310 * Jsoup: 167, 2 311 */ 312 private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) { 313 // todo add more charset tests if impacted by Android's bad perf in canEncode 314 switch (charset) { 315 case ascii: 316 return c < 0x80; 317 case utf: 318 return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above 319 default: 320 return fallback.canEncode(c); 321 } 322 } 323 324 enum CoreCharset { 325 ascii, utf, fallback; 326 327 static CoreCharset byName(final String name) { 328 if (name.equals("US-ASCII")) 329 return ascii; 330 if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al 331 return utf; 332 return fallback; 333 } 334 } 335 336 private static void load(EscapeMode e, String pointsData, int size) { 337 e.nameKeys = new String[size]; 338 e.codeVals = new int[size]; 339 e.codeKeys = new int[size]; 340 e.nameVals = new String[size]; 341 342 int i = 0; 343 CharacterReader reader = new CharacterReader(pointsData); 344 try { 345 while (!reader.isEmpty()) { 346 // NotNestedLessLess=10913,824;1887& 347 348 final String name = reader.consumeTo('='); 349 reader.advance(); 350 final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix); 351 final char codeDelim = reader.current(); 352 reader.advance(); 353 final int cp2; 354 if (codeDelim == ',') { 355 cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix); 356 reader.advance(); 357 } else { 358 cp2 = empty; 359 } 360 final String indexS = reader.consumeTo('&'); 361 final int index = Integer.parseInt(indexS, codepointRadix); 362 reader.advance(); 363 364 e.nameKeys[i] = name; 365 e.codeVals[i] = cp1; 366 e.codeKeys[index] = cp1; 367 e.nameVals[index] = name; 368 369 if (cp2 != empty) { 370 multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2)); 371 } 372 i++; 373 } 374 375 Validate.isTrue(i == size, "Unexpected count of entities loaded"); 376 } finally { 377 reader.close(); 378 } 379 } 380}