001package org.jsoup.nodes;
002
003import org.jsoup.SerializationException;
004import org.jsoup.helper.DataUtil;
005import org.jsoup.internal.StringUtil;
006import org.jsoup.helper.Validate;
007import org.jsoup.nodes.Document.OutputSettings;
008import org.jsoup.parser.CharacterReader;
009import org.jsoup.parser.Parser;
010
011import java.io.IOException;
012import java.nio.charset.Charset;
013import java.nio.charset.CharsetEncoder;
014import java.util.ArrayList;
015import java.util.Arrays;
016import java.util.Collections;
017import java.util.HashMap;
018
019import static org.jsoup.nodes.Document.OutputSettings.*;
020import static org.jsoup.nodes.Entities.EscapeMode.base;
021import static org.jsoup.nodes.Entities.EscapeMode.extended;
022
023/**
024 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
025 * HTML named character references</a>.
026 */
027public class Entities {
028    // constants for escape options:
029    static final int ForText = 0x1;
030    static final int ForAttribute = 0x2;
031    static final int Normalise = 0x4;
032    static final int TrimLeading = 0x8;
033    static final int TrimTrailing = 0x10;
034
035    private static final int empty = -1;
036    private static final String emptyName = "";
037    static final int codepointRadix = 36;
038    private static final char[] codeDelims = {',', ';'};
039    private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
040
041    private static final int BaseCount = 106;
042    private static final ArrayList<String> baseSorted = new ArrayList<>(BaseCount); // names sorted longest first, for prefix matching
043
044    public enum EscapeMode {
045        /**
046         * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
047         */
048        xhtml(EntitiesData.xmlPoints, 4),
049        /**
050         * Default HTML output entities.
051         */
052        base(EntitiesData.basePoints, 106),
053        /**
054         * Complete HTML entities.
055         */
056        extended(EntitiesData.fullPoints, 2125);
057
058        static {
059            // sort the base names by length, for prefix matching
060            Collections.addAll(baseSorted, base.nameKeys);
061            baseSorted.sort((a, b) -> b.length() - a.length());
062        }
063
064        // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
065        private String[] nameKeys;
066        private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
067
068        // table of codepoints to named entities.
069        private int[] codeKeys; // we don't support multicodepoints to single named value currently
070        private String[] nameVals;
071
072        EscapeMode(String file, int size) {
073            load(this, file, size);
074        }
075
076        int codepointForName(final String name) {
077            int index = Arrays.binarySearch(nameKeys, name);
078            return index >= 0 ? codeVals[index] : empty;
079        }
080
081        String nameForCodepoint(final int codepoint) {
082            final int index = Arrays.binarySearch(codeKeys, codepoint);
083            if (index >= 0) {
084                // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
085                // (and binary search for same item with multi results is undefined
086                return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
087                    nameVals[index + 1] : nameVals[index];
088            }
089            return emptyName;
090        }
091    }
092
093    private Entities() {
094    }
095
096    /**
097     * Check if the input is a known named entity
098     *
099     * @param name the possible entity name (e.g. "lt" or "amp")
100     * @return true if a known named entity
101     */
102    public static boolean isNamedEntity(final String name) {
103        return extended.codepointForName(name) != empty;
104    }
105
106    /**
107     * Check if the input is a known named entity in the base entity set.
108     *
109     * @param name the possible entity name (e.g. "lt" or "amp")
110     * @return true if a known named entity in the base set
111     * @see #isNamedEntity(String)
112     */
113    public static boolean isBaseNamedEntity(final String name) {
114        return base.codepointForName(name) != empty;
115    }
116
117    /**
118     * Get the character(s) represented by the named entity
119     *
120     * @param name entity (e.g. "lt" or "amp")
121     * @return the string value of the character(s) represented by this entity, or "" if not defined
122     */
123    public static String getByName(String name) {
124        String val = multipoints.get(name);
125        if (val != null)
126            return val;
127        int codepoint = extended.codepointForName(name);
128        if (codepoint != empty)
129            return new String(new int[]{codepoint}, 0, 1);
130        return emptyName;
131    }
132
133    public static int codepointsForName(final String name, final int[] codepoints) {
134        String val = multipoints.get(name);
135        if (val != null) {
136            codepoints[0] = val.codePointAt(0);
137            codepoints[1] = val.codePointAt(1);
138            return 2;
139        }
140        int codepoint = extended.codepointForName(name);
141        if (codepoint != empty) {
142            codepoints[0] = codepoint;
143            return 1;
144        }
145        return 0;
146    }
147
148    /**
149     Finds the longest base named entity that is a prefix of the input. That is, input "notit" would return "not".
150
151     @return longest entity name that is a prefix of the input, or "" if no entity matches
152     */
153    public static String findPrefix(String input) {
154        for (String name : baseSorted) {
155            if (input.startsWith(name)) return name;
156        }
157        return emptyName;
158        // if perf critical, could look at using a Trie vs a scan
159    }
160
161    /**
162     HTML escape an input string. That is, {@code <} is returned as {@code &lt;}. The escaped string is suitable for use
163     both in attributes and in text data.
164     @param data the un-escaped string to escape
165     @param out the output settings to use. This configures the character set escaped against (that is, if a
166     character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML
167     settings.
168     @return the escaped string
169     */
170    public static String escape(String data, OutputSettings out) {
171        return escapeString(data, out.escapeMode(), out.syntax(), out.charset());
172    }
173
174    /**
175     HTML escape an input string, using the default settings (UTF-8, base entities, HTML syntax). That is, {@code <} is
176     returned as {@code &lt;}. The escaped string is suitable for use both in attributes and in text data.
177     @param data the un-escaped string to escape
178     @return the escaped string
179     @see #escape(String, OutputSettings)
180     */
181    public static String escape(String data) {
182        return escapeString(data, base, Syntax.html, DataUtil.UTF_8);
183    }
184
185    private static String escapeString(String data, EscapeMode escapeMode, Syntax syntax, Charset charset) {
186        if (data == null)
187            return "";
188        StringBuilder accum = StringUtil.borrowBuilder();
189        try {
190            doEscape(data, accum, escapeMode, syntax, charset, ForText | ForAttribute);
191        } catch (IOException e) {
192            throw new SerializationException(e); // doesn't happen
193        }
194        return StringUtil.releaseBuilder(accum);
195    }
196
197
198    static void escape(Appendable accum, String data, OutputSettings out, int options) throws IOException {
199        doEscape(data, accum, out.escapeMode(), out.syntax(), out.charset(), options);
200    }
201
202    private static void doEscape(String data, Appendable accum, EscapeMode mode, Syntax syntax, Charset charset, int options) throws IOException {
203        final CoreCharset coreCharset = CoreCharset.byName(charset.name());
204        final CharsetEncoder fallback = encoderFor(charset);
205        final int length = data.length();
206
207        int codePoint;
208        boolean lastWasWhite = false;
209        boolean reachedNonWhite = false;
210        boolean skipped = false;
211        for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
212            codePoint = data.codePointAt(offset);
213
214            if ((options & Normalise) != 0) {
215                if (StringUtil.isWhitespace(codePoint)) {
216                    if ((options & TrimLeading) != 0 && !reachedNonWhite) continue;
217                    if (lastWasWhite) continue;
218                    if ((options & TrimTrailing) != 0) {
219                        skipped = true;
220                        continue;
221                    }
222                    accum.append(' ');
223                    lastWasWhite = true;
224                    continue;
225                } else {
226                    lastWasWhite = false;
227                    reachedNonWhite = true;
228                    if (skipped) {
229                        accum.append(' '); // wasn't the end, so need to place a normalized space
230                        skipped = false;
231                    }
232                }
233            }
234            appendEscaped(codePoint, accum, options, mode, syntax, coreCharset, fallback);
235        }
236    }
237
238    private static void appendEscaped(int codePoint, Appendable accum, int options, EscapeMode escapeMode,
239        Syntax syntax, CoreCharset coreCharset, CharsetEncoder fallback) throws IOException {
240
241        // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
242        final char c = (char) codePoint;
243        if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
244            // html specific and required escapes:
245            switch (c) {
246                case '&':
247                    accum.append("&amp;");
248                    break;
249                case 0xA0:
250                    appendNbsp(accum, escapeMode);
251                    break;
252                case '<':
253                    // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val
254                    appendLt(accum, options, escapeMode, syntax);
255                    break;
256                case '>':
257                    if ((options & ForText) != 0) accum.append("&gt;");
258                    else accum.append(c);
259                    break;
260                case '"':
261                    if ((options & ForAttribute) != 0) accum.append("&quot;");
262                    else accum.append(c);
263                    break;
264                case '\'':
265                    // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape.
266                    appendApos(accum, options, escapeMode);
267                    break;
268                // we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets
269                case 0x9:
270                case 0xA:
271                case 0xD:
272                    accum.append(c);
273                    break;
274                default:
275                    if (c < 0x20 || !canEncode(coreCharset, c, fallback)) appendEncoded(accum, escapeMode, codePoint);
276                    else accum.append(c);
277            }
278        } else {
279            if (canEncode(coreCharset, c, fallback)) {
280                // reads into charBuf - we go through these steps to avoid GC objects as much as possible (would be a new String and a new char[2] for each character)
281                char[] chars = charBuf.get();
282                int len = Character.toChars(codePoint, chars, 0);
283                if (accum instanceof StringBuilder) // true unless the user supplied their own
284                    ((StringBuilder) accum).append(chars, 0, len);
285                else
286                    accum.append(new String(chars, 0, len));
287            } else {
288                appendEncoded(accum, escapeMode, codePoint);
289            }
290        }
291    }
292
293    private static final ThreadLocal<char[]> charBuf = ThreadLocal.withInitial(() -> new char[2]);
294
295    private static void appendNbsp(Appendable accum, EscapeMode escapeMode) throws IOException {
296        if (escapeMode != EscapeMode.xhtml) accum.append("&nbsp;");
297        else accum.append("&#xa0;");
298    }
299
300    private static void appendLt(Appendable accum, int options, EscapeMode escapeMode, Syntax syntax) throws IOException {
301        if ((options & ForText) != 0 || escapeMode == EscapeMode.xhtml || syntax == Syntax.xml) accum.append("&lt;");
302        else accum.append('<'); // no need to escape < when in an HTML attribute
303    }
304
305    private static void appendApos(Appendable accum, int options, EscapeMode escapeMode) throws IOException {
306        if ((options & ForAttribute) != 0 && (options & ForText) != 0) {
307            if (escapeMode == EscapeMode.xhtml) accum.append("&#x27;");
308            else accum.append("&apos;");
309        } else {
310            accum.append('\'');
311        }
312    }
313
314    private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
315        final String name = escapeMode.nameForCodepoint(codePoint);
316        if (!emptyName.equals(name)) // ok for identity check
317            accum.append('&').append(name).append(';');
318        else
319            accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
320    }
321
322    /**
323     * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
324     *
325     * @param string the HTML string to un-escape
326     * @return the unescaped string
327     */
328    public static String unescape(String string) {
329        return unescape(string, false);
330    }
331
332    /**
333     * Unescape the input string.
334     *
335     * @param string to un-HTML-escape
336     * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
337     * @return unescaped string
338     */
339    static String unescape(String string, boolean strict) {
340        return Parser.unescapeEntities(string, strict);
341    }
342
343    /*
344     * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
345     * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
346     * performance may be bad. We can add more encoders for common character sets that are impacted by performance
347     * issues on Android if required.
348     *
349     * Benchmarks:     *
350     * OLD toHtml() impl v New (fastpath) in millis
351     * Wiki: 1895, 16
352     * CNN: 6378, 55
353     * Alterslash: 3013, 28
354     * Jsoup: 167, 2
355     */
356    private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
357        // todo add more charset tests if impacted by Android's bad perf in canEncode
358        switch (charset) {
359            case ascii:
360                return c < 0x80;
361            case utf:
362                return !(c >= Character.MIN_SURROGATE && c < (Character.MAX_SURROGATE + 1)); // !Character.isSurrogate(c); but not in Android 10 desugar
363            default:
364                return fallback.canEncode(c);
365        }
366    }
367
368    enum CoreCharset {
369        ascii, utf, fallback;
370
371        static CoreCharset byName(final String name) {
372            if (name.equals("US-ASCII"))
373                return ascii;
374            if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al
375                return utf;
376            return fallback;
377        }
378    }
379
380    // cache the last used fallback encoder to save recreating on every use
381    private static final ThreadLocal<CharsetEncoder> LocalEncoder = new ThreadLocal<>();
382    private static CharsetEncoder encoderFor(Charset charset) {
383        CharsetEncoder encoder = LocalEncoder.get();
384        if (encoder == null || !encoder.charset().equals(charset)) {
385            encoder = charset.newEncoder();
386            LocalEncoder.set(encoder);
387        }
388        return encoder;
389    }
390
391    private static void load(EscapeMode e, String pointsData, int size) {
392        e.nameKeys = new String[size];
393        e.codeVals = new int[size];
394        e.codeKeys = new int[size];
395        e.nameVals = new String[size];
396
397        int i = 0;
398        CharacterReader reader = new CharacterReader(pointsData);
399        try {
400            while (!reader.isEmpty()) {
401                // NotNestedLessLess=10913,824;1887&
402
403                final String name = reader.consumeTo('=');
404                reader.advance();
405                final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
406                final char codeDelim = reader.current();
407                reader.advance();
408                final int cp2;
409                if (codeDelim == ',') {
410                    cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
411                    reader.advance();
412                } else {
413                    cp2 = empty;
414                }
415                final String indexS = reader.consumeTo('&');
416                final int index = Integer.parseInt(indexS, codepointRadix);
417                reader.advance();
418
419                e.nameKeys[i] = name;
420                e.codeVals[i] = cp1;
421                e.codeKeys[index] = cp1;
422                e.nameVals[index] = name;
423
424                if (cp2 != empty) {
425                    multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
426                }
427                i++;
428            }
429
430            Validate.isTrue(i == size, "Unexpected count of entities loaded");
431        } finally {
432            reader.close();
433        }
434    }
435}