001package org.jsoup.nodes;
002
003import org.jsoup.SerializationException;
004import org.jsoup.helper.DataUtil;
005import org.jsoup.internal.StringUtil;
006import org.jsoup.helper.Validate;
007import org.jsoup.nodes.Document.OutputSettings;
008import org.jsoup.parser.CharacterReader;
009import org.jsoup.parser.Parser;
010
011import java.io.IOException;
012import java.nio.charset.Charset;
013import java.nio.charset.CharsetEncoder;
014import java.util.ArrayList;
015import java.util.Arrays;
016import java.util.Collections;
017import java.util.HashMap;
018
019import static org.jsoup.nodes.Document.OutputSettings.*;
020import static org.jsoup.nodes.Entities.EscapeMode.base;
021import static org.jsoup.nodes.Entities.EscapeMode.extended;
022
023/**
024 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
025 * HTML named character references</a>.
026 */
027public class Entities {
028    // constants for escape options:
029    static final int ForText = 0x1;
030    static final int ForAttribute = 0x2;
031    static final int Normalise = 0x4;
032    static final int TrimLeading = 0x8;
033    static final int TrimTrailing = 0x10;
034
035    private static final int empty = -1;
036    private static final String emptyName = "";
037    static final int codepointRadix = 36;
038    private static final char[] codeDelims = {',', ';'};
039    private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
040
041    private static final int BaseCount = 106;
042    private static final ArrayList<String> baseSorted = new ArrayList<>(BaseCount); // names sorted longest first, for prefix matching
043
044    public enum EscapeMode {
045        /**
046         * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
047         */
048        xhtml(EntitiesData.xmlPoints, 4),
049        /**
050         * Default HTML output entities.
051         */
052        base(EntitiesData.basePoints, 106),
053        /**
054         * Complete HTML entities.
055         */
056        extended(EntitiesData.fullPoints, 2125);
057
058        static {
059            // sort the base names by length, for prefix matching
060            Collections.addAll(baseSorted, base.nameKeys);
061            baseSorted.sort((a, b) -> b.length() - a.length());
062        }
063
064        // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
065        private String[] nameKeys;
066        private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
067
068        // table of codepoints to named entities.
069        private int[] codeKeys; // we don't support multicodepoints to single named value currently
070        private String[] nameVals;
071
072        EscapeMode(String file, int size) {
073            load(this, file, size);
074        }
075
076        int codepointForName(final String name) {
077            int index = Arrays.binarySearch(nameKeys, name);
078            return index >= 0 ? codeVals[index] : empty;
079        }
080
081        String nameForCodepoint(final int codepoint) {
082            final int index = Arrays.binarySearch(codeKeys, codepoint);
083            if (index >= 0) {
084                // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
085                // (and binary search for same item with multi results is undefined
086                return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
087                    nameVals[index + 1] : nameVals[index];
088            }
089            return emptyName;
090        }
091    }
092
093    private Entities() {
094    }
095
096    /**
097     * Check if the input is a known named entity
098     *
099     * @param name the possible entity name (e.g. "lt" or "amp")
100     * @return true if a known named entity
101     */
102    public static boolean isNamedEntity(final String name) {
103        return extended.codepointForName(name) != empty;
104    }
105
106    /**
107     * Check if the input is a known named entity in the base entity set.
108     *
109     * @param name the possible entity name (e.g. "lt" or "amp")
110     * @return true if a known named entity in the base set
111     * @see #isNamedEntity(String)
112     */
113    public static boolean isBaseNamedEntity(final String name) {
114        return base.codepointForName(name) != empty;
115    }
116
117    /**
118     * Get the character(s) represented by the named entity
119     *
120     * @param name entity (e.g. "lt" or "amp")
121     * @return the string value of the character(s) represented by this entity, or "" if not defined
122     */
123    public static String getByName(String name) {
124        String val = multipoints.get(name);
125        if (val != null)
126            return val;
127        int codepoint = extended.codepointForName(name);
128        if (codepoint != empty)
129            return new String(new int[]{codepoint}, 0, 1);
130        return emptyName;
131    }
132
133    public static int codepointsForName(final String name, final int[] codepoints) {
134        String val = multipoints.get(name);
135        if (val != null) {
136            codepoints[0] = val.codePointAt(0);
137            codepoints[1] = val.codePointAt(1);
138            return 2;
139        }
140        int codepoint = extended.codepointForName(name);
141        if (codepoint != empty) {
142            codepoints[0] = codepoint;
143            return 1;
144        }
145        return 0;
146    }
147
148    /**
149     Finds the longest base named entity that is a prefix of the input. That is, input "notit" would return "not".
150
151     @return longest entity name that is a prefix of the input, or "" if no entity matches
152     */
153    public static String findPrefix(String input) {
154        for (String name : baseSorted) {
155            if (input.startsWith(name)) return name;
156        }
157        return emptyName;
158        // if perf critical, could look at using a Trie vs a scan
159    }
160
161    /**
162     HTML escape an input string. That is, {@code <} is returned as {@code &lt;}. The escaped string is suitable for use
163     both in attributes and in text data.
164     @param data the un-escaped string to escape
165     @param out the output settings to use. This configures the character set escaped against (that is, if a
166     character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML
167     settings.
168     @return the escaped string
169     */
170    public static String escape(String data, OutputSettings out) {
171        return escapeString(data, out.escapeMode(), out.syntax(), out.charset());
172    }
173
174    /**
175     HTML escape an input string, using the default settings (UTF-8, base entities, HTML syntax). That is, {@code <} is
176     returned as {@code &lt;}. The escaped string is suitable for use both in attributes and in text data.
177     @param data the un-escaped string to escape
178     @return the escaped string
179     @see #escape(String, OutputSettings)
180     */
181    public static String escape(String data) {
182        return escapeString(data, base, Syntax.html, DataUtil.UTF_8);
183    }
184
185    private static String escapeString(String data, EscapeMode escapeMode, Syntax syntax, Charset charset) {
186        if (data == null)
187            return "";
188        StringBuilder accum = StringUtil.borrowBuilder();
189        try {
190            doEscape(data, accum, escapeMode, syntax, charset, ForText | ForAttribute);
191        } catch (IOException e) {
192            throw new SerializationException(e); // doesn't happen
193        }
194        return StringUtil.releaseBuilder(accum);
195    }
196
197
198    static void escape(Appendable accum, String data, OutputSettings out, int options) throws IOException {
199        doEscape(data, accum, out.escapeMode(), out.syntax(), out.charset(), options);
200    }
201
202    private static void doEscape(String data, Appendable accum, EscapeMode mode, Syntax syntax, Charset charset, int options) throws IOException {
203        final CoreCharset coreCharset = CoreCharset.byName(charset.name());
204        final CharsetEncoder fallback = encoderFor(charset);
205        final int length = data.length();
206
207        int codePoint;
208        boolean lastWasWhite = false;
209        boolean reachedNonWhite = false;
210        boolean skipped = false;
211        for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
212            codePoint = data.codePointAt(offset);
213
214            if ((options & Normalise) != 0) {
215                if (StringUtil.isWhitespace(codePoint)) {
216                    if ((options & TrimLeading) != 0 && !reachedNonWhite) continue;
217                    if (lastWasWhite) continue;
218                    if ((options & TrimTrailing) != 0) {
219                        skipped = true;
220                        continue;
221                    }
222                    accum.append(' ');
223                    lastWasWhite = true;
224                    continue;
225                } else {
226                    lastWasWhite = false;
227                    reachedNonWhite = true;
228                    if (skipped) {
229                        accum.append(' '); // wasn't the end, so need to place a normalized space
230                        skipped = false;
231                    }
232                }
233            }
234            appendEscaped(codePoint, accum, options, mode, syntax, coreCharset, fallback);
235        }
236    }
237
238    private static void appendEscaped(int codePoint, Appendable accum, int options, EscapeMode escapeMode,
239        Syntax syntax, CoreCharset coreCharset, CharsetEncoder fallback) throws IOException {
240        // specific character range for xml 1.0; drop (not encode) if so
241        if (EscapeMode.xhtml == escapeMode && !isValidXmlChar(codePoint)) {
242            return;
243        }
244
245        // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
246        final char c = (char) codePoint;
247        if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
248            // html specific and required escapes:
249            switch (c) {
250                case '&':
251                    accum.append("&amp;");
252                    break;
253                case 0xA0:
254                    appendNbsp(accum, escapeMode);
255                    break;
256                case '<':
257                    // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val
258                    appendLt(accum, options, escapeMode, syntax);
259                    break;
260                case '>':
261                    if ((options & ForText) != 0) accum.append("&gt;");
262                    else accum.append(c);
263                    break;
264                case '"':
265                    if ((options & ForAttribute) != 0) accum.append("&quot;");
266                    else accum.append(c);
267                    break;
268                case '\'':
269                    // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape.
270                    appendApos(accum, options, escapeMode);
271                    break;
272                // we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets
273                case 0x9:
274                case 0xA:
275                case 0xD:
276                    accum.append(c);
277                    break;
278                default:
279                    if (c < 0x20 || !canEncode(coreCharset, c, fallback)) appendEncoded(accum, escapeMode, codePoint);
280                    else accum.append(c);
281            }
282        } else {
283            if (canEncode(coreCharset, c, fallback)) {
284                // reads into charBuf - we go through these steps to avoid GC objects as much as possible (would be a new String and a new char[2] for each character)
285                char[] chars = charBuf.get();
286                int len = Character.toChars(codePoint, chars, 0);
287                if (accum instanceof StringBuilder) // true unless the user supplied their own
288                    ((StringBuilder) accum).append(chars, 0, len);
289                else
290                    accum.append(new String(chars, 0, len));
291            } else {
292                appendEncoded(accum, escapeMode, codePoint);
293            }
294        }
295    }
296
297    private static final ThreadLocal<char[]> charBuf = ThreadLocal.withInitial(() -> new char[2]);
298
299    private static void appendNbsp(Appendable accum, EscapeMode escapeMode) throws IOException {
300        if (escapeMode != EscapeMode.xhtml) accum.append("&nbsp;");
301        else accum.append("&#xa0;");
302    }
303
304    private static void appendLt(Appendable accum, int options, EscapeMode escapeMode, Syntax syntax) throws IOException {
305        if ((options & ForText) != 0 || escapeMode == EscapeMode.xhtml || syntax == Syntax.xml) accum.append("&lt;");
306        else accum.append('<'); // no need to escape < when in an HTML attribute
307    }
308
309    private static void appendApos(Appendable accum, int options, EscapeMode escapeMode) throws IOException {
310        if ((options & ForAttribute) != 0 && (options & ForText) != 0) {
311            if (escapeMode == EscapeMode.xhtml) accum.append("&#x27;");
312            else accum.append("&apos;");
313        } else {
314            accum.append('\'');
315        }
316    }
317
318    private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
319        final String name = escapeMode.nameForCodepoint(codePoint);
320        if (!emptyName.equals(name)) // ok for identity check
321            accum.append('&').append(name).append(';');
322        else
323            accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
324    }
325
326    /**
327     * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
328     *
329     * @param string the HTML string to un-escape
330     * @return the unescaped string
331     */
332    public static String unescape(String string) {
333        return unescape(string, false);
334    }
335
336    /**
337     * Unescape the input string.
338     *
339     * @param string to un-HTML-escape
340     * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
341     * @return unescaped string
342     */
343    static String unescape(String string, boolean strict) {
344        return Parser.unescapeEntities(string, strict);
345    }
346
347    /*
348     * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
349     * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
350     * performance may be bad. We can add more encoders for common character sets that are impacted by performance
351     * issues on Android if required.
352     *
353     * Benchmarks:     *
354     * OLD toHtml() impl v New (fastpath) in millis
355     * Wiki: 1895, 16
356     * CNN: 6378, 55
357     * Alterslash: 3013, 28
358     * Jsoup: 167, 2
359     */
360    private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
361        // todo add more charset tests if impacted by Android's bad perf in canEncode
362        switch (charset) {
363            case ascii:
364                return c < 0x80;
365            case utf:
366                return !(c >= Character.MIN_SURROGATE && c < (Character.MAX_SURROGATE + 1)); // !Character.isSurrogate(c); but not in Android 10 desugar
367            default:
368                return fallback.canEncode(c);
369        }
370    }
371
372    private static boolean isValidXmlChar(int codePoint) {
373        // https://www.w3.org/TR/2006/REC-xml-20060816/Overview.html#charsets
374        // Char    ::=          #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]  any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
375        return (codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD || (codePoint >= 0x20 && codePoint <= 0xD7FF)
376            || (codePoint >= 0xE000 && codePoint <= 0xFFFD) || (codePoint >= 0x10000 && codePoint <= 0x10FFFF));
377    }
378
379    enum CoreCharset {
380        ascii, utf, fallback;
381
382        static CoreCharset byName(final String name) {
383            if (name.equals("US-ASCII"))
384                return ascii;
385            if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al
386                return utf;
387            return fallback;
388        }
389    }
390
391    // cache the last used fallback encoder to save recreating on every use
392    private static final ThreadLocal<CharsetEncoder> LocalEncoder = new ThreadLocal<>();
393    private static CharsetEncoder encoderFor(Charset charset) {
394        CharsetEncoder encoder = LocalEncoder.get();
395        if (encoder == null || !encoder.charset().equals(charset)) {
396            encoder = charset.newEncoder();
397            LocalEncoder.set(encoder);
398        }
399        return encoder;
400    }
401
402    private static void load(EscapeMode e, String pointsData, int size) {
403        e.nameKeys = new String[size];
404        e.codeVals = new int[size];
405        e.codeKeys = new int[size];
406        e.nameVals = new String[size];
407
408        int i = 0;
409        CharacterReader reader = new CharacterReader(pointsData);
410        try {
411            while (!reader.isEmpty()) {
412                // NotNestedLessLess=10913,824;1887&
413
414                final String name = reader.consumeTo('=');
415                reader.advance();
416                final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
417                final char codeDelim = reader.current();
418                reader.advance();
419                final int cp2;
420                if (codeDelim == ',') {
421                    cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
422                    reader.advance();
423                } else {
424                    cp2 = empty;
425                }
426                final String indexS = reader.consumeTo('&');
427                final int index = Integer.parseInt(indexS, codepointRadix);
428                reader.advance();
429
430                e.nameKeys[i] = name;
431                e.codeVals[i] = cp1;
432                e.codeKeys[index] = cp1;
433                e.nameVals[index] = name;
434
435                if (cp2 != empty) {
436                    multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
437                }
438                i++;
439            }
440
441            Validate.isTrue(i == size, "Unexpected count of entities loaded");
442        } finally {
443            reader.close();
444        }
445    }
446}