001package org.jsoup.nodes;
002
003import org.jsoup.SerializationException;
004import org.jsoup.internal.StringUtil;
005import org.jsoup.helper.Validate;
006import org.jsoup.nodes.Document.OutputSettings;
007import org.jsoup.parser.CharacterReader;
008import org.jsoup.parser.Parser;
009import org.jspecify.annotations.Nullable;
010
011import java.io.IOException;
012import java.nio.charset.CharsetEncoder;
013import java.util.Arrays;
014import java.util.HashMap;
015
016import static org.jsoup.nodes.Document.OutputSettings.*;
017import static org.jsoup.nodes.Entities.EscapeMode.base;
018import static org.jsoup.nodes.Entities.EscapeMode.extended;
019
020/**
021 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
022 * HTML named character references</a>.
023 */
024public class Entities {
025    private static final int empty = -1;
026    private static final String emptyName = "";
027    static final int codepointRadix = 36;
028    private static final char[] codeDelims = {',', ';'};
029    private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
030
031    public enum EscapeMode {
032        /**
033         * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
034         */
035        xhtml(EntitiesData.xmlPoints, 4),
036        /**
037         * Default HTML output entities.
038         */
039        base(EntitiesData.basePoints, 106),
040        /**
041         * Complete HTML entities.
042         */
043        extended(EntitiesData.fullPoints, 2125);
044
045        // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
046        private String[] nameKeys;
047        private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
048
049        // table of codepoints to named entities.
050        private int[] codeKeys; // we don't support multicodepoints to single named value currently
051        private String[] nameVals;
052
053        EscapeMode(String file, int size) {
054            load(this, file, size);
055        }
056
057        int codepointForName(final String name) {
058            int index = Arrays.binarySearch(nameKeys, name);
059            return index >= 0 ? codeVals[index] : empty;
060        }
061
062        String nameForCodepoint(final int codepoint) {
063            final int index = Arrays.binarySearch(codeKeys, codepoint);
064            if (index >= 0) {
065                // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
066                // (and binary search for same item with multi results is undefined
067                return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
068                    nameVals[index + 1] : nameVals[index];
069            }
070            return emptyName;
071        }
072
073        private int size() {
074            return nameKeys.length;
075        }
076    }
077
078    private Entities() {
079    }
080
081    /**
082     * Check if the input is a known named entity
083     *
084     * @param name the possible entity name (e.g. "lt" or "amp")
085     * @return true if a known named entity
086     */
087    public static boolean isNamedEntity(final String name) {
088        return extended.codepointForName(name) != empty;
089    }
090
091    /**
092     * Check if the input is a known named entity in the base entity set.
093     *
094     * @param name the possible entity name (e.g. "lt" or "amp")
095     * @return true if a known named entity in the base set
096     * @see #isNamedEntity(String)
097     */
098    public static boolean isBaseNamedEntity(final String name) {
099        return base.codepointForName(name) != empty;
100    }
101
102    /**
103     * Get the character(s) represented by the named entity
104     *
105     * @param name entity (e.g. "lt" or "amp")
106     * @return the string value of the character(s) represented by this entity, or "" if not defined
107     */
108    public static String getByName(String name) {
109        String val = multipoints.get(name);
110        if (val != null)
111            return val;
112        int codepoint = extended.codepointForName(name);
113        if (codepoint != empty)
114            return new String(new int[]{codepoint}, 0, 1);
115        return emptyName;
116    }
117
118    public static int codepointsForName(final String name, final int[] codepoints) {
119        String val = multipoints.get(name);
120        if (val != null) {
121            codepoints[0] = val.codePointAt(0);
122            codepoints[1] = val.codePointAt(1);
123            return 2;
124        }
125        int codepoint = extended.codepointForName(name);
126        if (codepoint != empty) {
127            codepoints[0] = codepoint;
128            return 1;
129        }
130        return 0;
131    }
132
133    /**
134     HTML escape an input string. That is, {@code <} is returned as {@code &lt;}. The escaped string is suitable for use
135     both in attributes and in text data.
136     @param string the un-escaped string to escape
137     @param out the output settings to use. This configures the character set escaped against (that is, if a
138     character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML
139     settings.
140     @return the escaped string
141     */
142    public static String escape(String string, OutputSettings out) {
143        if (string == null)
144            return "";
145        StringBuilder accum = StringUtil.borrowBuilder();
146        try {
147            escape(accum, string, out, true, true, false, false, false); // for text and for attribute; preserve whitespaces
148        } catch (IOException e) {
149            throw new SerializationException(e); // doesn't happen
150        }
151        return StringUtil.releaseBuilder(accum);
152    }
153
154    /**
155     * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
156     * {@code &lt;}. The escaped string is suitable for use both in attributes and in text data.
157     *
158     * @param string the un-escaped string to escape
159     * @return the escaped string
160     * @see #escape(String, OutputSettings)
161     */
162    public static String escape(String string) {
163        if (DefaultOutput == null)
164            DefaultOutput = new OutputSettings();
165        return escape(string, DefaultOutput);
166    }
167    private static @Nullable OutputSettings DefaultOutput; // lazy-init, to break circular dependency with OutputSettings
168
169    // this method does a lot, but other breakups cause rescanning and stringbuilder generations
170    static void escape(Appendable accum, String string, OutputSettings out,
171                       boolean forText, boolean forAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException {
172
173        boolean lastWasWhite = false;
174        boolean reachedNonWhite = false;
175        final EscapeMode escapeMode = out.escapeMode();
176        final CharsetEncoder encoder = out.encoder();
177        final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
178        final int length = string.length();
179
180        int codePoint;
181        boolean skipped = false;
182        for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
183            codePoint = string.codePointAt(offset);
184
185            if (normaliseWhite) {
186                if (StringUtil.isWhitespace(codePoint)) {
187                    if (stripLeadingWhite && !reachedNonWhite) continue;
188                    if (lastWasWhite) continue;
189                    if (trimTrailing) {
190                        skipped = true;
191                        continue;
192                    }
193                    accum.append(' ');
194                    lastWasWhite = true;
195                    continue;
196                } else {
197                    lastWasWhite = false;
198                    reachedNonWhite = true;
199                    if (skipped) {
200                        accum.append(' '); // wasn't the end, so need to place a normalized space
201                        skipped = false;
202                    }
203                }
204            }
205            // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
206            if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
207                final char c = (char) codePoint;
208                // html specific and required escapes:
209                switch (c) {
210                    case '&':
211                        accum.append("&amp;");
212                        break;
213                    case 0xA0:
214                        if (escapeMode != EscapeMode.xhtml)
215                            accum.append("&nbsp;");
216                        else
217                            accum.append("&#xa0;");
218                        break;
219                    case '<':
220                        // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val
221                        if (forText || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml)
222                            accum.append("&lt;");
223                        else
224                            accum.append(c);
225                        break;
226                    case '>':
227                        if (forText)
228                            accum.append("&gt;");
229                        else
230                            accum.append(c);
231                        break;
232                    case '"':
233                        if (forAttribute)
234                            accum.append("&quot;");
235                        else
236                            accum.append(c);
237                        break;
238                    case '\'':
239                        if (forAttribute && forText) { // special case for the Entities.escape(string) method when we are maximally escaping. Otherwise, because we output attributes in "", there's no need to escape.
240                            if (escapeMode == EscapeMode.xhtml)
241                                accum.append("&#x27;");
242                            else
243                                accum.append("&apos;");
244                        }
245                        else
246                            accum.append(c);
247                        break;
248                    // we escape ascii control <x20 (other than tab, line-feed, carriage return)  for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets
249                    case 0x9:
250                    case 0xA:
251                    case 0xD:
252                        accum.append(c);
253                        break;
254                    default:
255                        if (c < 0x20 || !canEncode(coreCharset, c, encoder))
256                            appendEncoded(accum, escapeMode, codePoint);
257                        else
258                            accum.append(c);
259                }
260            } else {
261                final String c = new String(Character.toChars(codePoint));
262                if (encoder.canEncode(c)) // uses fallback encoder for simplicity
263                    accum.append(c);
264                else
265                    appendEncoded(accum, escapeMode, codePoint);
266            }
267        }
268    }
269
270    private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
271        final String name = escapeMode.nameForCodepoint(codePoint);
272        if (!emptyName.equals(name)) // ok for identity check
273            accum.append('&').append(name).append(';');
274        else
275            accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
276    }
277
278    /**
279     * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
280     *
281     * @param string the HTML string to un-escape
282     * @return the unescaped string
283     */
284    public static String unescape(String string) {
285        return unescape(string, false);
286    }
287
288    /**
289     * Unescape the input string.
290     *
291     * @param string to un-HTML-escape
292     * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
293     * @return unescaped string
294     */
295    static String unescape(String string, boolean strict) {
296        return Parser.unescapeEntities(string, strict);
297    }
298
299    /*
300     * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
301     * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
302     * performance may be bad. We can add more encoders for common character sets that are impacted by performance
303     * issues on Android if required.
304     *
305     * Benchmarks:     *
306     * OLD toHtml() impl v New (fastpath) in millis
307     * Wiki: 1895, 16
308     * CNN: 6378, 55
309     * Alterslash: 3013, 28
310     * Jsoup: 167, 2
311     */
312    private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
313        // todo add more charset tests if impacted by Android's bad perf in canEncode
314        switch (charset) {
315            case ascii:
316                return c < 0x80;
317            case utf:
318                return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
319            default:
320                return fallback.canEncode(c);
321        }
322    }
323
324    enum CoreCharset {
325        ascii, utf, fallback;
326
327        static CoreCharset byName(final String name) {
328            if (name.equals("US-ASCII"))
329                return ascii;
330            if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al
331                return utf;
332            return fallback;
333        }
334    }
335
336    private static void load(EscapeMode e, String pointsData, int size) {
337        e.nameKeys = new String[size];
338        e.codeVals = new int[size];
339        e.codeKeys = new int[size];
340        e.nameVals = new String[size];
341
342        int i = 0;
343        CharacterReader reader = new CharacterReader(pointsData);
344        try {
345            while (!reader.isEmpty()) {
346                // NotNestedLessLess=10913,824;1887&
347
348                final String name = reader.consumeTo('=');
349                reader.advance();
350                final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
351                final char codeDelim = reader.current();
352                reader.advance();
353                final int cp2;
354                if (codeDelim == ',') {
355                    cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
356                    reader.advance();
357                } else {
358                    cp2 = empty;
359                }
360                final String indexS = reader.consumeTo('&');
361                final int index = Integer.parseInt(indexS, codepointRadix);
362                reader.advance();
363
364                e.nameKeys[i] = name;
365                e.codeVals[i] = cp1;
366                e.codeKeys[index] = cp1;
367                e.nameVals[index] = name;
368
369                if (cp2 != empty) {
370                    multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
371                }
372                i++;
373            }
374
375            Validate.isTrue(i == size, "Unexpected count of entities loaded");
376        } finally {
377            reader.close();
378        }
379    }
380}