001package org.jsoup.parser;
002
003import org.jsoup.UncheckedIOException;
004import org.jsoup.helper.Validate;
005import org.jsoup.internal.SoftPool;
006import org.jspecify.annotations.Nullable;
007
008import java.io.IOException;
009import java.io.Reader;
010import java.io.StringReader;
011import java.util.ArrayList;
012import java.util.Arrays;
013import java.util.Collections;
014import java.util.Locale;
015
016/**
017 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
018 */
019public final class CharacterReader {
020    static final char EOF = (char) -1;
021    private static final int MaxStringCacheLen = 12;
022    private static final int StringCacheSize = 512;
023    private String[] stringCache; // holds reused strings in this doc, to lessen garbage
024    private static final SoftPool<String[]> StringPool = new SoftPool<>(() -> new String[StringCacheSize]); // reuse cache between iterations
025
026    static final int BufferSize = 1024 * 2;         // visible for testing
027    static final int RefillPoint = BufferSize / 2;  // when bufPos characters read, refill; visible for testing
028    private static final int RewindLimit = 1024;    // the maximum we can rewind. No HTML entities can be larger than this.
029
030    private Reader reader;      // underlying Reader, will be backed by a buffered+controlled input stream, or StringReader
031    private char[] charBuf;     // character buffer we consume from; filled from Reader
032    private int bufPos;         // position in charBuf that's been consumed to
033    private int bufLength;      // the num of characters actually buffered in charBuf, <= charBuf.length
034    private int fillPoint = 0;  // how far into the charBuf we read before re-filling. 0.5 of charBuf.length after bufferUp
035    private int consumed;       // how many characters total have been consumed from this CharacterReader (less the current bufPos)
036    private int bufMark = -1;   // if not -1, the marked rewind position
037    private boolean readFully;  // if the underlying stream has been completely read, no value in further buffering
038
039    private static final SoftPool<char[]> BufferPool = new SoftPool<>(() -> new char[BufferSize]); // recycled char buffer
040
041    @Nullable private ArrayList<Integer> newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp()
042    private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)]
043
044    public CharacterReader(Reader input, int sz) {
045        this(input); // sz is no longer used
046    }
047
048    public CharacterReader(Reader input) {
049        Validate.notNull(input);
050        reader = input;
051        charBuf = BufferPool.borrow();
052        stringCache = StringPool.borrow();
053        bufferUp();
054    }
055
056    public CharacterReader(String input) {
057        this(new StringReader(input));
058    }
059
060    public void close() {
061        if (reader == null)
062            return;
063        try {
064            reader.close();
065        } catch (IOException ignored) {
066        } finally {
067            reader = null;
068            Arrays.fill(charBuf, (char) 0); // before release, clear the buffer. Not required, but acts as a safety net, and makes debug view clearer
069            BufferPool.release(charBuf);
070            charBuf = null;
071            StringPool.release(stringCache); // conversely, we don't clear the string cache, so we can reuse the contents
072            stringCache = null;
073        }
074    }
075
076    private void bufferUp() {
077        if (readFully || bufPos < fillPoint || bufMark != -1)
078            return;
079        doBufferUp(); // structured so bufferUp may become an intrinsic candidate
080    }
081
082    private void doBufferUp() {
083        /*
084        The flow:
085        - if read fully, or if bufPos < fillPoint, or if marked - do not fill.
086        - update readerPos (total amount consumed from this CharacterReader) += bufPos
087        - shift charBuf contents such that bufPos = 0; set next read offset (bufLength) -= shift amount
088        - loop read the Reader until we fill charBuf. bufLength += read.
089        - readFully = true when read = -1
090         */
091        consumed += bufPos;
092        bufLength -= bufPos;
093        if (bufLength > 0)
094            System.arraycopy(charBuf, bufPos, charBuf, 0, bufLength);
095        bufPos = 0;
096        while (bufLength < BufferSize) {
097            try {
098                int read = reader.read(charBuf, bufLength, charBuf.length - bufLength);
099                if (read == -1) {
100                    readFully = true;
101                    break;
102                }
103                bufLength += read;
104            } catch (IOException e) {
105                throw new UncheckedIOException(e);
106            }
107        }
108        fillPoint = Math.min(bufLength, RefillPoint);
109
110        scanBufferForNewlines(); // if enabled, we index newline positions for line number tracking
111        lastIcSeq = null; // cache for last containsIgnoreCase(seq)
112    }
113
114    void mark() {
115        // make sure there is enough look ahead capacity
116        if (bufLength - bufPos < RewindLimit)
117            fillPoint = 0;
118
119        bufferUp();
120        bufMark = bufPos;
121    }
122
123    void unmark() {
124        bufMark = -1;
125    }
126
127    void rewindToMark() {
128        if (bufMark == -1)
129            throw new UncheckedIOException(new IOException("Mark invalid"));
130
131        bufPos = bufMark;
132        unmark();
133    }
134
135    /**
136     * Gets the position currently read to in the content. Starts at 0.
137     * @return current position
138     */
139    public int pos() {
140        return consumed + bufPos;
141    }
142
143    /** Tests if the buffer has been fully read. */
144    boolean readFully() {
145        return readFully;
146    }
147
148    /**
149     Enables or disables line number tracking. By default, will be <b>off</b>.Tracking line numbers improves the
150     legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of
151     use.
152
153     @param track set tracking on|off
154     @since 1.14.3
155     */
156    public void trackNewlines(boolean track) {
157        if (track && newlinePositions == null) {
158            newlinePositions = new ArrayList<>(BufferSize / 80); // rough guess of likely count
159            scanBufferForNewlines(); // first pass when enabled; subsequently called during bufferUp
160        }
161        else if (!track)
162            newlinePositions = null;
163    }
164
165    /**
166     Check if the tracking of newlines is enabled.
167     @return the current newline tracking state
168     @since 1.14.3
169     */
170    public boolean isTrackNewlines() {
171        return newlinePositions != null;
172    }
173
174    /**
175     Get the current line number (that the reader has consumed to). Starts at line #1.
176     @return the current line number, or 1 if line tracking is not enabled.
177     @since 1.14.3
178     @see #trackNewlines(boolean)
179     */
180    public int lineNumber() {
181        return lineNumber(pos());
182    }
183
184    int lineNumber(int pos) {
185        // note that this impl needs to be called before the next buffer up or line numberoffset will be wrong. if that
186        // causes issues, can remove the reset of newlinepositions during buffer, at the cost of a larger tracking array
187        if (!isTrackNewlines())
188            return 1;
189
190        int i = lineNumIndex(pos);
191        if (i == -1)
192            return lineNumberOffset; // first line
193        return i + lineNumberOffset + 1;
194    }
195
196    /**
197     Get the current column number (that the reader has consumed to). Starts at column #1.
198     @return the current column number
199     @since 1.14.3
200     @see #trackNewlines(boolean)
201     */
202    public int columnNumber() {
203        return columnNumber(pos());
204    }
205
206    int columnNumber(int pos) {
207        if (!isTrackNewlines())
208            return pos + 1;
209
210        int i = lineNumIndex(pos);
211        if (i == -1)
212          return pos + 1;
213        return pos - newlinePositions.get(i) + 1;
214    }
215
216    /**
217     Get a formatted string representing the current line and column positions. E.g. <code>5:10</code> indicating line
218     number 5 and column number 10.
219     @return line:col position
220     @since 1.14.3
221     @see #trackNewlines(boolean)
222     */
223    String posLineCol() {
224        return lineNumber() + ":" + columnNumber();
225    }
226
227    private int lineNumIndex(int pos) {
228        if (!isTrackNewlines()) return 0;
229        int i = Collections.binarySearch(newlinePositions, pos);
230        if (i < -1) i = Math.abs(i) - 2;
231        return i;
232    }
233
234    /**
235     Scans the buffer for newline position, and tracks their location in newlinePositions.
236     */
237    private void scanBufferForNewlines() {
238        if (!isTrackNewlines())
239            return;
240
241        if (newlinePositions.size() > 0) {
242            // work out the line number that we have read up to (as we have likely scanned past this point)
243            int index = lineNumIndex(consumed);
244            if (index == -1) index = 0; // first line
245            int linePos = newlinePositions.get(index);
246            lineNumberOffset += index; // the num lines we've read up to
247            newlinePositions.clear();
248            newlinePositions.add(linePos); // roll the last read pos to first, for cursor num after buffer
249        }
250
251        for (int i = bufPos; i < bufLength; i++) {
252            if (charBuf[i] == '\n')
253                newlinePositions.add(1 + consumed + i);
254        }
255    }
256
257    /**
258     * Tests if all the content has been read.
259     * @return true if nothing left to read.
260     */
261    public boolean isEmpty() {
262        bufferUp();
263        return bufPos >= bufLength;
264    }
265
266    private boolean isEmptyNoBufferUp() {
267        return bufPos >= bufLength;
268    }
269
270    /**
271     * Get the char at the current position.
272     * @return char
273     */
274    public char current() {
275        bufferUp();
276        return isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
277    }
278
279    char consume() {
280        bufferUp();
281        char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
282        bufPos++;
283        return val;
284    }
285
286    /**
287     Unconsume one character (bufPos--). MUST only be called directly after a consume(), and no chance of a bufferUp.
288     */
289    void unconsume() {
290        if (bufPos < 1)
291            throw new UncheckedIOException(new IOException("WTF: No buffer left to unconsume.")); // a bug if this fires, need to trace it.
292
293        bufPos--;
294    }
295
296    /**
297     * Moves the current position by one.
298     */
299    public void advance() {
300        bufPos++;
301    }
302
303    /**
304     * Returns the number of characters between the current position and the next instance of the input char
305     * @param c scan target
306     * @return offset between current position and next instance of target. -1 if not found.
307     */
308    int nextIndexOf(char c) {
309        // doesn't handle scanning for surrogates
310        bufferUp();
311        for (int i = bufPos; i < bufLength; i++) {
312            if (c == charBuf[i])
313                return i - bufPos;
314        }
315        return -1;
316    }
317
318    /**
319     * Returns the number of characters between the current position and the next instance of the input sequence
320     *
321     * @param seq scan target
322     * @return offset between current position and next instance of target. -1 if not found.
323     */
324    int nextIndexOf(CharSequence seq) {
325        bufferUp();
326        // doesn't handle scanning for surrogates
327        char startChar = seq.charAt(0);
328        for (int offset = bufPos; offset < bufLength; offset++) {
329            // scan to first instance of startchar:
330            if (startChar != charBuf[offset])
331                while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
332            int i = offset + 1;
333            int last = i + seq.length()-1;
334            if (offset < bufLength && last <= bufLength) {
335                for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
336                if (i == last) // found full sequence
337                    return offset - bufPos;
338            }
339        }
340        return -1;
341    }
342
343    /**
344     * Reads characters up to the specific char.
345     * @param c the delimiter
346     * @return the chars read
347     */
348    public String consumeTo(char c) {
349        int offset = nextIndexOf(c);
350        if (offset != -1) {
351            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
352            bufPos += offset;
353            return consumed;
354        } else {
355            return consumeToEnd();
356        }
357    }
358
359    String consumeTo(String seq) {
360        int offset = nextIndexOf(seq);
361        if (offset != -1) {
362            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
363            bufPos += offset;
364            return consumed;
365        } else if (bufLength - bufPos < seq.length()) {
366            // nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF
367            return consumeToEnd();
368        } else {
369            // the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters
370            // unread in case they contain the beginning of the search string
371            int endPos = bufLength - seq.length() + 1;
372            String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos);
373            bufPos = endPos;
374            return consumed;
375        }
376    }
377
378    /**
379     * Read characters until the first of any delimiters is found.
380     * @param chars delimiters to scan for
381     * @return characters read up to the matched delimiter.
382     */
383    public String consumeToAny(final char... chars) {
384        bufferUp();
385        int pos = bufPos;
386        final int start = pos;
387        final int remaining = bufLength;
388        final char[] val = charBuf;
389        final int charLen = chars.length;
390        int i;
391
392        OUTER: while (pos < remaining) {
393            for (i = 0; i < charLen; i++) {
394                if (val[pos] == chars[i])
395                    break OUTER;
396            }
397            pos++;
398        }
399
400        bufPos = pos;
401        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
402    }
403
404    String consumeToAnySorted(final char... chars) {
405        bufferUp();
406        int pos = bufPos;
407        final int start = pos;
408        final int remaining = bufLength;
409        final char[] val = charBuf;
410
411        while (pos < remaining) {
412            if (Arrays.binarySearch(chars, val[pos]) >= 0)
413                break;
414            pos++;
415        }
416        bufPos = pos;
417        return bufPos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
418    }
419
420    String consumeData() {
421        // &, <, null
422        //bufferUp(); // no need to bufferUp, just called consume()
423        int pos = bufPos;
424        final int start = pos;
425        final int remaining = bufLength;
426        final char[] val = charBuf;
427
428        OUTER: while (pos < remaining) {
429            switch (val[pos]) {
430                case '&':
431                case '<':
432                case TokeniserState.nullChar:
433                    break OUTER;
434                default:
435                    pos++;
436            }
437        }
438        bufPos = pos;
439        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
440    }
441
442    String consumeAttributeQuoted(final boolean single) {
443        // null, " or ', &
444        //bufferUp(); // no need to bufferUp, just called consume()
445        int pos = bufPos;
446        final int start = pos;
447        final int remaining = bufLength;
448        final char[] val = charBuf;
449
450        OUTER: while (pos < remaining) {
451            switch (val[pos]) {
452                case '&':
453                case TokeniserState.nullChar:
454                    break OUTER;
455                case '\'':
456                    if (single) break OUTER;
457                    break;
458                case '"':
459                    if (!single) break OUTER;
460                    break;
461            }
462            pos++;
463        }
464        bufPos = pos;
465        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
466    }
467
468
469    String consumeRawData() {
470        // <, null
471        //bufferUp(); // no need to bufferUp, just called consume()
472        int pos = bufPos;
473        final int start = pos;
474        final int remaining = bufLength;
475        final char[] val = charBuf;
476
477        OUTER: while (pos < remaining) {
478            switch (val[pos]) {
479                case '<':
480                case TokeniserState.nullChar:
481                    break OUTER;
482                default:
483                    pos++;
484            }
485        }
486        bufPos = pos;
487        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
488    }
489
490    String consumeTagName() {
491        // '\t', '\n', '\r', '\f', ' ', '/', '>'
492        // NOTE: out of spec; does not stop and append on nullChar but eats
493        bufferUp();
494        int pos = bufPos;
495        final int start = pos;
496        final int remaining = bufLength;
497        final char[] val = charBuf;
498
499        OUTER: while (pos < remaining) {
500            switch (val[pos]) {
501                case '\t':
502                case '\n':
503                case '\r':
504                case '\f':
505                case ' ':
506                case '/':
507                case '>':
508                    break OUTER;
509            }
510            pos++;
511        }
512
513        bufPos = pos;
514        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
515    }
516
517    String consumeToEnd() {
518        bufferUp();
519        String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos);
520        bufPos = bufLength;
521        return data;
522    }
523
524    String consumeLetterSequence() {
525        bufferUp();
526        int start = bufPos;
527        while (bufPos < bufLength) {
528            char c = charBuf[bufPos];
529            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
530                bufPos++;
531            else
532                break;
533        }
534
535        return cacheString(charBuf, stringCache, start, bufPos - start);
536    }
537
538    String consumeLetterThenDigitSequence() {
539        bufferUp();
540        int start = bufPos;
541        while (bufPos < bufLength) {
542            char c = charBuf[bufPos];
543            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
544                bufPos++;
545            else
546                break;
547        }
548        while (!isEmptyNoBufferUp()) {
549            char c = charBuf[bufPos];
550            if (c >= '0' && c <= '9')
551                bufPos++;
552            else
553                break;
554        }
555
556        return cacheString(charBuf, stringCache, start, bufPos - start);
557    }
558
559    String consumeHexSequence() {
560        bufferUp();
561        int start = bufPos;
562        while (bufPos < bufLength) {
563            char c = charBuf[bufPos];
564            if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'))
565                bufPos++;
566            else
567                break;
568        }
569        return cacheString(charBuf, stringCache, start, bufPos - start);
570    }
571
572    String consumeDigitSequence() {
573        bufferUp();
574        int start = bufPos;
575        while (bufPos < bufLength) {
576            char c = charBuf[bufPos];
577            if (c >= '0' && c <= '9')
578                bufPos++;
579            else
580                break;
581        }
582        return cacheString(charBuf, stringCache, start, bufPos - start);
583    }
584
585    boolean matches(char c) {
586        return !isEmpty() && charBuf[bufPos] == c;
587
588    }
589
590    boolean matches(String seq) {
591        bufferUp();
592        int scanLength = seq.length();
593        if (scanLength > bufLength - bufPos)
594            return false;
595
596        for (int offset = 0; offset < scanLength; offset++)
597            if (seq.charAt(offset) != charBuf[bufPos +offset])
598                return false;
599        return true;
600    }
601
602    boolean matchesIgnoreCase(String seq) {
603        bufferUp();
604        int scanLength = seq.length();
605        if (scanLength > bufLength - bufPos)
606            return false;
607
608        for (int offset = 0; offset < scanLength; offset++) {
609            char upScan = Character.toUpperCase(seq.charAt(offset));
610            char upTarget = Character.toUpperCase(charBuf[bufPos + offset]);
611            if (upScan != upTarget)
612                return false;
613        }
614        return true;
615    }
616
617    boolean matchesAny(char... seq) {
618        if (isEmpty())
619            return false;
620
621        bufferUp();
622        char c = charBuf[bufPos];
623        for (char seek : seq) {
624            if (seek == c)
625                return true;
626        }
627        return false;
628    }
629
630    boolean matchesAnySorted(char[] seq) {
631        bufferUp();
632        return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
633    }
634
635    boolean matchesLetter() {
636        if (isEmpty())
637            return false;
638        char c = charBuf[bufPos];
639        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c);
640    }
641
642    /**
643     Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha
644     @return if it matches or not
645     */
646    boolean matchesAsciiAlpha() {
647        if (isEmpty())
648            return false;
649        char c = charBuf[bufPos];
650        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
651    }
652
653    boolean matchesDigit() {
654        if (isEmpty())
655            return false;
656        char c = charBuf[bufPos];
657        return (c >= '0' && c <= '9');
658    }
659
660    boolean matchConsume(String seq) {
661        bufferUp();
662        if (matches(seq)) {
663            bufPos += seq.length();
664            return true;
665        } else {
666            return false;
667        }
668    }
669
670    boolean matchConsumeIgnoreCase(String seq) {
671        if (matchesIgnoreCase(seq)) {
672            bufPos += seq.length();
673            return true;
674        } else {
675            return false;
676        }
677    }
678
679    // we maintain a cache of the previously scanned sequence, and return that if applicable on repeated scans.
680    // that improves the situation where there is a sequence of <p<p<p<p<p<p<p...</title> and we're bashing on the <p
681    // looking for the </title>. Resets in bufferUp()
682    @Nullable private String lastIcSeq; // scan cache
683    private int lastIcIndex; // nearest found indexOf
684
685    /** Used to check presence of </title>, </style> when we're in RCData and see a <xxx. Only finds consistent case. */
686    boolean containsIgnoreCase(String seq) {
687        if (seq.equals(lastIcSeq)) {
688            if (lastIcIndex == -1) return false;
689            if (lastIcIndex >= bufPos) return true;
690        }
691        lastIcSeq = seq;
692
693        String loScan = seq.toLowerCase(Locale.ENGLISH);
694        int lo = nextIndexOf(loScan);
695        if (lo > -1) {
696            lastIcIndex = bufPos + lo; return true;
697        }
698
699        String hiScan = seq.toUpperCase(Locale.ENGLISH);
700        int hi = nextIndexOf(hiScan);
701        boolean found = hi > -1;
702        lastIcIndex = found ? bufPos + hi : -1; // we don't care about finding the nearest, just that buf contains
703        return found;
704    }
705
706    @Override
707    public String toString() {
708        if (bufLength - bufPos < 0)
709            return "";
710        return new String(charBuf, bufPos, bufLength - bufPos);
711    }
712
713    /**
714     * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks.
715     * <p />
716     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
717     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
718     * some more duplicates.
719     */
720    private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
721        if (count > MaxStringCacheLen) // don't cache strings that are too big
722            return new String(charBuf, start, count);
723        if (count < 1)
724            return "";
725
726        // calculate hash:
727        int hash = 0;
728        int end = count + start;
729        for (int i = start; i < end; i++) {
730            hash = 31 * hash + charBuf[i];
731        }
732
733        // get from cache
734        final int index = hash & StringCacheSize - 1;
735        String cached = stringCache[index];
736
737        if (cached != null && rangeEquals(charBuf, start, count, cached)) // positive hit
738            return cached;
739        else {
740            cached = new String(charBuf, start, count);
741            stringCache[index] = cached; // add or replace, assuming most recently used are most likely to recur next
742        }
743
744        return cached;
745    }
746
747    /**
748     * Check if the value of the provided range equals the string.
749     */
750    static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
751        if (count == cached.length()) {
752            int i = start;
753            int j = 0;
754            while (count-- != 0) {
755                if (charBuf[i++] != cached.charAt(j++))
756                    return false;
757            }
758            return true;
759        }
760        return false;
761    }
762
763    // just used for testing
764    boolean rangeEquals(final int start, final int count, final String cached) {
765        return rangeEquals(charBuf, start, count, cached);
766    }
767}