001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.SoftPool;
005import org.jsoup.internal.StringUtil;
006import org.jspecify.annotations.Nullable;
007
008import java.io.IOException;
009import java.io.UncheckedIOException;
010import java.io.Reader;
011import java.io.StringReader;
012import java.util.ArrayList;
013import java.util.Arrays;
014import java.util.Collections;
015import java.util.Locale;
016
017/**
018 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
019 <p>If the underlying reader throws an IOException during any operation, the CharacterReader will throw an
020 {@link UncheckedIOException}. That won't happen with String / StringReader inputs.</p>
021 */
022public final class CharacterReader implements AutoCloseable {
023    static final char EOF = (char) -1;
024    private static final int MaxStringCacheLen = 12;
025    private static final int StringCacheSize = 512;
026    private String[] stringCache; // holds reused strings in this doc, to lessen garbage
027    private static final SoftPool<String[]> StringPool = new SoftPool<>(() -> new String[StringCacheSize]); // reuse cache between iterations
028
029    static final int BufferSize = 1024 * 2;         // visible for testing
030    static final int RefillPoint = BufferSize / 2;  // when bufPos characters read, refill; visible for testing
031    private static final int RewindLimit = 1024;    // the maximum we can rewind. No HTML entities can be larger than this.
032
033    private Reader reader;      // underlying Reader, will be backed by a buffered+controlled input stream, or StringReader
034    private char[] charBuf;     // character buffer we consume from; filled from Reader
035    private int bufPos;         // position in charBuf that's been consumed to
036    private int bufLength;      // the num of characters actually buffered in charBuf, <= charBuf.length
037    private int fillPoint = 0;  // how far into the charBuf we read before re-filling. 0.5 of charBuf.length after bufferUp
038    private int consumed;       // how many characters total have been consumed from this CharacterReader (less the current bufPos)
039    private int bufMark = -1;   // if not -1, the marked rewind position
040    private boolean readFully;  // if the underlying stream has been completely read, no value in further buffering
041
042    private static final SoftPool<char[]> BufferPool = new SoftPool<>(() -> new char[BufferSize]); // recycled char buffer
043
044    @Nullable private ArrayList<Integer> newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp()
045    private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)]
046
047    public CharacterReader(Reader input, int sz) {
048        this(input); // sz is no longer used
049    }
050
051    public CharacterReader(Reader input) {
052        Validate.notNull(input);
053        reader = input;
054        charBuf = BufferPool.borrow();
055        stringCache = StringPool.borrow();
056        bufferUp();
057    }
058
059    public CharacterReader(String input) {
060        this(new StringReader(input));
061    }
062
063    @Override
064    public void close() {
065        if (reader == null)
066            return;
067        try {
068            reader.close();
069        } catch (IOException ignored) {
070        } finally {
071            reader = null;
072            Arrays.fill(charBuf, (char) 0); // before release, clear the buffer. Not required, but acts as a safety net, and makes debug view clearer
073            BufferPool.release(charBuf);
074            charBuf = null;
075            StringPool.release(stringCache); // conversely, we don't clear the string cache, so we can reuse the contents
076            stringCache = null;
077        }
078    }
079
080    private void bufferUp() {
081        if (readFully || bufPos < fillPoint || bufMark != -1)
082            return;
083        doBufferUp(); // structured so bufferUp may become an intrinsic candidate
084    }
085
086    /**
087     Reads into the buffer. Will throw an UncheckedIOException if the underling reader throws an IOException.
088     @throws UncheckedIOException if the underlying reader throws an IOException
089     */
090    private void doBufferUp() {
091        /*
092        The flow:
093        - if read fully, or if bufPos < fillPoint, or if marked - do not fill.
094        - update readerPos (total amount consumed from this CharacterReader) += bufPos
095        - shift charBuf contents such that bufPos = 0; set next read offset (bufLength) -= shift amount
096        - loop read the Reader until we fill charBuf. bufLength += read.
097        - readFully = true when read = -1
098         */
099        consumed += bufPos;
100        bufLength -= bufPos;
101        if (bufLength > 0)
102            System.arraycopy(charBuf, bufPos, charBuf, 0, bufLength);
103        bufPos = 0;
104        while (bufLength < BufferSize) {
105            try {
106                int read = reader.read(charBuf, bufLength, charBuf.length - bufLength);
107                if (read == -1) {
108                    readFully = true;
109                    break;
110                }
111                bufLength += read;
112            } catch (IOException e) {
113                throw new UncheckedIOException(e);
114            }
115        }
116        fillPoint = Math.min(bufLength, RefillPoint);
117
118        scanBufferForNewlines(); // if enabled, we index newline positions for line number tracking
119        lastIcSeq = null; // cache for last containsIgnoreCase(seq)
120    }
121
122    void mark() {
123        // make sure there is enough look ahead capacity
124        if (bufLength - bufPos < RewindLimit)
125            fillPoint = 0;
126
127        bufferUp();
128        bufMark = bufPos;
129    }
130
131    void unmark() {
132        bufMark = -1;
133    }
134
135    void rewindToMark() {
136        if (bufMark == -1)
137            throw new UncheckedIOException(new IOException("Mark invalid"));
138
139        bufPos = bufMark;
140        unmark();
141    }
142
143    /**
144     * Gets the position currently read to in the content. Starts at 0.
145     * @return current position
146     */
147    public int pos() {
148        return consumed + bufPos;
149    }
150
151    /** Tests if the buffer has been fully read. */
152    boolean readFully() {
153        return readFully;
154    }
155
156    /**
157     Enables or disables line number tracking. By default, will be <b>off</b>.Tracking line numbers improves the
158     legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of
159     use.
160
161     @param track set tracking on|off
162     @since 1.14.3
163     */
164    public void trackNewlines(boolean track) {
165        if (track && newlinePositions == null) {
166            newlinePositions = new ArrayList<>(BufferSize / 80); // rough guess of likely count
167            scanBufferForNewlines(); // first pass when enabled; subsequently called during bufferUp
168        }
169        else if (!track)
170            newlinePositions = null;
171    }
172
173    /**
174     Check if the tracking of newlines is enabled.
175     @return the current newline tracking state
176     @since 1.14.3
177     */
178    public boolean isTrackNewlines() {
179        return newlinePositions != null;
180    }
181
182    /**
183     Get the current line number (that the reader has consumed to). Starts at line #1.
184     @return the current line number, or 1 if line tracking is not enabled.
185     @since 1.14.3
186     @see #trackNewlines(boolean)
187     */
188    public int lineNumber() {
189        return lineNumber(pos());
190    }
191
192    int lineNumber(int pos) {
193        // note that this impl needs to be called before the next buffer up or line numberoffset will be wrong. if that
194        // causes issues, can remove the reset of newlinepositions during buffer, at the cost of a larger tracking array
195        if (!isTrackNewlines())
196            return 1;
197
198        int i = lineNumIndex(pos);
199        if (i == -1)
200            return lineNumberOffset; // first line
201        return i + lineNumberOffset + 1;
202    }
203
204    /**
205     Get the current column number (that the reader has consumed to). Starts at column #1.
206     @return the current column number
207     @since 1.14.3
208     @see #trackNewlines(boolean)
209     */
210    public int columnNumber() {
211        return columnNumber(pos());
212    }
213
214    int columnNumber(int pos) {
215        if (!isTrackNewlines())
216            return pos + 1;
217
218        int i = lineNumIndex(pos);
219        if (i == -1)
220          return pos + 1;
221        return pos - newlinePositions.get(i) + 1;
222    }
223
224    /**
225     Get a formatted string representing the current line and column positions. E.g. <code>5:10</code> indicating line
226     number 5 and column number 10.
227     @return line:col position
228     @since 1.14.3
229     @see #trackNewlines(boolean)
230     */
231    String posLineCol() {
232        return lineNumber() + ":" + columnNumber();
233    }
234
235    private int lineNumIndex(int pos) {
236        if (!isTrackNewlines()) return 0;
237        int i = Collections.binarySearch(newlinePositions, pos);
238        if (i < -1) i = Math.abs(i) - 2;
239        return i;
240    }
241
242    /**
243     Scans the buffer for newline position, and tracks their location in newlinePositions.
244     */
245    private void scanBufferForNewlines() {
246        if (!isTrackNewlines())
247            return;
248
249        if (newlinePositions.size() > 0) {
250            // work out the line number that we have read up to (as we have likely scanned past this point)
251            int index = lineNumIndex(consumed);
252            if (index == -1) index = 0; // first line
253            int linePos = newlinePositions.get(index);
254            lineNumberOffset += index; // the num lines we've read up to
255            newlinePositions.clear();
256            newlinePositions.add(linePos); // roll the last read pos to first, for cursor num after buffer
257        }
258
259        for (int i = bufPos; i < bufLength; i++) {
260            if (charBuf[i] == '\n')
261                newlinePositions.add(1 + consumed + i);
262        }
263    }
264
265    /**
266     * Tests if all the content has been read.
267     * @return true if nothing left to read.
268     */
269    public boolean isEmpty() {
270        bufferUp();
271        return bufPos >= bufLength;
272    }
273
274    private boolean isEmptyNoBufferUp() {
275        return bufPos >= bufLength;
276    }
277
278    /**
279     * Get the char at the current position.
280     * @return char
281     */
282    public char current() {
283        bufferUp();
284        return isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
285    }
286
287    /**
288     Consume one character off the queue.
289     @return first character on queue, or EOF if the queue is empty.
290     */
291    public char consume() {
292        bufferUp();
293        char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
294        bufPos++;
295        return val;
296    }
297
298    /**
299     Unconsume one character (bufPos--). MUST only be called directly after a consume(), and no chance of a bufferUp.
300     */
301    void unconsume() {
302        if (bufPos < 1)
303            throw new UncheckedIOException(new IOException("WTF: No buffer left to unconsume.")); // a bug if this fires, need to trace it.
304
305        bufPos--;
306    }
307
308    /**
309     * Moves the current position by one.
310     */
311    public void advance() {
312        bufPos++;
313    }
314
315    /**
316     * Returns the number of characters between the current position and the next instance of the input char
317     * @param c scan target
318     * @return offset between current position and next instance of target. -1 if not found.
319     */
320    int nextIndexOf(char c) {
321        // doesn't handle scanning for surrogates
322        bufferUp();
323        for (int i = bufPos; i < bufLength; i++) {
324            if (c == charBuf[i])
325                return i - bufPos;
326        }
327        return -1;
328    }
329
330    /**
331     * Returns the number of characters between the current position and the next instance of the input sequence
332     *
333     * @param seq scan target
334     * @return offset between current position and next instance of target. -1 if not found.
335     */
336    int nextIndexOf(CharSequence seq) {
337        bufferUp();
338        // doesn't handle scanning for surrogates
339        char startChar = seq.charAt(0);
340        for (int offset = bufPos; offset < bufLength; offset++) {
341            // scan to first instance of startchar:
342            if (startChar != charBuf[offset])
343                while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
344            int i = offset + 1;
345            int last = i + seq.length()-1;
346            if (offset < bufLength && last <= bufLength) {
347                for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
348                if (i == last) // found full sequence
349                    return offset - bufPos;
350            }
351        }
352        return -1;
353    }
354
355    /**
356     * Reads characters up to the specific char.
357     * @param c the delimiter
358     * @return the chars read
359     */
360    public String consumeTo(char c) {
361        int offset = nextIndexOf(c);
362        if (offset != -1) {
363            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
364            bufPos += offset;
365            return consumed;
366        } else {
367            return consumeToEnd();
368        }
369    }
370
371    /**
372     Reads the characters up to (but not including) the specified case-sensitive string.
373     <p>If the sequence is not found in the buffer, will return the remainder of the current buffered amount, less the
374     length of the sequence, such that this call may be repeated.
375     @param seq the delimiter
376     @return the chars read
377     */
378    public String consumeTo(String seq) {
379        int offset = nextIndexOf(seq);
380        if (offset != -1) {
381            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
382            bufPos += offset;
383            return consumed;
384        } else if (bufLength - bufPos < seq.length()) {
385            // nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF
386            return consumeToEnd();
387        } else {
388            // the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters
389            // unread in case they contain the beginning of the search string
390            int endPos = bufLength - seq.length() + 1;
391            String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos);
392            bufPos = endPos;
393            return consumed;
394        }
395    }
396
397    /**
398     Read characters while the input predicate returns true.
399     @return characters read
400     */
401    String consumeMatching(CharPredicate func) {
402        return consumeMatching(func, -1);
403    }
404
405    /**
406     Read characters while the input predicate returns true, up to a maximum length.
407     @param func predicate to test
408     @param maxLength maximum length to read. -1 indicates no maximum
409     @return characters read
410     */
411    String consumeMatching(CharPredicate func, int maxLength) {
412        bufferUp();
413        int pos = bufPos;
414        final int start = pos;
415        final int remaining = bufLength;
416        final char[] val = charBuf;
417
418        while (pos < remaining && (maxLength == -1 || pos - start < maxLength) && func.test(val[pos])) {
419            pos++;
420        }
421
422        bufPos = pos;
423        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
424    }
425
426    /**
427     * Read characters until the first of any delimiters is found.
428     * @param chars delimiters to scan for
429     * @return characters read up to the matched delimiter.
430     */
431    public String consumeToAny(final char... chars) {
432        return consumeMatching(c -> { // seeks until we see one of the terminating chars
433            for (char seek : chars)
434                if (c == seek) return false;
435            return true;
436        });
437    }
438
439    String consumeToAnySorted(final char... chars) {
440        return consumeMatching(c -> Arrays.binarySearch(chars, c) < 0); // matches until a hit
441    }
442
443    String consumeData() {
444        // consumes until &, <, null
445        return consumeMatching(c -> c != '&' && c != '<' && c != TokeniserState.nullChar);
446    }
447
448    String consumeAttributeQuoted(final boolean single) {
449        // null, " or ', &
450        return consumeMatching(c -> c != TokeniserState.nullChar && c != '&' && (single ? c != '\'' : c != '"'));
451    }
452
453    String consumeRawData() {
454        // <, null
455        return consumeMatching(c -> c != '<' && c != TokeniserState.nullChar);
456    }
457
458    String consumeTagName() {
459        // '\t', '\n', '\r', '\f', ' ', '/', '>'
460        // NOTE: out of spec; does not stop and append on nullChar but eats
461        return consumeMatching(c -> {
462            switch (c) {
463                case '\t':
464                case '\n':
465                case '\r':
466                case '\f':
467                case ' ':
468                case '/':
469                case '>':
470                    return false;
471            }
472            return true;
473        });
474    }
475
476    String consumeToEnd() {
477        bufferUp();
478        String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos);
479        bufPos = bufLength;
480        return data;
481    }
482
483    String consumeLetterSequence() {
484        return consumeMatching(Character::isLetter);
485    }
486
487    String consumeLetterThenDigitSequence() {
488        bufferUp();
489        int start = bufPos;
490        while (bufPos < bufLength) {
491            if (StringUtil.isAsciiLetter(charBuf[bufPos])) bufPos++;
492            else break;
493        }
494        while (!isEmptyNoBufferUp()) {
495            if (StringUtil.isDigit(charBuf[bufPos])) bufPos++;
496            else break;
497        }
498
499        return cacheString(charBuf, stringCache, start, bufPos - start);
500    }
501
502    String consumeHexSequence() {
503        return consumeMatching(StringUtil::isHexDigit);
504    }
505
506    String consumeDigitSequence() {
507        return consumeMatching(c -> c >= '0' && c <= '9');
508    }
509
510    boolean matches(char c) {
511        return !isEmpty() && charBuf[bufPos] == c;
512    }
513
514    boolean matches(String seq) {
515        bufferUp();
516        int scanLength = seq.length();
517        if (scanLength > bufLength - bufPos)
518            return false;
519
520        for (int offset = 0; offset < scanLength; offset++)
521            if (seq.charAt(offset) != charBuf[bufPos +offset])
522                return false;
523        return true;
524    }
525
526    boolean matchesIgnoreCase(String seq) {
527        bufferUp();
528        int scanLength = seq.length();
529        if (scanLength > bufLength - bufPos)
530            return false;
531
532        for (int offset = 0; offset < scanLength; offset++) {
533            char scan = seq.charAt(offset);
534            char target = charBuf[bufPos + offset];
535            if (scan == target) continue;
536
537            scan = Character.toUpperCase(scan);
538            target = Character.toUpperCase(target);
539            if (scan != target) return false;
540        }
541        return true;
542    }
543
544    /**
545     Tests if the next character in the queue matches any of the characters in the sequence, case sensitively.
546     @param seq list of characters to check for
547     @return true if any matched, false if none did
548     */
549    boolean matchesAny(char... seq) {
550        if (isEmpty())
551            return false;
552
553        bufferUp();
554        char c = charBuf[bufPos];
555        for (char seek : seq) {
556            if (seek == c)
557                return true;
558        }
559        return false;
560    }
561
562    boolean matchesAnySorted(char[] seq) {
563        bufferUp();
564        return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
565    }
566
567    /**
568     Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha
569     @return if it matches or not
570     */
571    boolean matchesAsciiAlpha() {
572        if (isEmpty()) return false;
573        return StringUtil.isAsciiLetter(charBuf[bufPos]);
574    }
575
576    boolean matchesDigit() {
577        if (isEmpty()) return false;
578        return StringUtil.isDigit(charBuf[bufPos]);
579    }
580
581    boolean matchConsume(String seq) {
582        bufferUp();
583        if (matches(seq)) {
584            bufPos += seq.length();
585            return true;
586        } else {
587            return false;
588        }
589    }
590
591    boolean matchConsumeIgnoreCase(String seq) {
592        if (matchesIgnoreCase(seq)) {
593            bufPos += seq.length();
594            return true;
595        } else {
596            return false;
597        }
598    }
599
600    // we maintain a cache of the previously scanned sequence, and return that if applicable on repeated scans.
601    // that improves the situation where there is a sequence of <p<p<p<p<p<p<p...</title> and we're bashing on the <p
602    // looking for the </title>. Resets in bufferUp()
603    @Nullable private String lastIcSeq; // scan cache
604    private int lastIcIndex; // nearest found indexOf
605
606    /** Used to check presence of </title>, </style> when we're in RCData and see a <xxx. Only finds consistent case. */
607    boolean containsIgnoreCase(String seq) {
608        if (seq.equals(lastIcSeq)) {
609            if (lastIcIndex == -1) return false;
610            if (lastIcIndex >= bufPos) return true;
611        }
612        lastIcSeq = seq;
613
614        String loScan = seq.toLowerCase(Locale.ENGLISH);
615        int lo = nextIndexOf(loScan);
616        if (lo > -1) {
617            lastIcIndex = bufPos + lo; return true;
618        }
619
620        String hiScan = seq.toUpperCase(Locale.ENGLISH);
621        int hi = nextIndexOf(hiScan);
622        boolean found = hi > -1;
623        lastIcIndex = found ? bufPos + hi : -1; // we don't care about finding the nearest, just that buf contains
624        return found;
625    }
626
627    @Override
628    public String toString() {
629        if (bufLength - bufPos < 0) return "";
630        return new String(charBuf, bufPos, bufLength - bufPos);
631    }
632
633    /**
634     * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks.
635     * <p />
636     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
637     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
638     * some more duplicates.
639     */
640    private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
641        if (count > MaxStringCacheLen) // don't cache strings that are too big
642            return new String(charBuf, start, count);
643        if (count < 1)
644            return "";
645
646        // calculate hash:
647        int hash = 0;
648        int end = count + start;
649        for (int i = start; i < end; i++) {
650            hash = 31 * hash + charBuf[i];
651        }
652
653        // get from cache
654        final int index = hash & StringCacheSize - 1;
655        String cached = stringCache[index];
656
657        if (cached != null && rangeEquals(charBuf, start, count, cached)) // positive hit
658            return cached;
659        else {
660            cached = new String(charBuf, start, count);
661            stringCache[index] = cached; // add or replace, assuming most recently used are most likely to recur next
662        }
663
664        return cached;
665    }
666
667    /**
668     * Check if the value of the provided range equals the string.
669     */
670    static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
671        if (count == cached.length()) {
672            int i = start;
673            int j = 0;
674            while (count-- != 0) {
675                if (charBuf[i++] != cached.charAt(j++))
676                    return false;
677            }
678            return true;
679        }
680        return false;
681    }
682
683    // just used for testing
684    boolean rangeEquals(final int start, final int count, final String cached) {
685        return rangeEquals(charBuf, start, count, cached);
686    }
687
688    @FunctionalInterface
689    interface CharPredicate {
690        boolean test(char c);
691    }
692}