001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.internal.SoftPool;
005import org.jsoup.internal.StringUtil;
006import org.jspecify.annotations.Nullable;
007
008import java.io.IOException;
009import java.io.UncheckedIOException;
010import java.io.Reader;
011import java.io.StringReader;
012import java.util.ArrayList;
013import java.util.Arrays;
014import java.util.Collections;
015import java.util.Locale;
016
017/**
018 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
019 */
020public final class CharacterReader {
021    static final char EOF = (char) -1;
022    private static final int MaxStringCacheLen = 12;
023    private static final int StringCacheSize = 512;
024    private String[] stringCache; // holds reused strings in this doc, to lessen garbage
025    private static final SoftPool<String[]> StringPool = new SoftPool<>(() -> new String[StringCacheSize]); // reuse cache between iterations
026
027    static final int BufferSize = 1024 * 2;         // visible for testing
028    static final int RefillPoint = BufferSize / 2;  // when bufPos characters read, refill; visible for testing
029    private static final int RewindLimit = 1024;    // the maximum we can rewind. No HTML entities can be larger than this.
030
031    private Reader reader;      // underlying Reader, will be backed by a buffered+controlled input stream, or StringReader
032    private char[] charBuf;     // character buffer we consume from; filled from Reader
033    private int bufPos;         // position in charBuf that's been consumed to
034    private int bufLength;      // the num of characters actually buffered in charBuf, <= charBuf.length
035    private int fillPoint = 0;  // how far into the charBuf we read before re-filling. 0.5 of charBuf.length after bufferUp
036    private int consumed;       // how many characters total have been consumed from this CharacterReader (less the current bufPos)
037    private int bufMark = -1;   // if not -1, the marked rewind position
038    private boolean readFully;  // if the underlying stream has been completely read, no value in further buffering
039
040    private static final SoftPool<char[]> BufferPool = new SoftPool<>(() -> new char[BufferSize]); // recycled char buffer
041
042    @Nullable private ArrayList<Integer> newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp()
043    private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)]
044
045    public CharacterReader(Reader input, int sz) {
046        this(input); // sz is no longer used
047    }
048
049    public CharacterReader(Reader input) {
050        Validate.notNull(input);
051        reader = input;
052        charBuf = BufferPool.borrow();
053        stringCache = StringPool.borrow();
054        bufferUp();
055    }
056
057    public CharacterReader(String input) {
058        this(new StringReader(input));
059    }
060
061    public void close() {
062        if (reader == null)
063            return;
064        try {
065            reader.close();
066        } catch (IOException ignored) {
067        } finally {
068            reader = null;
069            Arrays.fill(charBuf, (char) 0); // before release, clear the buffer. Not required, but acts as a safety net, and makes debug view clearer
070            BufferPool.release(charBuf);
071            charBuf = null;
072            StringPool.release(stringCache); // conversely, we don't clear the string cache, so we can reuse the contents
073            stringCache = null;
074        }
075    }
076
077    private void bufferUp() {
078        if (readFully || bufPos < fillPoint || bufMark != -1)
079            return;
080        doBufferUp(); // structured so bufferUp may become an intrinsic candidate
081    }
082
083    private void doBufferUp() {
084        /*
085        The flow:
086        - if read fully, or if bufPos < fillPoint, or if marked - do not fill.
087        - update readerPos (total amount consumed from this CharacterReader) += bufPos
088        - shift charBuf contents such that bufPos = 0; set next read offset (bufLength) -= shift amount
089        - loop read the Reader until we fill charBuf. bufLength += read.
090        - readFully = true when read = -1
091         */
092        consumed += bufPos;
093        bufLength -= bufPos;
094        if (bufLength > 0)
095            System.arraycopy(charBuf, bufPos, charBuf, 0, bufLength);
096        bufPos = 0;
097        while (bufLength < BufferSize) {
098            try {
099                int read = reader.read(charBuf, bufLength, charBuf.length - bufLength);
100                if (read == -1) {
101                    readFully = true;
102                    break;
103                }
104                bufLength += read;
105            } catch (IOException e) {
106                throw new UncheckedIOException(e);
107            }
108        }
109        fillPoint = Math.min(bufLength, RefillPoint);
110
111        scanBufferForNewlines(); // if enabled, we index newline positions for line number tracking
112        lastIcSeq = null; // cache for last containsIgnoreCase(seq)
113    }
114
115    void mark() {
116        // make sure there is enough look ahead capacity
117        if (bufLength - bufPos < RewindLimit)
118            fillPoint = 0;
119
120        bufferUp();
121        bufMark = bufPos;
122    }
123
124    void unmark() {
125        bufMark = -1;
126    }
127
128    void rewindToMark() {
129        if (bufMark == -1)
130            throw new UncheckedIOException(new IOException("Mark invalid"));
131
132        bufPos = bufMark;
133        unmark();
134    }
135
136    /**
137     * Gets the position currently read to in the content. Starts at 0.
138     * @return current position
139     */
140    public int pos() {
141        return consumed + bufPos;
142    }
143
144    /** Tests if the buffer has been fully read. */
145    boolean readFully() {
146        return readFully;
147    }
148
149    /**
150     Enables or disables line number tracking. By default, will be <b>off</b>.Tracking line numbers improves the
151     legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of
152     use.
153
154     @param track set tracking on|off
155     @since 1.14.3
156     */
157    public void trackNewlines(boolean track) {
158        if (track && newlinePositions == null) {
159            newlinePositions = new ArrayList<>(BufferSize / 80); // rough guess of likely count
160            scanBufferForNewlines(); // first pass when enabled; subsequently called during bufferUp
161        }
162        else if (!track)
163            newlinePositions = null;
164    }
165
166    /**
167     Check if the tracking of newlines is enabled.
168     @return the current newline tracking state
169     @since 1.14.3
170     */
171    public boolean isTrackNewlines() {
172        return newlinePositions != null;
173    }
174
175    /**
176     Get the current line number (that the reader has consumed to). Starts at line #1.
177     @return the current line number, or 1 if line tracking is not enabled.
178     @since 1.14.3
179     @see #trackNewlines(boolean)
180     */
181    public int lineNumber() {
182        return lineNumber(pos());
183    }
184
185    int lineNumber(int pos) {
186        // note that this impl needs to be called before the next buffer up or line numberoffset will be wrong. if that
187        // causes issues, can remove the reset of newlinepositions during buffer, at the cost of a larger tracking array
188        if (!isTrackNewlines())
189            return 1;
190
191        int i = lineNumIndex(pos);
192        if (i == -1)
193            return lineNumberOffset; // first line
194        return i + lineNumberOffset + 1;
195    }
196
197    /**
198     Get the current column number (that the reader has consumed to). Starts at column #1.
199     @return the current column number
200     @since 1.14.3
201     @see #trackNewlines(boolean)
202     */
203    public int columnNumber() {
204        return columnNumber(pos());
205    }
206
207    int columnNumber(int pos) {
208        if (!isTrackNewlines())
209            return pos + 1;
210
211        int i = lineNumIndex(pos);
212        if (i == -1)
213          return pos + 1;
214        return pos - newlinePositions.get(i) + 1;
215    }
216
217    /**
218     Get a formatted string representing the current line and column positions. E.g. <code>5:10</code> indicating line
219     number 5 and column number 10.
220     @return line:col position
221     @since 1.14.3
222     @see #trackNewlines(boolean)
223     */
224    String posLineCol() {
225        return lineNumber() + ":" + columnNumber();
226    }
227
228    private int lineNumIndex(int pos) {
229        if (!isTrackNewlines()) return 0;
230        int i = Collections.binarySearch(newlinePositions, pos);
231        if (i < -1) i = Math.abs(i) - 2;
232        return i;
233    }
234
235    /**
236     Scans the buffer for newline position, and tracks their location in newlinePositions.
237     */
238    private void scanBufferForNewlines() {
239        if (!isTrackNewlines())
240            return;
241
242        if (newlinePositions.size() > 0) {
243            // work out the line number that we have read up to (as we have likely scanned past this point)
244            int index = lineNumIndex(consumed);
245            if (index == -1) index = 0; // first line
246            int linePos = newlinePositions.get(index);
247            lineNumberOffset += index; // the num lines we've read up to
248            newlinePositions.clear();
249            newlinePositions.add(linePos); // roll the last read pos to first, for cursor num after buffer
250        }
251
252        for (int i = bufPos; i < bufLength; i++) {
253            if (charBuf[i] == '\n')
254                newlinePositions.add(1 + consumed + i);
255        }
256    }
257
258    /**
259     * Tests if all the content has been read.
260     * @return true if nothing left to read.
261     */
262    public boolean isEmpty() {
263        bufferUp();
264        return bufPos >= bufLength;
265    }
266
267    private boolean isEmptyNoBufferUp() {
268        return bufPos >= bufLength;
269    }
270
271    /**
272     * Get the char at the current position.
273     * @return char
274     */
275    public char current() {
276        bufferUp();
277        return isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
278    }
279
280    /**
281     Consume one character off the queue.
282     @return first character on queue, or EOF if the queue is empty.
283     */
284    public char consume() {
285        bufferUp();
286        char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
287        bufPos++;
288        return val;
289    }
290
291    /**
292     Unconsume one character (bufPos--). MUST only be called directly after a consume(), and no chance of a bufferUp.
293     */
294    void unconsume() {
295        if (bufPos < 1)
296            throw new UncheckedIOException(new IOException("WTF: No buffer left to unconsume.")); // a bug if this fires, need to trace it.
297
298        bufPos--;
299    }
300
301    /**
302     * Moves the current position by one.
303     */
304    public void advance() {
305        bufPos++;
306    }
307
308    /**
309     * Returns the number of characters between the current position and the next instance of the input char
310     * @param c scan target
311     * @return offset between current position and next instance of target. -1 if not found.
312     */
313    int nextIndexOf(char c) {
314        // doesn't handle scanning for surrogates
315        bufferUp();
316        for (int i = bufPos; i < bufLength; i++) {
317            if (c == charBuf[i])
318                return i - bufPos;
319        }
320        return -1;
321    }
322
323    /**
324     * Returns the number of characters between the current position and the next instance of the input sequence
325     *
326     * @param seq scan target
327     * @return offset between current position and next instance of target. -1 if not found.
328     */
329    int nextIndexOf(CharSequence seq) {
330        bufferUp();
331        // doesn't handle scanning for surrogates
332        char startChar = seq.charAt(0);
333        for (int offset = bufPos; offset < bufLength; offset++) {
334            // scan to first instance of startchar:
335            if (startChar != charBuf[offset])
336                while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
337            int i = offset + 1;
338            int last = i + seq.length()-1;
339            if (offset < bufLength && last <= bufLength) {
340                for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
341                if (i == last) // found full sequence
342                    return offset - bufPos;
343            }
344        }
345        return -1;
346    }
347
348    /**
349     * Reads characters up to the specific char.
350     * @param c the delimiter
351     * @return the chars read
352     */
353    public String consumeTo(char c) {
354        int offset = nextIndexOf(c);
355        if (offset != -1) {
356            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
357            bufPos += offset;
358            return consumed;
359        } else {
360            return consumeToEnd();
361        }
362    }
363
364    /**
365     Reads the characters up to (but not including) the specified case-sensitive string.
366     <p>If the sequence is not found in the buffer, will return the remainder of the current buffered amount, less the
367     length of the sequence, such that this call may be repeated.
368     @param seq the delimiter
369     @return the chars read
370     */
371    public String consumeTo(String seq) {
372        int offset = nextIndexOf(seq);
373        if (offset != -1) {
374            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
375            bufPos += offset;
376            return consumed;
377        } else if (bufLength - bufPos < seq.length()) {
378            // nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF
379            return consumeToEnd();
380        } else {
381            // the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters
382            // unread in case they contain the beginning of the search string
383            int endPos = bufLength - seq.length() + 1;
384            String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos);
385            bufPos = endPos;
386            return consumed;
387        }
388    }
389
390    /**
391     Read characters while the input predicate returns true.
392     @return characters read
393     */
394    String consumeMatching(CharPredicate func) {
395        return consumeMatching(func, -1);
396    }
397
398    /**
399     Read characters while the input predicate returns true, up to a maximum length.
400     @param func predicate to test
401     @param maxLength maximum length to read. -1 indicates no maximum
402     @return characters read
403     */
404    String consumeMatching(CharPredicate func, int maxLength) {
405        bufferUp();
406        int pos = bufPos;
407        final int start = pos;
408        final int remaining = bufLength;
409        final char[] val = charBuf;
410
411        while (pos < remaining && (maxLength == -1 || pos - start < maxLength) && func.test(val[pos])) {
412            pos++;
413        }
414
415        bufPos = pos;
416        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
417    }
418
419    /**
420     * Read characters until the first of any delimiters is found.
421     * @param chars delimiters to scan for
422     * @return characters read up to the matched delimiter.
423     */
424    public String consumeToAny(final char... chars) {
425        return consumeMatching(c -> { // seeks until we see one of the terminating chars
426            for (char seek : chars)
427                if (c == seek) return false;
428            return true;
429        });
430    }
431
432    String consumeToAnySorted(final char... chars) {
433        return consumeMatching(c -> Arrays.binarySearch(chars, c) < 0); // matches until a hit
434    }
435
436    String consumeData() {
437        // consumes until &, <, null
438        return consumeMatching(c -> c != '&' && c != '<' && c != TokeniserState.nullChar);
439    }
440
441    String consumeAttributeQuoted(final boolean single) {
442        // null, " or ', &
443        return consumeMatching(c -> c != TokeniserState.nullChar && c != '&' && (single ? c != '\'' : c != '"'));
444    }
445
446    String consumeRawData() {
447        // <, null
448        return consumeMatching(c -> c != '<' && c != TokeniserState.nullChar);
449    }
450
451    String consumeTagName() {
452        // '\t', '\n', '\r', '\f', ' ', '/', '>'
453        // NOTE: out of spec; does not stop and append on nullChar but eats
454        return consumeMatching(c -> {
455            switch (c) {
456                case '\t':
457                case '\n':
458                case '\r':
459                case '\f':
460                case ' ':
461                case '/':
462                case '>':
463                    return false;
464            }
465            return true;
466        });
467    }
468
469    String consumeToEnd() {
470        bufferUp();
471        String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos);
472        bufPos = bufLength;
473        return data;
474    }
475
476    String consumeLetterSequence() {
477        return consumeMatching(Character::isLetter);
478    }
479
480    String consumeLetterThenDigitSequence() {
481        bufferUp();
482        int start = bufPos;
483        while (bufPos < bufLength) {
484            if (StringUtil.isAsciiLetter(charBuf[bufPos])) bufPos++;
485            else break;
486        }
487        while (!isEmptyNoBufferUp()) {
488            if (StringUtil.isDigit(charBuf[bufPos])) bufPos++;
489            else break;
490        }
491
492        return cacheString(charBuf, stringCache, start, bufPos - start);
493    }
494
495    String consumeHexSequence() {
496        return consumeMatching(StringUtil::isHexDigit);
497    }
498
499    String consumeDigitSequence() {
500        return consumeMatching(c -> c >= '0' && c <= '9');
501    }
502
503    boolean matches(char c) {
504        return !isEmpty() && charBuf[bufPos] == c;
505    }
506
507    boolean matches(String seq) {
508        bufferUp();
509        int scanLength = seq.length();
510        if (scanLength > bufLength - bufPos)
511            return false;
512
513        for (int offset = 0; offset < scanLength; offset++)
514            if (seq.charAt(offset) != charBuf[bufPos +offset])
515                return false;
516        return true;
517    }
518
519    boolean matchesIgnoreCase(String seq) {
520        bufferUp();
521        int scanLength = seq.length();
522        if (scanLength > bufLength - bufPos)
523            return false;
524
525        for (int offset = 0; offset < scanLength; offset++) {
526            char scan = seq.charAt(offset);
527            char target = charBuf[bufPos + offset];
528            if (scan == target) continue;
529
530            scan = Character.toUpperCase(scan);
531            target = Character.toUpperCase(target);
532            if (scan != target) return false;
533        }
534        return true;
535    }
536
537    /**
538     Tests if the next character in the queue matches any of the characters in the sequence, case sensitively.
539     @param seq list of characters to check for
540     @return true if any matched, false if none did
541     */
542    boolean matchesAny(char... seq) {
543        if (isEmpty())
544            return false;
545
546        bufferUp();
547        char c = charBuf[bufPos];
548        for (char seek : seq) {
549            if (seek == c)
550                return true;
551        }
552        return false;
553    }
554
555    boolean matchesAnySorted(char[] seq) {
556        bufferUp();
557        return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
558    }
559
560    /**
561     Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha
562     @return if it matches or not
563     */
564    boolean matchesAsciiAlpha() {
565        if (isEmpty()) return false;
566        return StringUtil.isAsciiLetter(charBuf[bufPos]);
567    }
568
569    boolean matchesDigit() {
570        if (isEmpty()) return false;
571        return StringUtil.isDigit(charBuf[bufPos]);
572    }
573
574    boolean matchConsume(String seq) {
575        bufferUp();
576        if (matches(seq)) {
577            bufPos += seq.length();
578            return true;
579        } else {
580            return false;
581        }
582    }
583
584    boolean matchConsumeIgnoreCase(String seq) {
585        if (matchesIgnoreCase(seq)) {
586            bufPos += seq.length();
587            return true;
588        } else {
589            return false;
590        }
591    }
592
593    // we maintain a cache of the previously scanned sequence, and return that if applicable on repeated scans.
594    // that improves the situation where there is a sequence of <p<p<p<p<p<p<p...</title> and we're bashing on the <p
595    // looking for the </title>. Resets in bufferUp()
596    @Nullable private String lastIcSeq; // scan cache
597    private int lastIcIndex; // nearest found indexOf
598
599    /** Used to check presence of </title>, </style> when we're in RCData and see a <xxx. Only finds consistent case. */
600    boolean containsIgnoreCase(String seq) {
601        if (seq.equals(lastIcSeq)) {
602            if (lastIcIndex == -1) return false;
603            if (lastIcIndex >= bufPos) return true;
604        }
605        lastIcSeq = seq;
606
607        String loScan = seq.toLowerCase(Locale.ENGLISH);
608        int lo = nextIndexOf(loScan);
609        if (lo > -1) {
610            lastIcIndex = bufPos + lo; return true;
611        }
612
613        String hiScan = seq.toUpperCase(Locale.ENGLISH);
614        int hi = nextIndexOf(hiScan);
615        boolean found = hi > -1;
616        lastIcIndex = found ? bufPos + hi : -1; // we don't care about finding the nearest, just that buf contains
617        return found;
618    }
619
620    @Override
621    public String toString() {
622        if (bufLength - bufPos < 0) return "";
623        return new String(charBuf, bufPos, bufLength - bufPos);
624    }
625
626    /**
627     * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks.
628     * <p />
629     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
630     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
631     * some more duplicates.
632     */
633    private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
634        if (count > MaxStringCacheLen) // don't cache strings that are too big
635            return new String(charBuf, start, count);
636        if (count < 1)
637            return "";
638
639        // calculate hash:
640        int hash = 0;
641        int end = count + start;
642        for (int i = start; i < end; i++) {
643            hash = 31 * hash + charBuf[i];
644        }
645
646        // get from cache
647        final int index = hash & StringCacheSize - 1;
648        String cached = stringCache[index];
649
650        if (cached != null && rangeEquals(charBuf, start, count, cached)) // positive hit
651            return cached;
652        else {
653            cached = new String(charBuf, start, count);
654            stringCache[index] = cached; // add or replace, assuming most recently used are most likely to recur next
655        }
656
657        return cached;
658    }
659
660    /**
661     * Check if the value of the provided range equals the string.
662     */
663    static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
664        if (count == cached.length()) {
665            int i = start;
666            int j = 0;
667            while (count-- != 0) {
668                if (charBuf[i++] != cached.charAt(j++))
669                    return false;
670            }
671            return true;
672        }
673        return false;
674    }
675
676    // just used for testing
677    boolean rangeEquals(final int start, final int count, final String cached) {
678        return rangeEquals(charBuf, start, count, cached);
679    }
680
681    @FunctionalInterface
682    interface CharPredicate {
683        boolean test(char c);
684    }
685}