package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.SoftPool;
import org.jsoup.internal.StringUtil;
import org.jspecify.annotations.Nullable;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Locale;

/**
 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
 <p>If the underlying reader throws an IOException during any operation, the CharacterReader will throw an
 {@link UncheckedIOException}. That won't happen with String / StringReader inputs.</p>
 <p>Closing the reader returns its character buffer and string cache to shared pools; the reader must not be used
 after {@link #close()}.</p>
 */
public final class CharacterReader implements AutoCloseable {
    static final char EOF = (char) -1;
    private static final int MaxStringCacheLen = 12;
    private static final int StringCacheSize = 512;
    private String[] stringCache; // holds reused strings in this doc, to lessen garbage
    private static final SoftPool<String[]> StringPool = new SoftPool<>(() -> new String[StringCacheSize]); // reuse cache between iterations

    static final int BufferSize = 1024 * 2; // visible for testing
    static final int RefillPoint = BufferSize / 2; // when bufPos characters read, refill; visible for testing
    private static final int RewindLimit = 1024; // the maximum we can rewind. No HTML entities can be larger than this.

    private Reader reader;      // underlying Reader, will be backed by a buffered+controlled input stream, or StringReader
    private char[] charBuf;     // character buffer we consume from; filled from Reader
    private int bufPos;         // position in charBuf that's been consumed to
    private int bufLength;      // the num of characters actually buffered in charBuf, <= charBuf.length
    private int fillPoint = 0;  // how far into the charBuf we read before re-filling. 0.5 of charBuf.length after bufferUp
    private int consumed;       // how many characters total have been consumed from this CharacterReader (less the current bufPos)
    private int bufMark = -1;   // if not -1, the marked rewind position
    private boolean readFully;  // if the underlying stream has been completely read, no value in further buffering

    private static final SoftPool<char[]> BufferPool = new SoftPool<>(() -> new char[BufferSize]); // recycled char buffer

    @Nullable private ArrayList<Integer> newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp()
    private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)]

    /**
     Creates a reader over the supplied input.
     @param input the character source; must not be null
     @param sz ignored; retained only so existing callers keep compiling
     */
    public CharacterReader(Reader input, int sz) {
        this(input); // sz is no longer used
    }

    /**
     Creates a reader over the supplied input, borrowing a character buffer and a string cache from the shared pools
     and performing the initial fill of the buffer.
     @param input the character source; must not be null
     @throws UncheckedIOException if the underlying reader throws an IOException during the initial read
     */
    public CharacterReader(Reader input) {
        Validate.notNull(input);
        reader = input;
        charBuf = BufferPool.borrow();
        stringCache = StringPool.borrow();
        bufferUp();
    }

    /**
     Creates a reader over the supplied string.
     @param input the content to read
     */
    public CharacterReader(String input) {
        this(new StringReader(input));
    }

    /**
     Closes the underlying reader and releases the character buffer and string cache back to their pools. Calling
     close more than once is a no-op. An IOException raised while closing the underlying reader is deliberately
     ignored, so that the pooled resources are always released.
     */
    @Override
    public void close() {
        if (reader == null)
            return;
        try {
            reader.close();
        } catch (IOException ignored) {
            // best effort: nothing useful can be done if the close fails, and we still release our buffers below
        } finally {
            reader = null;
            Arrays.fill(charBuf, (char) 0); // before release, clear the buffer. Not required, but acts as a safety net, and makes debug view clearer
            BufferPool.release(charBuf);
            charBuf = null;
            StringPool.release(stringCache); // conversely, we don't clear the string cache, so we can reuse the contents
            stringCache = null;
        }
    }

    /**
     Refills the buffer if needed. Does nothing if the input has been fully read, if the current position has not yet
     reached the fill point, or if a mark is set (so that the marked content is not shifted out of the buffer).
     */
    private void bufferUp() {
        if (readFully || bufPos < fillPoint || bufMark != -1)
            return;
        doBufferUp(); // structured so bufferUp may become an intrinsic candidate
    }

    /**
     Reads into the buffer. Will throw an UncheckedIOException if the underlying reader throws an IOException.
     @throws UncheckedIOException if the underlying reader throws an IOException
     */
    private void doBufferUp() {
        /*
        The flow:
        - if read fully, or if bufPos < fillPoint, or if marked - do not fill (checked by bufferUp()).
        - update consumed (total amount consumed from this CharacterReader) += bufPos
        - shift charBuf contents such that bufPos = 0; set next read offset (bufLength) -= shift amount
        - loop read the Reader until we fill charBuf. bufLength += read.
        - readFully = true when read = -1
        - stop filling (without setting readFully) when read = 0
         */
        consumed += bufPos;
        bufLength -= bufPos;
        if (bufLength > 0)
            System.arraycopy(charBuf, bufPos, charBuf, 0, bufLength);
        bufPos = 0;
        while (bufLength < BufferSize) {
            try {
                int read = reader.read(charBuf, bufLength, charBuf.length - bufLength);
                if (read == -1) {
                    readFully = true;
                    break;
                }
                if (read == 0) {
                    // a zero-length read makes no progress; stop filling rather than spin forever. As readFully is
                    // not set, a later bufferUp() (once bufPos reaches fillPoint) will try the reader again.
                    break;
                }
                bufLength += read;
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        }
        fillPoint = Math.min(bufLength, RefillPoint);

        scanBufferForNewlines(); // if enabled, we index newline positions for line number tracking
        lastIcSeq = null; // cache for last containsIgnoreCase(seq)
    }

    /**
     Marks the current position, so that it may later be returned to with {@link #rewindToMark()}. While a mark is
     set, the buffer is not refilled.
     */
    void mark() {
        // make sure there is enough look ahead capacity
        if (bufLength - bufPos < RewindLimit)
            fillPoint = 0;

        bufferUp();
        bufMark = bufPos;
    }

    /** Clears the mark, allowing the buffer to be refilled again. */
    void unmark() {
        bufMark = -1;
    }

    /**
     Returns the read position to the marked position, and clears the mark.
     @throws UncheckedIOException if no mark is set
     */
    void rewindToMark() {
        if (bufMark == -1)
            throw new UncheckedIOException(new IOException("Mark invalid"));

        bufPos = bufMark;
        unmark();
    }

    /**
     * Gets the position currently read to in the content. Starts at 0.
     * @return current position
     */
    public int pos() {
        return consumed + bufPos;
    }

    /** Tests if the buffer has been fully read. */
    boolean readFully() {
        return readFully;
    }

    /**
     Enables or disables line number tracking. By default, will be <b>off</b>. Tracking line numbers improves the
     legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of
     use.

     @param track set tracking on|off
     @since 1.14.3
     */
    public void trackNewlines(boolean track) {
        if (track && newlinePositions == null) {
            newlinePositions = new ArrayList<>(BufferSize / 80); // rough guess of likely count
            scanBufferForNewlines(); // first pass when enabled; subsequently called during bufferUp
        }
        else if (!track)
            newlinePositions = null;
    }

    /**
     Check if the tracking of newlines is enabled.
     @return the current newline tracking state
     @since 1.14.3
     */
    public boolean isTrackNewlines() {
        return newlinePositions != null;
    }

    /**
     Get the current line number (that the reader has consumed to). Starts at line #1.
     @return the current line number, or 1 if line tracking is not enabled.
     @since 1.14.3
     @see #trackNewlines(boolean)
     */
    public int lineNumber() {
        return lineNumber(pos());
    }

    /**
     Get the line number of the supplied position.
     @param pos a position in the content, as per {@link #pos()}
     @return the line number, or 1 if line tracking is not enabled
     */
    int lineNumber(int pos) {
        // note that this impl needs to be called before the next buffer up or line numberoffset will be wrong. if that
        // causes issues, can remove the reset of newlinepositions during buffer, at the cost of a larger tracking array
        if (!isTrackNewlines())
            return 1;

        int i = lineNumIndex(pos);
        if (i == -1)
            return lineNumberOffset; // first line
        return i + lineNumberOffset + 1;
    }

    /**
     Get the current column number (that the reader has consumed to). Starts at column #1.
     @return the current column number
     @since 1.14.3
     @see #trackNewlines(boolean)
     */
    public int columnNumber() {
        return columnNumber(pos());
    }

    /**
     Get the column number of the supplied position.
     @param pos a position in the content, as per {@link #pos()}
     @return the column number; if line tracking is not enabled, or the position precedes every tracked newline, that
     is {@code pos + 1}
     */
    int columnNumber(int pos) {
        if (!isTrackNewlines())
            return pos + 1;

        int i = lineNumIndex(pos);
        if (i == -1)
            return pos + 1;
        return pos - newlinePositions.get(i) + 1;
    }

    /**
     Get a formatted string representing the current line and column positions. E.g. <code>5:10</code> indicating line
     number 5 and column number 10.
     @return line:col position
     @since 1.14.3
     @see #trackNewlines(boolean)
     */
    String posLineCol() {
        return lineNumber() + ":" + columnNumber();
    }

    /**
     Finds the index in newlinePositions of the last tracked line start that is at or before the supplied position.
     @param pos a position in the content
     @return the index, or -1 if the position is before every tracked line start; 0 if tracking is not enabled
     */
    private int lineNumIndex(int pos) {
        if (!isTrackNewlines()) return 0;
        int i = Collections.binarySearch(newlinePositions, pos);
        // a miss returns (-(insertion point) - 1); convert that to the index of the entry preceding the insertion
        // point. A result of exactly -1 (insertion point 0) is left as -1, meaning "before the first entry"
        if (i < -1) i = Math.abs(i) - 2;
        return i;
    }

    /**
     Scans the buffer for newline position, and tracks their location in newlinePositions.
     */
    private void scanBufferForNewlines() {
        if (!isTrackNewlines())
            return;

        if (newlinePositions.size() > 0) {
            // work out the line number that we have read up to (as we have likely scanned past this point)
            int index = lineNumIndex(consumed);
            if (index == -1) index = 0; // first line
            int linePos = newlinePositions.get(index);
            lineNumberOffset += index; // the num lines we've read up to
            newlinePositions.clear();
            newlinePositions.add(linePos); // roll the last read pos to first, for cursor num after buffer
        }

        for (int i = bufPos; i < bufLength; i++) {
            if (charBuf[i] == '\n')
                newlinePositions.add(1 + consumed + i); // records the position just after the newline, i.e. the line start
        }
    }

    /**
     * Tests if all the content has been read.
     * @return true if nothing left to read.
     */
    public boolean isEmpty() {
        bufferUp();
        return bufPos >= bufLength;
    }

    /** Tests if the buffered content has been consumed, without attempting to refill the buffer first. */
    private boolean isEmptyNoBufferUp() {
        return bufPos >= bufLength;
    }

    /**
     * Get the char at the current position.
     * @return char, or EOF if there is nothing left to read
     */
    public char current() {
        bufferUp();
        return isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
    }

    /**
     Consume one character off the queue. Note that the position is advanced even when EOF is returned.
     @return first character on queue, or EOF if the queue is empty.
     */
    public char consume() {
        bufferUp();
        char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
        bufPos++;
        return val;
    }

    /**
     Unconsume one character (bufPos--). MUST only be called directly after a consume(), and no chance of a bufferUp.
     @throws UncheckedIOException if the buffer position is already at the start of the buffer
     */
    void unconsume() {
        if (bufPos < 1)
            throw new UncheckedIOException(new IOException("WTF: No buffer left to unconsume.")); // a bug if this fires, need to trace it.

        bufPos--;
    }

    /**
     * Moves the current position by one. Does not refill the buffer.
     */
    public void advance() {
        bufPos++;
    }

    /**
     * Returns the number of characters between the current position and the next instance of the input char.
     * Only the currently buffered content is searched.
     * @param c scan target
     * @return offset between current position and next instance of target. -1 if not found.
     */
    int nextIndexOf(char c) {
        // doesn't handle scanning for surrogates
        bufferUp();
        for (int i = bufPos; i < bufLength; i++) {
            if (c == charBuf[i])
                return i - bufPos;
        }
        return -1;
    }

    /**
     * Returns the number of characters between the current position and the next instance of the input sequence.
     * Only the currently buffered content is searched.
     *
     * @param seq scan target; must not be empty (the first character is read unconditionally)
     * @return offset between current position and next instance of target. -1 if not found.
     */
    int nextIndexOf(CharSequence seq) {
        bufferUp();
        // doesn't handle scanning for surrogates
        char startChar = seq.charAt(0);
        for (int offset = bufPos; offset < bufLength; offset++) {
            // scan to first instance of startchar:
            if (startChar != charBuf[offset])
                while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
            int i = offset + 1;
            int last = i + seq.length()-1;
            if (offset < bufLength && last <= bufLength) {
                for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
                if (i == last) // found full sequence
                    return offset - bufPos;
            }
        }
        return -1;
    }

    /**
     * Reads characters up to the specific char. If the char is not found in the buffer, reads to the end of the
     * buffered content.
     * @param c the delimiter
     * @return the chars read
     */
    public String consumeTo(char c) {
        int offset = nextIndexOf(c);
        if (offset != -1) {
            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
            bufPos += offset;
            return consumed;
        } else {
            return consumeToEnd();
        }
    }

    /**
     Reads the characters up to (but not including) the specified case-sensitive string.
     <p>If the sequence is not found in the buffer, will return the remainder of the current buffered amount, less the
     length of the sequence, such that this call may be repeated.</p>
     @param seq the delimiter
     @return the chars read
     */
    public String consumeTo(String seq) {
        int offset = nextIndexOf(seq);
        if (offset != -1) {
            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
            bufPos += offset;
            return consumed;
        } else if (bufLength - bufPos < seq.length()) {
            // nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF
            return consumeToEnd();
        } else {
            // the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters
            // unread in case they contain the beginning of the search string
            int endPos = bufLength - seq.length() + 1;
            String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos);
            bufPos = endPos;
            return consumed;
        }
    }

    /**
     Read characters while the input predicate returns true.
     @param func predicate to test
     @return characters read
     */
    String consumeMatching(CharPredicate func) {
        return consumeMatching(func, -1);
    }

    /**
     Read characters while the input predicate returns true, up to a maximum length. Only the currently buffered
     content is read.
     @param func predicate to test
     @param maxLength maximum length to read. -1 indicates no maximum
     @return characters read
     */
    String consumeMatching(CharPredicate func, int maxLength) {
        bufferUp();
        int pos = bufPos;
        final int start = pos;
        final int remaining = bufLength;
        final char[] val = charBuf;

        while (pos < remaining && (maxLength == -1 || pos - start < maxLength) && func.test(val[pos])) {
            pos++;
        }

        bufPos = pos;
        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
    }

    /**
     * Read characters until the first of any delimiters is found.
     * @param chars delimiters to scan for
     * @return characters read up to the matched delimiter.
     */
    public String consumeToAny(final char... chars) {
        return consumeMatching(c -> { // seeks until we see one of the terminating chars
            for (char seek : chars)
                if (c == seek) return false;
            return true;
        });
    }

    /**
     Read characters until the first of any delimiters is found, using a binary search of the delimiters.
     @param chars delimiters to scan for; must be sorted, as required by {@link Arrays#binarySearch(char[], char)}
     @return characters read up to the matched delimiter
     */
    String consumeToAnySorted(final char... chars) {
        return consumeMatching(c -> Arrays.binarySearch(chars, c) < 0); // matches until a hit
    }

    /** Read characters up to (but not including) the next {@code &}, {@code <}, or null character. */
    String consumeData() {
        // consumes until &, <, null
        return consumeMatching(c -> c != '&' && c != '<' && c != TokeniserState.nullChar);
    }

    /**
     Read characters of a quoted attribute value, up to (but not including) the next null character, {@code &}, or
     closing quote.
     @param single true if the value is single quoted (stop at {@code '}), false if double quoted (stop at {@code "})
     @return characters read
     */
    String consumeAttributeQuoted(final boolean single) {
        // null, " or ', &
        return consumeMatching(c -> c != TokeniserState.nullChar && c != '&' && (single ? c != '\'' : c != '"'));
    }

    /** Read characters up to (but not including) the next {@code <} or null character. */
    String consumeRawData() {
        // <, null
        return consumeMatching(c -> c != '<' && c != TokeniserState.nullChar);
    }

    /** Read characters up to (but not including) the next tab, newline, carriage return, form feed, space, {@code /}, or {@code >}. */
    String consumeTagName() {
        // '\t', '\n', '\r', '\f', ' ', '/', '>'
        // NOTE: out of spec; does not stop and append on nullChar but eats
        return consumeMatching(c -> {
            switch (c) {
                case '\t':
                case '\n':
                case '\r':
                case '\f':
                case ' ':
                case '/':
                case '>':
                    return false;
            }
            return true;
        });
    }

    /** Read all of the remaining buffered content. */
    String consumeToEnd() {
        bufferUp();
        String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos);
        bufPos = bufLength;
        return data;
    }

    /** Read characters while they are letters, as per {@link Character#isLetter(char)}. */
    String consumeLetterSequence() {
        return consumeMatching(Character::isLetter);
    }

    /** Read a run of ASCII letters, followed by a run of digits. Either run may be empty. */
    String consumeLetterThenDigitSequence() {
        bufferUp();
        int start = bufPos;
        while (bufPos < bufLength) {
            if (StringUtil.isAsciiLetter(charBuf[bufPos])) bufPos++;
            else break;
        }
        while (!isEmptyNoBufferUp()) {
            if (StringUtil.isDigit(charBuf[bufPos])) bufPos++;
            else break;
        }

        return cacheString(charBuf, stringCache, start, bufPos - start);
    }

    /** Read characters while they are hex digits. */
    String consumeHexSequence() {
        return consumeMatching(StringUtil::isHexDigit);
    }

    /** Read characters while they are ASCII digits (0-9). */
    String consumeDigitSequence() {
        return consumeMatching(c -> c >= '0' && c <= '9');
    }

    /** Tests if the next character is the supplied character. Does not consume. */
    boolean matches(char c) {
        return !isEmpty() && charBuf[bufPos] == c;
    }

    /** Tests if the upcoming buffered content starts with the supplied sequence, case sensitively. Does not consume. */
    boolean matches(String seq) {
        bufferUp();
        int scanLength = seq.length();
        if (scanLength > bufLength - bufPos)
            return false;

        for (int offset = 0; offset < scanLength; offset++)
            if (seq.charAt(offset) != charBuf[bufPos +offset])
                return false;
        return true;
    }

    /** Tests if the upcoming buffered content starts with the supplied sequence, ignoring case. Does not consume. */
    boolean matchesIgnoreCase(String seq) {
        bufferUp();
        int scanLength = seq.length();
        if (scanLength > bufLength - bufPos)
            return false;

        for (int offset = 0; offset < scanLength; offset++) {
            char scan = seq.charAt(offset);
            char target = charBuf[bufPos + offset];
            if (scan == target) continue;

            scan = Character.toUpperCase(scan);
            target = Character.toUpperCase(target);
            if (scan != target) return false;
        }
        return true;
    }

    /**
     Tests if the next character in the queue matches any of the characters in the sequence, case sensitively.
     @param seq list of characters to check for
     @return true if any matched, false if none did
     */
    boolean matchesAny(char... seq) {
        if (isEmpty()) // also buffers up
            return false;

        char c = charBuf[bufPos];
        for (char seek : seq) {
            if (seek == c)
                return true;
        }
        return false;
    }

    /**
     Tests if the next character in the queue matches any of the characters in the sorted sequence.
     @param seq characters to check for; must be sorted, as required by {@link Arrays#binarySearch(char[], char)}
     @return true if any matched, false if none did
     */
    boolean matchesAnySorted(char[] seq) {
        return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0; // isEmpty() also buffers up
    }

    /**
     Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha
     @return if it matches or not
     */
    boolean matchesAsciiAlpha() {
        if (isEmpty()) return false;
        return StringUtil.isAsciiLetter(charBuf[bufPos]);
    }

    /** Checks if the current pos matches a digit. */
    boolean matchesDigit() {
        if (isEmpty()) return false;
        return StringUtil.isDigit(charBuf[bufPos]);
    }

    /**
     If the upcoming content starts with the supplied sequence (case sensitively), consumes it.
     @param seq the sequence to match
     @return true if the sequence matched and was consumed
     */
    boolean matchConsume(String seq) {
        if (matches(seq)) { // matches() also buffers up
            bufPos += seq.length();
            return true;
        } else {
            return false;
        }
    }

    /**
     If the upcoming content starts with the supplied sequence (ignoring case), consumes it.
     @param seq the sequence to match
     @return true if the sequence matched and was consumed
     */
    boolean matchConsumeIgnoreCase(String seq) {
        if (matchesIgnoreCase(seq)) {
            bufPos += seq.length();
            return true;
        } else {
            return false;
        }
    }

    // we maintain a cache of the previously scanned sequence, and return that if applicable on repeated scans.
    // that improves the situation where there is a sequence of <p<p<p<p<p<p<p...</title> and we're bashing on the <p
    // looking for the </title>. Resets in bufferUp()
    @Nullable private String lastIcSeq; // scan cache
    private int lastIcIndex; // nearest found indexOf

    /** Used to check presence of &lt;/title&gt;, &lt;/style&gt; when we're in RCData and see a &lt;xxx. Only finds consistent case. */
    boolean containsIgnoreCase(String seq) {
        if (seq.equals(lastIcSeq)) {
            if (lastIcIndex == -1) return false;
            if (lastIcIndex >= bufPos) return true;
            // otherwise we have read past the cached hit, so need to rescan
        }
        lastIcSeq = seq;

        String loScan = seq.toLowerCase(Locale.ENGLISH);
        int lo = nextIndexOf(loScan);
        if (lo > -1) {
            lastIcIndex = bufPos + lo; return true;
        }

        String hiScan = seq.toUpperCase(Locale.ENGLISH);
        int hi = nextIndexOf(hiScan);
        boolean found = hi > -1;
        lastIcIndex = found ? bufPos + hi : -1; // we don't care about finding the nearest, just that buf contains
        return found;
    }

    /**
     Gets the unconsumed content of the current buffer.
     @return the remaining buffered content; an empty string if there is none, or if this reader has been closed
     */
    @Override
    public String toString() {
        // charBuf is null once close() has released it; don't throw from toString (e.g. in a debugger or log line)
        if (charBuf == null || bufLength - bufPos < 0) return "";
        return new String(charBuf, bufPos, bufLength - bufPos);
    }

    /**
     * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks.
     * <p>
     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
     * some more duplicates.
     *
     * @param charBuf the buffer holding the characters
     * @param stringCache the cache to look up in, and store to
     * @param start the offset in charBuf of the first character
     * @param count the number of characters
     * @return a string holding the range's characters; the empty string if count is less than 1
     */
    private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
        if (count > MaxStringCacheLen) // don't cache strings that are too big
            return new String(charBuf, start, count);
        if (count < 1)
            return "";

        // calculate hash:
        int hash = 0;
        int end = count + start;
        for (int i = start; i < end; i++) {
            hash = 31 * hash + charBuf[i];
        }

        // get from cache
        final int index = hash & StringCacheSize - 1; // i.e. hash & (StringCacheSize - 1): the low bits pick the slot
        String cached = stringCache[index];

        if (cached != null && rangeEquals(charBuf, start, count, cached)) // positive hit
            return cached;
        else {
            cached = new String(charBuf, start, count);
            stringCache[index] = cached; // add or replace, assuming most recently used are most likely to recur next
        }

        return cached;
    }

    /**
     * Check if the value of the provided range equals the string.
     * @param charBuf the buffer holding the characters
     * @param start the offset in charBuf of the first character
     * @param count the number of characters in the range
     * @param cached the string to compare to
     * @return true if the range has the same length and characters as the string
     */
    static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
        if (count == cached.length()) {
            int i = start;
            int j = 0;
            while (count-- != 0) {
                if (charBuf[i++] != cached.charAt(j++))
                    return false;
            }
            return true;
        }
        return false;
    }

    // just used for testing
    boolean rangeEquals(final int start, final int count, final String cached) {
        return rangeEquals(charBuf, start, count, cached);
    }

    /** A predicate over a primitive char, avoiding the boxing a {@code Predicate<Character>} would incur. */
    @FunctionalInterface
    interface CharPredicate {
        boolean test(char c);
    }
}