001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.SoftPool; 005import org.jspecify.annotations.Nullable; 006 007import java.io.IOException; 008import java.io.UncheckedIOException; 009import java.io.Reader; 010import java.io.StringReader; 011import java.util.ArrayList; 012import java.util.Arrays; 013import java.util.Collections; 014import java.util.Locale; 015 016/** 017 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes. 018 */ 019public final class CharacterReader { 020 static final char EOF = (char) -1; 021 private static final int MaxStringCacheLen = 12; 022 private static final int StringCacheSize = 512; 023 private String[] stringCache; // holds reused strings in this doc, to lessen garbage 024 private static final SoftPool<String[]> StringPool = new SoftPool<>(() -> new String[StringCacheSize]); // reuse cache between iterations 025 026 static final int BufferSize = 1024 * 2; // visible for testing 027 static final int RefillPoint = BufferSize / 2; // when bufPos characters read, refill; visible for testing 028 private static final int RewindLimit = 1024; // the maximum we can rewind. No HTML entities can be larger than this. 029 030 private Reader reader; // underlying Reader, will be backed by a buffered+controlled input stream, or StringReader 031 private char[] charBuf; // character buffer we consume from; filled from Reader 032 private int bufPos; // position in charBuf that's been consumed to 033 private int bufLength; // the num of characters actually buffered in charBuf, <= charBuf.length 034 private int fillPoint = 0; // how far into the charBuf we read before re-filling. 0.5 of charBuf.length after bufferUp 035 private int consumed; // how many characters total have been consumed from this CharacterReader (less the current bufPos) 036 private int bufMark = -1; // if not -1, the marked rewind position 037 private boolean readFully; // if the underlying stream has been completely read, no value in further buffering 038 039 private static final SoftPool<char[]> BufferPool = new SoftPool<>(() -> new char[BufferSize]); // recycled char buffer 040 041 @Nullable private ArrayList<Integer> newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp() 042 private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)] 043 044 public CharacterReader(Reader input, int sz) { 045 this(input); // sz is no longer used 046 } 047 048 public CharacterReader(Reader input) { 049 Validate.notNull(input); 050 reader = input; 051 charBuf = BufferPool.borrow(); 052 stringCache = StringPool.borrow(); 053 bufferUp(); 054 } 055 056 public CharacterReader(String input) { 057 this(new StringReader(input)); 058 } 059 060 public void close() { 061 if (reader == null) 062 return; 063 try { 064 reader.close(); 065 } catch (IOException ignored) { 066 } finally { 067 reader = null; 068 Arrays.fill(charBuf, (char) 0); // before release, clear the buffer. Not required, but acts as a safety net, and makes debug view clearer 069 BufferPool.release(charBuf); 070 charBuf = null; 071 StringPool.release(stringCache); // conversely, we don't clear the string cache, so we can reuse the contents 072 stringCache = null; 073 } 074 } 075 076 private void bufferUp() { 077 if (readFully || bufPos < fillPoint || bufMark != -1) 078 return; 079 doBufferUp(); // structured so bufferUp may become an intrinsic candidate 080 } 081 082 private void doBufferUp() { 083 /* 084 The flow: 085 - if read fully, or if bufPos < fillPoint, or if marked - do not fill. 086 - update readerPos (total amount consumed from this CharacterReader) += bufPos 087 - shift charBuf contents such that bufPos = 0; set next read offset (bufLength) -= shift amount 088 - loop read the Reader until we fill charBuf. bufLength += read. 089 - readFully = true when read = -1 090 */ 091 consumed += bufPos; 092 bufLength -= bufPos; 093 if (bufLength > 0) 094 System.arraycopy(charBuf, bufPos, charBuf, 0, bufLength); 095 bufPos = 0; 096 while (bufLength < BufferSize) { 097 try { 098 int read = reader.read(charBuf, bufLength, charBuf.length - bufLength); 099 if (read == -1) { 100 readFully = true; 101 break; 102 } 103 bufLength += read; 104 } catch (IOException e) { 105 throw new UncheckedIOException(e); 106 } 107 } 108 fillPoint = Math.min(bufLength, RefillPoint); 109 110 scanBufferForNewlines(); // if enabled, we index newline positions for line number tracking 111 lastIcSeq = null; // cache for last containsIgnoreCase(seq) 112 } 113 114 void mark() { 115 // make sure there is enough look ahead capacity 116 if (bufLength - bufPos < RewindLimit) 117 fillPoint = 0; 118 119 bufferUp(); 120 bufMark = bufPos; 121 } 122 123 void unmark() { 124 bufMark = -1; 125 } 126 127 void rewindToMark() { 128 if (bufMark == -1) 129 throw new UncheckedIOException(new IOException("Mark invalid")); 130 131 bufPos = bufMark; 132 unmark(); 133 } 134 135 /** 136 * Gets the position currently read to in the content. Starts at 0. 137 * @return current position 138 */ 139 public int pos() { 140 return consumed + bufPos; 141 } 142 143 /** Tests if the buffer has been fully read. */ 144 boolean readFully() { 145 return readFully; 146 } 147 148 /** 149 Enables or disables line number tracking. By default, will be <b>off</b>.Tracking line numbers improves the 150 legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of 151 use. 152 153 @param track set tracking on|off 154 @since 1.14.3 155 */ 156 public void trackNewlines(boolean track) { 157 if (track && newlinePositions == null) { 158 newlinePositions = new ArrayList<>(BufferSize / 80); // rough guess of likely count 159 scanBufferForNewlines(); // first pass when enabled; subsequently called during bufferUp 160 } 161 else if (!track) 162 newlinePositions = null; 163 } 164 165 /** 166 Check if the tracking of newlines is enabled. 167 @return the current newline tracking state 168 @since 1.14.3 169 */ 170 public boolean isTrackNewlines() { 171 return newlinePositions != null; 172 } 173 174 /** 175 Get the current line number (that the reader has consumed to). Starts at line #1. 176 @return the current line number, or 1 if line tracking is not enabled. 177 @since 1.14.3 178 @see #trackNewlines(boolean) 179 */ 180 public int lineNumber() { 181 return lineNumber(pos()); 182 } 183 184 int lineNumber(int pos) { 185 // note that this impl needs to be called before the next buffer up or line numberoffset will be wrong. if that 186 // causes issues, can remove the reset of newlinepositions during buffer, at the cost of a larger tracking array 187 if (!isTrackNewlines()) 188 return 1; 189 190 int i = lineNumIndex(pos); 191 if (i == -1) 192 return lineNumberOffset; // first line 193 return i + lineNumberOffset + 1; 194 } 195 196 /** 197 Get the current column number (that the reader has consumed to). Starts at column #1. 198 @return the current column number 199 @since 1.14.3 200 @see #trackNewlines(boolean) 201 */ 202 public int columnNumber() { 203 return columnNumber(pos()); 204 } 205 206 int columnNumber(int pos) { 207 if (!isTrackNewlines()) 208 return pos + 1; 209 210 int i = lineNumIndex(pos); 211 if (i == -1) 212 return pos + 1; 213 return pos - newlinePositions.get(i) + 1; 214 } 215 216 /** 217 Get a formatted string representing the current line and column positions. E.g. <code>5:10</code> indicating line 218 number 5 and column number 10. 219 @return line:col position 220 @since 1.14.3 221 @see #trackNewlines(boolean) 222 */ 223 String posLineCol() { 224 return lineNumber() + ":" + columnNumber(); 225 } 226 227 private int lineNumIndex(int pos) { 228 if (!isTrackNewlines()) return 0; 229 int i = Collections.binarySearch(newlinePositions, pos); 230 if (i < -1) i = Math.abs(i) - 2; 231 return i; 232 } 233 234 /** 235 Scans the buffer for newline position, and tracks their location in newlinePositions. 236 */ 237 private void scanBufferForNewlines() { 238 if (!isTrackNewlines()) 239 return; 240 241 if (newlinePositions.size() > 0) { 242 // work out the line number that we have read up to (as we have likely scanned past this point) 243 int index = lineNumIndex(consumed); 244 if (index == -1) index = 0; // first line 245 int linePos = newlinePositions.get(index); 246 lineNumberOffset += index; // the num lines we've read up to 247 newlinePositions.clear(); 248 newlinePositions.add(linePos); // roll the last read pos to first, for cursor num after buffer 249 } 250 251 for (int i = bufPos; i < bufLength; i++) { 252 if (charBuf[i] == '\n') 253 newlinePositions.add(1 + consumed + i); 254 } 255 } 256 257 /** 258 * Tests if all the content has been read. 259 * @return true if nothing left to read. 260 */ 261 public boolean isEmpty() { 262 bufferUp(); 263 return bufPos >= bufLength; 264 } 265 266 private boolean isEmptyNoBufferUp() { 267 return bufPos >= bufLength; 268 } 269 270 /** 271 * Get the char at the current position. 272 * @return char 273 */ 274 public char current() { 275 bufferUp(); 276 return isEmptyNoBufferUp() ? EOF : charBuf[bufPos]; 277 } 278 279 char consume() { 280 bufferUp(); 281 char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos]; 282 bufPos++; 283 return val; 284 } 285 286 /** 287 Unconsume one character (bufPos--). MUST only be called directly after a consume(), and no chance of a bufferUp. 288 */ 289 void unconsume() { 290 if (bufPos < 1) 291 throw new UncheckedIOException(new IOException("WTF: No buffer left to unconsume.")); // a bug if this fires, need to trace it. 292 293 bufPos--; 294 } 295 296 /** 297 * Moves the current position by one. 298 */ 299 public void advance() { 300 bufPos++; 301 } 302 303 /** 304 * Returns the number of characters between the current position and the next instance of the input char 305 * @param c scan target 306 * @return offset between current position and next instance of target. -1 if not found. 307 */ 308 int nextIndexOf(char c) { 309 // doesn't handle scanning for surrogates 310 bufferUp(); 311 for (int i = bufPos; i < bufLength; i++) { 312 if (c == charBuf[i]) 313 return i - bufPos; 314 } 315 return -1; 316 } 317 318 /** 319 * Returns the number of characters between the current position and the next instance of the input sequence 320 * 321 * @param seq scan target 322 * @return offset between current position and next instance of target. -1 if not found. 323 */ 324 int nextIndexOf(CharSequence seq) { 325 bufferUp(); 326 // doesn't handle scanning for surrogates 327 char startChar = seq.charAt(0); 328 for (int offset = bufPos; offset < bufLength; offset++) { 329 // scan to first instance of startchar: 330 if (startChar != charBuf[offset]) 331 while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ } 332 int i = offset + 1; 333 int last = i + seq.length()-1; 334 if (offset < bufLength && last <= bufLength) { 335 for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ } 336 if (i == last) // found full sequence 337 return offset - bufPos; 338 } 339 } 340 return -1; 341 } 342 343 /** 344 * Reads characters up to the specific char. 345 * @param c the delimiter 346 * @return the chars read 347 */ 348 public String consumeTo(char c) { 349 int offset = nextIndexOf(c); 350 if (offset != -1) { 351 String consumed = cacheString(charBuf, stringCache, bufPos, offset); 352 bufPos += offset; 353 return consumed; 354 } else { 355 return consumeToEnd(); 356 } 357 } 358 359 String consumeTo(String seq) { 360 int offset = nextIndexOf(seq); 361 if (offset != -1) { 362 String consumed = cacheString(charBuf, stringCache, bufPos, offset); 363 bufPos += offset; 364 return consumed; 365 } else if (bufLength - bufPos < seq.length()) { 366 // nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF 367 return consumeToEnd(); 368 } else { 369 // the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters 370 // unread in case they contain the beginning of the search string 371 int endPos = bufLength - seq.length() + 1; 372 String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos); 373 bufPos = endPos; 374 return consumed; 375 } 376 } 377 378 /** 379 * Read characters until the first of any delimiters is found. 380 * @param chars delimiters to scan for 381 * @return characters read up to the matched delimiter. 382 */ 383 public String consumeToAny(final char... chars) { 384 bufferUp(); 385 int pos = bufPos; 386 final int start = pos; 387 final int remaining = bufLength; 388 final char[] val = charBuf; 389 final int charLen = chars.length; 390 int i; 391 392 OUTER: while (pos < remaining) { 393 for (i = 0; i < charLen; i++) { 394 if (val[pos] == chars[i]) 395 break OUTER; 396 } 397 pos++; 398 } 399 400 bufPos = pos; 401 return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : ""; 402 } 403 404 String consumeToAnySorted(final char... chars) { 405 bufferUp(); 406 int pos = bufPos; 407 final int start = pos; 408 final int remaining = bufLength; 409 final char[] val = charBuf; 410 411 while (pos < remaining) { 412 if (Arrays.binarySearch(chars, val[pos]) >= 0) 413 break; 414 pos++; 415 } 416 bufPos = pos; 417 return bufPos > start ? cacheString(charBuf, stringCache, start, pos -start) : ""; 418 } 419 420 String consumeData() { 421 // &, <, null 422 //bufferUp(); // no need to bufferUp, just called consume() 423 int pos = bufPos; 424 final int start = pos; 425 final int remaining = bufLength; 426 final char[] val = charBuf; 427 428 OUTER: while (pos < remaining) { 429 switch (val[pos]) { 430 case '&': 431 case '<': 432 case TokeniserState.nullChar: 433 break OUTER; 434 default: 435 pos++; 436 } 437 } 438 bufPos = pos; 439 return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : ""; 440 } 441 442 String consumeAttributeQuoted(final boolean single) { 443 // null, " or ', & 444 //bufferUp(); // no need to bufferUp, just called consume() 445 int pos = bufPos; 446 final int start = pos; 447 final int remaining = bufLength; 448 final char[] val = charBuf; 449 450 OUTER: while (pos < remaining) { 451 switch (val[pos]) { 452 case '&': 453 case TokeniserState.nullChar: 454 break OUTER; 455 case '\'': 456 if (single) break OUTER; 457 break; 458 case '"': 459 if (!single) break OUTER; 460 break; 461 } 462 pos++; 463 } 464 bufPos = pos; 465 return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : ""; 466 } 467 468 469 String consumeRawData() { 470 // <, null 471 //bufferUp(); // no need to bufferUp, just called consume() 472 int pos = bufPos; 473 final int start = pos; 474 final int remaining = bufLength; 475 final char[] val = charBuf; 476 477 OUTER: while (pos < remaining) { 478 switch (val[pos]) { 479 case '<': 480 case TokeniserState.nullChar: 481 break OUTER; 482 default: 483 pos++; 484 } 485 } 486 bufPos = pos; 487 return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : ""; 488 } 489 490 String consumeTagName() { 491 // '\t', '\n', '\r', '\f', ' ', '/', '>' 492 // NOTE: out of spec; does not stop and append on nullChar but eats 493 bufferUp(); 494 int pos = bufPos; 495 final int start = pos; 496 final int remaining = bufLength; 497 final char[] val = charBuf; 498 499 OUTER: while (pos < remaining) { 500 switch (val[pos]) { 501 case '\t': 502 case '\n': 503 case '\r': 504 case '\f': 505 case ' ': 506 case '/': 507 case '>': 508 break OUTER; 509 } 510 pos++; 511 } 512 513 bufPos = pos; 514 return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : ""; 515 } 516 517 String consumeToEnd() { 518 bufferUp(); 519 String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos); 520 bufPos = bufLength; 521 return data; 522 } 523 524 String consumeLetterSequence() { 525 bufferUp(); 526 int start = bufPos; 527 while (bufPos < bufLength) { 528 char c = charBuf[bufPos]; 529 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c)) 530 bufPos++; 531 else 532 break; 533 } 534 535 return cacheString(charBuf, stringCache, start, bufPos - start); 536 } 537 538 String consumeLetterThenDigitSequence() { 539 bufferUp(); 540 int start = bufPos; 541 while (bufPos < bufLength) { 542 char c = charBuf[bufPos]; 543 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c)) 544 bufPos++; 545 else 546 break; 547 } 548 while (!isEmptyNoBufferUp()) { 549 char c = charBuf[bufPos]; 550 if (c >= '0' && c <= '9') 551 bufPos++; 552 else 553 break; 554 } 555 556 return cacheString(charBuf, stringCache, start, bufPos - start); 557 } 558 559 String consumeHexSequence() { 560 bufferUp(); 561 int start = bufPos; 562 while (bufPos < bufLength) { 563 char c = charBuf[bufPos]; 564 if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) 565 bufPos++; 566 else 567 break; 568 } 569 return cacheString(charBuf, stringCache, start, bufPos - start); 570 } 571 572 String consumeDigitSequence() { 573 bufferUp(); 574 int start = bufPos; 575 while (bufPos < bufLength) { 576 char c = charBuf[bufPos]; 577 if (c >= '0' && c <= '9') 578 bufPos++; 579 else 580 break; 581 } 582 return cacheString(charBuf, stringCache, start, bufPos - start); 583 } 584 585 boolean matches(char c) { 586 return !isEmpty() && charBuf[bufPos] == c; 587 588 } 589 590 boolean matches(String seq) { 591 bufferUp(); 592 int scanLength = seq.length(); 593 if (scanLength > bufLength - bufPos) 594 return false; 595 596 for (int offset = 0; offset < scanLength; offset++) 597 if (seq.charAt(offset) != charBuf[bufPos +offset]) 598 return false; 599 return true; 600 } 601 602 boolean matchesIgnoreCase(String seq) { 603 bufferUp(); 604 int scanLength = seq.length(); 605 if (scanLength > bufLength - bufPos) 606 return false; 607 608 for (int offset = 0; offset < scanLength; offset++) { 609 char upScan = Character.toUpperCase(seq.charAt(offset)); 610 char upTarget = Character.toUpperCase(charBuf[bufPos + offset]); 611 if (upScan != upTarget) 612 return false; 613 } 614 return true; 615 } 616 617 boolean matchesAny(char... seq) { 618 if (isEmpty()) 619 return false; 620 621 bufferUp(); 622 char c = charBuf[bufPos]; 623 for (char seek : seq) { 624 if (seek == c) 625 return true; 626 } 627 return false; 628 } 629 630 boolean matchesAnySorted(char[] seq) { 631 bufferUp(); 632 return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0; 633 } 634 635 boolean matchesLetter() { 636 if (isEmpty()) 637 return false; 638 char c = charBuf[bufPos]; 639 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c); 640 } 641 642 /** 643 Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha 644 @return if it matches or not 645 */ 646 boolean matchesAsciiAlpha() { 647 if (isEmpty()) 648 return false; 649 char c = charBuf[bufPos]; 650 return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'); 651 } 652 653 boolean matchesDigit() { 654 if (isEmpty()) 655 return false; 656 char c = charBuf[bufPos]; 657 return (c >= '0' && c <= '9'); 658 } 659 660 boolean matchConsume(String seq) { 661 bufferUp(); 662 if (matches(seq)) { 663 bufPos += seq.length(); 664 return true; 665 } else { 666 return false; 667 } 668 } 669 670 boolean matchConsumeIgnoreCase(String seq) { 671 if (matchesIgnoreCase(seq)) { 672 bufPos += seq.length(); 673 return true; 674 } else { 675 return false; 676 } 677 } 678 679 // we maintain a cache of the previously scanned sequence, and return that if applicable on repeated scans. 680 // that improves the situation where there is a sequence of <p<p<p<p<p<p<p...</title> and we're bashing on the <p 681 // looking for the </title>. Resets in bufferUp() 682 @Nullable private String lastIcSeq; // scan cache 683 private int lastIcIndex; // nearest found indexOf 684 685 /** Used to check presence of </title>, </style> when we're in RCData and see a <xxx. Only finds consistent case. */ 686 boolean containsIgnoreCase(String seq) { 687 if (seq.equals(lastIcSeq)) { 688 if (lastIcIndex == -1) return false; 689 if (lastIcIndex >= bufPos) return true; 690 } 691 lastIcSeq = seq; 692 693 String loScan = seq.toLowerCase(Locale.ENGLISH); 694 int lo = nextIndexOf(loScan); 695 if (lo > -1) { 696 lastIcIndex = bufPos + lo; return true; 697 } 698 699 String hiScan = seq.toUpperCase(Locale.ENGLISH); 700 int hi = nextIndexOf(hiScan); 701 boolean found = hi > -1; 702 lastIcIndex = found ? bufPos + hi : -1; // we don't care about finding the nearest, just that buf contains 703 return found; 704 } 705 706 @Override 707 public String toString() { 708 if (bufLength - bufPos < 0) 709 return ""; 710 return new String(charBuf, bufPos, bufLength - bufPos); 711 } 712 713 /** 714 * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks. 715 * <p /> 716 * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list. 717 * That saves both having to create objects as hash keys, and running through the entry list, at the expense of 718 * some more duplicates. 719 */ 720 private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) { 721 if (count > MaxStringCacheLen) // don't cache strings that are too big 722 return new String(charBuf, start, count); 723 if (count < 1) 724 return ""; 725 726 // calculate hash: 727 int hash = 0; 728 int end = count + start; 729 for (int i = start; i < end; i++) { 730 hash = 31 * hash + charBuf[i]; 731 } 732 733 // get from cache 734 final int index = hash & StringCacheSize - 1; 735 String cached = stringCache[index]; 736 737 if (cached != null && rangeEquals(charBuf, start, count, cached)) // positive hit 738 return cached; 739 else { 740 cached = new String(charBuf, start, count); 741 stringCache[index] = cached; // add or replace, assuming most recently used are most likely to recur next 742 } 743 744 return cached; 745 } 746 747 /** 748 * Check if the value of the provided range equals the string. 749 */ 750 static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) { 751 if (count == cached.length()) { 752 int i = start; 753 int j = 0; 754 while (count-- != 0) { 755 if (charBuf[i++] != cached.charAt(j++)) 756 return false; 757 } 758 return true; 759 } 760 return false; 761 } 762 763 // just used for testing 764 boolean rangeEquals(final int start, final int count, final String cached) { 765 return rangeEquals(charBuf, start, count, cached); 766 } 767}