001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.SoftPool; 005import org.jsoup.internal.StringUtil; 006import org.jspecify.annotations.Nullable; 007 008import java.io.IOException; 009import java.io.UncheckedIOException; 010import java.io.Reader; 011import java.io.StringReader; 012import java.util.ArrayList; 013import java.util.Arrays; 014import java.util.Collections; 015import java.util.Locale; 016 017/** 018 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes. 019 */ 020public final class CharacterReader { 021 static final char EOF = (char) -1; 022 private static final int MaxStringCacheLen = 12; 023 private static final int StringCacheSize = 512; 024 private String[] stringCache; // holds reused strings in this doc, to lessen garbage 025 private static final SoftPool<String[]> StringPool = new SoftPool<>(() -> new String[StringCacheSize]); // reuse cache between iterations 026 027 static final int BufferSize = 1024 * 2; // visible for testing 028 static final int RefillPoint = BufferSize / 2; // when bufPos characters read, refill; visible for testing 029 private static final int RewindLimit = 1024; // the maximum we can rewind. No HTML entities can be larger than this. 030 031 private Reader reader; // underlying Reader, will be backed by a buffered+controlled input stream, or StringReader 032 private char[] charBuf; // character buffer we consume from; filled from Reader 033 private int bufPos; // position in charBuf that's been consumed to 034 private int bufLength; // the num of characters actually buffered in charBuf, <= charBuf.length 035 private int fillPoint = 0; // how far into the charBuf we read before re-filling. 0.5 of charBuf.length after bufferUp 036 private int consumed; // how many characters total have been consumed from this CharacterReader (less the current bufPos) 037 private int bufMark = -1; // if not -1, the marked rewind position 038 private boolean readFully; // if the underlying stream has been completely read, no value in further buffering 039 040 private static final SoftPool<char[]> BufferPool = new SoftPool<>(() -> new char[BufferSize]); // recycled char buffer 041 042 @Nullable private ArrayList<Integer> newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp() 043 private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)] 044 045 public CharacterReader(Reader input, int sz) { 046 this(input); // sz is no longer used 047 } 048 049 public CharacterReader(Reader input) { 050 Validate.notNull(input); 051 reader = input; 052 charBuf = BufferPool.borrow(); 053 stringCache = StringPool.borrow(); 054 bufferUp(); 055 } 056 057 public CharacterReader(String input) { 058 this(new StringReader(input)); 059 } 060 061 public void close() { 062 if (reader == null) 063 return; 064 try { 065 reader.close(); 066 } catch (IOException ignored) { 067 } finally { 068 reader = null; 069 Arrays.fill(charBuf, (char) 0); // before release, clear the buffer. Not required, but acts as a safety net, and makes debug view clearer 070 BufferPool.release(charBuf); 071 charBuf = null; 072 StringPool.release(stringCache); // conversely, we don't clear the string cache, so we can reuse the contents 073 stringCache = null; 074 } 075 } 076 077 private void bufferUp() { 078 if (readFully || bufPos < fillPoint || bufMark != -1) 079 return; 080 doBufferUp(); // structured so bufferUp may become an intrinsic candidate 081 } 082 083 private void doBufferUp() { 084 /* 085 The flow: 086 - if read fully, or if bufPos < fillPoint, or if marked - do not fill. 087 - update readerPos (total amount consumed from this CharacterReader) += bufPos 088 - shift charBuf contents such that bufPos = 0; set next read offset (bufLength) -= shift amount 089 - loop read the Reader until we fill charBuf. bufLength += read. 090 - readFully = true when read = -1 091 */ 092 consumed += bufPos; 093 bufLength -= bufPos; 094 if (bufLength > 0) 095 System.arraycopy(charBuf, bufPos, charBuf, 0, bufLength); 096 bufPos = 0; 097 while (bufLength < BufferSize) { 098 try { 099 int read = reader.read(charBuf, bufLength, charBuf.length - bufLength); 100 if (read == -1) { 101 readFully = true; 102 break; 103 } 104 bufLength += read; 105 } catch (IOException e) { 106 throw new UncheckedIOException(e); 107 } 108 } 109 fillPoint = Math.min(bufLength, RefillPoint); 110 111 scanBufferForNewlines(); // if enabled, we index newline positions for line number tracking 112 lastIcSeq = null; // cache for last containsIgnoreCase(seq) 113 } 114 115 void mark() { 116 // make sure there is enough look ahead capacity 117 if (bufLength - bufPos < RewindLimit) 118 fillPoint = 0; 119 120 bufferUp(); 121 bufMark = bufPos; 122 } 123 124 void unmark() { 125 bufMark = -1; 126 } 127 128 void rewindToMark() { 129 if (bufMark == -1) 130 throw new UncheckedIOException(new IOException("Mark invalid")); 131 132 bufPos = bufMark; 133 unmark(); 134 } 135 136 /** 137 * Gets the position currently read to in the content. Starts at 0. 138 * @return current position 139 */ 140 public int pos() { 141 return consumed + bufPos; 142 } 143 144 /** Tests if the buffer has been fully read. */ 145 boolean readFully() { 146 return readFully; 147 } 148 149 /** 150 Enables or disables line number tracking. By default, will be <b>off</b>.Tracking line numbers improves the 151 legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of 152 use. 153 154 @param track set tracking on|off 155 @since 1.14.3 156 */ 157 public void trackNewlines(boolean track) { 158 if (track && newlinePositions == null) { 159 newlinePositions = new ArrayList<>(BufferSize / 80); // rough guess of likely count 160 scanBufferForNewlines(); // first pass when enabled; subsequently called during bufferUp 161 } 162 else if (!track) 163 newlinePositions = null; 164 } 165 166 /** 167 Check if the tracking of newlines is enabled. 168 @return the current newline tracking state 169 @since 1.14.3 170 */ 171 public boolean isTrackNewlines() { 172 return newlinePositions != null; 173 } 174 175 /** 176 Get the current line number (that the reader has consumed to). Starts at line #1. 177 @return the current line number, or 1 if line tracking is not enabled. 178 @since 1.14.3 179 @see #trackNewlines(boolean) 180 */ 181 public int lineNumber() { 182 return lineNumber(pos()); 183 } 184 185 int lineNumber(int pos) { 186 // note that this impl needs to be called before the next buffer up or line numberoffset will be wrong. if that 187 // causes issues, can remove the reset of newlinepositions during buffer, at the cost of a larger tracking array 188 if (!isTrackNewlines()) 189 return 1; 190 191 int i = lineNumIndex(pos); 192 if (i == -1) 193 return lineNumberOffset; // first line 194 return i + lineNumberOffset + 1; 195 } 196 197 /** 198 Get the current column number (that the reader has consumed to). Starts at column #1. 199 @return the current column number 200 @since 1.14.3 201 @see #trackNewlines(boolean) 202 */ 203 public int columnNumber() { 204 return columnNumber(pos()); 205 } 206 207 int columnNumber(int pos) { 208 if (!isTrackNewlines()) 209 return pos + 1; 210 211 int i = lineNumIndex(pos); 212 if (i == -1) 213 return pos + 1; 214 return pos - newlinePositions.get(i) + 1; 215 } 216 217 /** 218 Get a formatted string representing the current line and column positions. E.g. <code>5:10</code> indicating line 219 number 5 and column number 10. 220 @return line:col position 221 @since 1.14.3 222 @see #trackNewlines(boolean) 223 */ 224 String posLineCol() { 225 return lineNumber() + ":" + columnNumber(); 226 } 227 228 private int lineNumIndex(int pos) { 229 if (!isTrackNewlines()) return 0; 230 int i = Collections.binarySearch(newlinePositions, pos); 231 if (i < -1) i = Math.abs(i) - 2; 232 return i; 233 } 234 235 /** 236 Scans the buffer for newline position, and tracks their location in newlinePositions. 237 */ 238 private void scanBufferForNewlines() { 239 if (!isTrackNewlines()) 240 return; 241 242 if (newlinePositions.size() > 0) { 243 // work out the line number that we have read up to (as we have likely scanned past this point) 244 int index = lineNumIndex(consumed); 245 if (index == -1) index = 0; // first line 246 int linePos = newlinePositions.get(index); 247 lineNumberOffset += index; // the num lines we've read up to 248 newlinePositions.clear(); 249 newlinePositions.add(linePos); // roll the last read pos to first, for cursor num after buffer 250 } 251 252 for (int i = bufPos; i < bufLength; i++) { 253 if (charBuf[i] == '\n') 254 newlinePositions.add(1 + consumed + i); 255 } 256 } 257 258 /** 259 * Tests if all the content has been read. 260 * @return true if nothing left to read. 261 */ 262 public boolean isEmpty() { 263 bufferUp(); 264 return bufPos >= bufLength; 265 } 266 267 private boolean isEmptyNoBufferUp() { 268 return bufPos >= bufLength; 269 } 270 271 /** 272 * Get the char at the current position. 273 * @return char 274 */ 275 public char current() { 276 bufferUp(); 277 return isEmptyNoBufferUp() ? EOF : charBuf[bufPos]; 278 } 279 280 /** 281 Consume one character off the queue. 282 @return first character on queue, or EOF if the queue is empty. 283 */ 284 public char consume() { 285 bufferUp(); 286 char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos]; 287 bufPos++; 288 return val; 289 } 290 291 /** 292 Unconsume one character (bufPos--). MUST only be called directly after a consume(), and no chance of a bufferUp. 293 */ 294 void unconsume() { 295 if (bufPos < 1) 296 throw new UncheckedIOException(new IOException("WTF: No buffer left to unconsume.")); // a bug if this fires, need to trace it. 297 298 bufPos--; 299 } 300 301 /** 302 * Moves the current position by one. 303 */ 304 public void advance() { 305 bufPos++; 306 } 307 308 /** 309 * Returns the number of characters between the current position and the next instance of the input char 310 * @param c scan target 311 * @return offset between current position and next instance of target. -1 if not found. 312 */ 313 int nextIndexOf(char c) { 314 // doesn't handle scanning for surrogates 315 bufferUp(); 316 for (int i = bufPos; i < bufLength; i++) { 317 if (c == charBuf[i]) 318 return i - bufPos; 319 } 320 return -1; 321 } 322 323 /** 324 * Returns the number of characters between the current position and the next instance of the input sequence 325 * 326 * @param seq scan target 327 * @return offset between current position and next instance of target. -1 if not found. 328 */ 329 int nextIndexOf(CharSequence seq) { 330 bufferUp(); 331 // doesn't handle scanning for surrogates 332 char startChar = seq.charAt(0); 333 for (int offset = bufPos; offset < bufLength; offset++) { 334 // scan to first instance of startchar: 335 if (startChar != charBuf[offset]) 336 while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ } 337 int i = offset + 1; 338 int last = i + seq.length()-1; 339 if (offset < bufLength && last <= bufLength) { 340 for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ } 341 if (i == last) // found full sequence 342 return offset - bufPos; 343 } 344 } 345 return -1; 346 } 347 348 /** 349 * Reads characters up to the specific char. 350 * @param c the delimiter 351 * @return the chars read 352 */ 353 public String consumeTo(char c) { 354 int offset = nextIndexOf(c); 355 if (offset != -1) { 356 String consumed = cacheString(charBuf, stringCache, bufPos, offset); 357 bufPos += offset; 358 return consumed; 359 } else { 360 return consumeToEnd(); 361 } 362 } 363 364 /** 365 Reads the characters up to (but not including) the specified case-sensitive string. 366 <p>If the sequence is not found in the buffer, will return the remainder of the current buffered amount, less the 367 length of the sequence, such that this call may be repeated. 368 @param seq the delimiter 369 @return the chars read 370 */ 371 public String consumeTo(String seq) { 372 int offset = nextIndexOf(seq); 373 if (offset != -1) { 374 String consumed = cacheString(charBuf, stringCache, bufPos, offset); 375 bufPos += offset; 376 return consumed; 377 } else if (bufLength - bufPos < seq.length()) { 378 // nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF 379 return consumeToEnd(); 380 } else { 381 // the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters 382 // unread in case they contain the beginning of the search string 383 int endPos = bufLength - seq.length() + 1; 384 String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos); 385 bufPos = endPos; 386 return consumed; 387 } 388 } 389 390 /** 391 Read characters while the input predicate returns true. 392 @return characters read 393 */ 394 String consumeMatching(CharPredicate func) { 395 return consumeMatching(func, -1); 396 } 397 398 /** 399 Read characters while the input predicate returns true, up to a maximum length. 400 @param func predicate to test 401 @param maxLength maximum length to read. -1 indicates no maximum 402 @return characters read 403 */ 404 String consumeMatching(CharPredicate func, int maxLength) { 405 bufferUp(); 406 int pos = bufPos; 407 final int start = pos; 408 final int remaining = bufLength; 409 final char[] val = charBuf; 410 411 while (pos < remaining && (maxLength == -1 || pos - start < maxLength) && func.test(val[pos])) { 412 pos++; 413 } 414 415 bufPos = pos; 416 return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : ""; 417 } 418 419 /** 420 * Read characters until the first of any delimiters is found. 421 * @param chars delimiters to scan for 422 * @return characters read up to the matched delimiter. 423 */ 424 public String consumeToAny(final char... chars) { 425 return consumeMatching(c -> { // seeks until we see one of the terminating chars 426 for (char seek : chars) 427 if (c == seek) return false; 428 return true; 429 }); 430 } 431 432 String consumeToAnySorted(final char... chars) { 433 return consumeMatching(c -> Arrays.binarySearch(chars, c) < 0); // matches until a hit 434 } 435 436 String consumeData() { 437 // consumes until &, <, null 438 return consumeMatching(c -> c != '&' && c != '<' && c != TokeniserState.nullChar); 439 } 440 441 String consumeAttributeQuoted(final boolean single) { 442 // null, " or ', & 443 return consumeMatching(c -> c != TokeniserState.nullChar && c != '&' && (single ? c != '\'' : c != '"')); 444 } 445 446 String consumeRawData() { 447 // <, null 448 return consumeMatching(c -> c != '<' && c != TokeniserState.nullChar); 449 } 450 451 String consumeTagName() { 452 // '\t', '\n', '\r', '\f', ' ', '/', '>' 453 // NOTE: out of spec; does not stop and append on nullChar but eats 454 return consumeMatching(c -> { 455 switch (c) { 456 case '\t': 457 case '\n': 458 case '\r': 459 case '\f': 460 case ' ': 461 case '/': 462 case '>': 463 return false; 464 } 465 return true; 466 }); 467 } 468 469 String consumeToEnd() { 470 bufferUp(); 471 String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos); 472 bufPos = bufLength; 473 return data; 474 } 475 476 String consumeLetterSequence() { 477 return consumeMatching(Character::isLetter); 478 } 479 480 String consumeLetterThenDigitSequence() { 481 bufferUp(); 482 int start = bufPos; 483 while (bufPos < bufLength) { 484 if (StringUtil.isAsciiLetter(charBuf[bufPos])) bufPos++; 485 else break; 486 } 487 while (!isEmptyNoBufferUp()) { 488 if (StringUtil.isDigit(charBuf[bufPos])) bufPos++; 489 else break; 490 } 491 492 return cacheString(charBuf, stringCache, start, bufPos - start); 493 } 494 495 String consumeHexSequence() { 496 return consumeMatching(StringUtil::isHexDigit); 497 } 498 499 String consumeDigitSequence() { 500 return consumeMatching(c -> c >= '0' && c <= '9'); 501 } 502 503 boolean matches(char c) { 504 return !isEmpty() && charBuf[bufPos] == c; 505 } 506 507 boolean matches(String seq) { 508 bufferUp(); 509 int scanLength = seq.length(); 510 if (scanLength > bufLength - bufPos) 511 return false; 512 513 for (int offset = 0; offset < scanLength; offset++) 514 if (seq.charAt(offset) != charBuf[bufPos +offset]) 515 return false; 516 return true; 517 } 518 519 boolean matchesIgnoreCase(String seq) { 520 bufferUp(); 521 int scanLength = seq.length(); 522 if (scanLength > bufLength - bufPos) 523 return false; 524 525 for (int offset = 0; offset < scanLength; offset++) { 526 char scan = seq.charAt(offset); 527 char target = charBuf[bufPos + offset]; 528 if (scan == target) continue; 529 530 scan = Character.toUpperCase(scan); 531 target = Character.toUpperCase(target); 532 if (scan != target) return false; 533 } 534 return true; 535 } 536 537 /** 538 Tests if the next character in the queue matches any of the characters in the sequence, case sensitively. 539 @param seq list of characters to check for 540 @return true if any matched, false if none did 541 */ 542 boolean matchesAny(char... seq) { 543 if (isEmpty()) 544 return false; 545 546 bufferUp(); 547 char c = charBuf[bufPos]; 548 for (char seek : seq) { 549 if (seek == c) 550 return true; 551 } 552 return false; 553 } 554 555 boolean matchesAnySorted(char[] seq) { 556 bufferUp(); 557 return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0; 558 } 559 560 /** 561 Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha 562 @return if it matches or not 563 */ 564 boolean matchesAsciiAlpha() { 565 if (isEmpty()) return false; 566 return StringUtil.isAsciiLetter(charBuf[bufPos]); 567 } 568 569 boolean matchesDigit() { 570 if (isEmpty()) return false; 571 return StringUtil.isDigit(charBuf[bufPos]); 572 } 573 574 boolean matchConsume(String seq) { 575 bufferUp(); 576 if (matches(seq)) { 577 bufPos += seq.length(); 578 return true; 579 } else { 580 return false; 581 } 582 } 583 584 boolean matchConsumeIgnoreCase(String seq) { 585 if (matchesIgnoreCase(seq)) { 586 bufPos += seq.length(); 587 return true; 588 } else { 589 return false; 590 } 591 } 592 593 // we maintain a cache of the previously scanned sequence, and return that if applicable on repeated scans. 594 // that improves the situation where there is a sequence of <p<p<p<p<p<p<p...</title> and we're bashing on the <p 595 // looking for the </title>. Resets in bufferUp() 596 @Nullable private String lastIcSeq; // scan cache 597 private int lastIcIndex; // nearest found indexOf 598 599 /** Used to check presence of </title>, </style> when we're in RCData and see a <xxx. Only finds consistent case. */ 600 boolean containsIgnoreCase(String seq) { 601 if (seq.equals(lastIcSeq)) { 602 if (lastIcIndex == -1) return false; 603 if (lastIcIndex >= bufPos) return true; 604 } 605 lastIcSeq = seq; 606 607 String loScan = seq.toLowerCase(Locale.ENGLISH); 608 int lo = nextIndexOf(loScan); 609 if (lo > -1) { 610 lastIcIndex = bufPos + lo; return true; 611 } 612 613 String hiScan = seq.toUpperCase(Locale.ENGLISH); 614 int hi = nextIndexOf(hiScan); 615 boolean found = hi > -1; 616 lastIcIndex = found ? bufPos + hi : -1; // we don't care about finding the nearest, just that buf contains 617 return found; 618 } 619 620 @Override 621 public String toString() { 622 if (bufLength - bufPos < 0) return ""; 623 return new String(charBuf, bufPos, bufLength - bufPos); 624 } 625 626 /** 627 * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks. 628 * <p /> 629 * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list. 630 * That saves both having to create objects as hash keys, and running through the entry list, at the expense of 631 * some more duplicates. 632 */ 633 private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) { 634 if (count > MaxStringCacheLen) // don't cache strings that are too big 635 return new String(charBuf, start, count); 636 if (count < 1) 637 return ""; 638 639 // calculate hash: 640 int hash = 0; 641 int end = count + start; 642 for (int i = start; i < end; i++) { 643 hash = 31 * hash + charBuf[i]; 644 } 645 646 // get from cache 647 final int index = hash & StringCacheSize - 1; 648 String cached = stringCache[index]; 649 650 if (cached != null && rangeEquals(charBuf, start, count, cached)) // positive hit 651 return cached; 652 else { 653 cached = new String(charBuf, start, count); 654 stringCache[index] = cached; // add or replace, assuming most recently used are most likely to recur next 655 } 656 657 return cached; 658 } 659 660 /** 661 * Check if the value of the provided range equals the string. 662 */ 663 static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) { 664 if (count == cached.length()) { 665 int i = start; 666 int j = 0; 667 while (count-- != 0) { 668 if (charBuf[i++] != cached.charAt(j++)) 669 return false; 670 } 671 return true; 672 } 673 return false; 674 } 675 676 // just used for testing 677 boolean rangeEquals(final int start, final int count, final String cached) { 678 return rangeEquals(charBuf, start, count, cached); 679 } 680 681 @FunctionalInterface 682 interface CharPredicate { 683 boolean test(char c); 684 } 685}