001package org.jsoup.parser; 002 003import org.jsoup.internal.StringUtil; 004import org.jsoup.helper.Validate; 005 006/** 007 A character reader with helpers focusing on parsing CSS selectors. Used internally by jsoup. API subject to changes. 008 */ 009 010public class TokenQueue { 011 private static final char Esc = '\\'; // escape char for chomp balanced. 012 private static final char Hyphen_Minus = '-'; 013 private static final char Unicode_Null = '\u0000'; 014 private static final char Replacement = '\uFFFD'; 015 016 private final CharacterReader reader; 017 018 /** 019 Create a new TokenQueue. 020 @param data string of data to back queue. 021 */ 022 public TokenQueue(String data) { 023 reader = new CharacterReader(data); 024 } 025 026 /** 027 Is the queue empty? 028 @return true if no data left in queue. 029 */ 030 public boolean isEmpty() { 031 return reader.isEmpty(); 032 } 033 034 /** 035 Consume one character off queue. 036 @return first character on queue. 037 */ 038 public char consume() { 039 return reader.consume(); 040 } 041 042 /** 043 Drops the next character off the queue. 044 */ 045 public void advance() { 046 if (!isEmpty()) reader.advance(); 047 } 048 049 char current() { 050 return reader.current(); 051 } 052 053 /** 054 Internal method, no longer supported. 055 @deprecated will be removed in 1.21.1. 056 */ 057 @Deprecated public void addFirst(String seq) { 058 // only left in for API compat; could not find any public uses 059 // not very performant, but an edge case 060 throw new UnsupportedOperationException("addFirst() not supported"); 061 } 062 063 /** 064 Tests if the next characters on the queue match the sequence, case-insensitively. 065 @param seq String to check queue for. 066 @return true if the next characters match. 067 */ 068 public boolean matches(String seq) { 069 return reader.matchesIgnoreCase(seq); 070 } 071 072 /** Tests if the next character on the queue matches the character, case-sensitively. */ 073 public boolean matches(char c) { 074 return reader.matches(c); 075 } 076 077 /** 078 @deprecated will be removed in 1.21.1. 079 */ 080 @Deprecated public boolean matchesAny(String... seq) { 081 for (String s : seq) { 082 if (matches(s)) 083 return true; 084 } 085 return false; 086 } 087 088 /** 089 Tests if the next characters match any of the sequences, case-<b>sensitively</b>. 090 @param seq list of chars to case-sensitively check for 091 @return true of any matched, false if none did 092 */ 093 public boolean matchesAny(char... seq) { 094 return reader.matchesAny(seq); 095 } 096 097 /** 098 If the queue case-insensitively matches the supplied string, consume it off the queue. 099 @param seq String to search for, and if found, remove from queue. 100 @return true if found and removed, false if not found. 101 */ 102 public boolean matchChomp(String seq) { 103 return reader.matchConsumeIgnoreCase(seq); 104 } 105 106 /** If the queue matches the supplied (case-sensitive) character, consume it off the queue. */ 107 public boolean matchChomp(char c) { 108 if (reader.matches(c)) { 109 consume(); 110 return true; 111 } 112 return false; 113 } 114 115 /** 116 Tests if queue starts with a whitespace character. 117 @return if starts with whitespace 118 */ 119 public boolean matchesWhitespace() { 120 return StringUtil.isWhitespace(reader.current()); 121 } 122 123 /** 124 Test if the queue matches a tag word character (letter or digit). 125 @return if matches a word character 126 */ 127 public boolean matchesWord() { 128 return Character.isLetterOrDigit(reader.current()); 129 } 130 131 /** 132 Consumes the supplied sequence of the queue, case-insensitively. If the queue does not start with the supplied 133 sequence, will throw an illegal state exception -- but you should be running match() against that condition. 134 135 @param seq sequence to remove from head of queue. 136 */ 137 public void consume(String seq) { 138 boolean found = reader.matchConsumeIgnoreCase(seq); 139 if (!found) throw new IllegalStateException("Queue did not match expected sequence"); 140 } 141 142 /** 143 Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out. 144 @param seq String to end on (and not include in return, but leave on queue). <b>Case-sensitive.</b> 145 @return The matched data consumed from queue. 146 */ 147 public String consumeTo(String seq) { 148 return reader.consumeTo(seq); 149 } 150 151 /* 152 @deprecated will be removed in 1.21.1 153 */ 154 @Deprecated public String consumeToIgnoreCase(String seq) { 155 StringBuilder sb = StringUtil.borrowBuilder(); 156 while (!isEmpty() && !reader.matchesIgnoreCase(seq)) { 157 sb.append(consume()); 158 } 159 return StringUtil.releaseBuilder(sb); 160 } 161 162 /** 163 Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue. 164 @param seq any number of terminators to consume to. <b>Case-insensitive.</b> 165 @return consumed string 166 */ 167 public String consumeToAny(String... seq) { 168 StringBuilder sb = StringUtil.borrowBuilder(); 169 OUT: while (!isEmpty()) { 170 for (String s : seq) { 171 if (reader.matchesIgnoreCase(s)) break OUT; 172 } 173 sb.append(consume()); 174 } 175 return StringUtil.releaseBuilder(sb); 176 } 177 178 /** 179 * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it). 180 * <p> 181 * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go 182 * isEmpty() == true). 183 * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case-sensitive.</b> 184 * @return Data matched from queue. 185 * @deprecated will be removed in 1.21.1 186 */ 187 @Deprecated public String chompTo(String seq) { 188 String data = reader.consumeTo(seq); 189 matchChomp(seq); 190 return data; 191 } 192 193 /** 194 @deprecated will be removed in 1.21.1. 195 */ 196 @Deprecated public String chompToIgnoreCase(String seq) { 197 String data = consumeToIgnoreCase(seq); // case insensitive scan 198 matchChomp(seq); 199 return data; 200 } 201 202 /** 203 Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three", 204 and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \). 205 Those escapes will be left in the returned string, which is suitable for regexes (where we need to preserve the 206 escape), but unsuitable for contains text strings; use unescape for that. 207 208 @param open opener 209 @param close closer 210 @return data matched from the queue 211 */ 212 public String chompBalanced(char open, char close) { 213 StringBuilder accum = StringUtil.borrowBuilder(); 214 int depth = 0; 215 char last = 0; 216 boolean inSingleQuote = false; 217 boolean inDoubleQuote = false; 218 boolean inRegexQE = false; // regex \Q .. \E escapes from Pattern.quote() 219 reader.mark(); // mark the initial position to restore if needed 220 221 do { 222 if (isEmpty()) break; 223 char c = consume(); 224 if (last != Esc) { 225 if (c == '\'' && c != open && !inDoubleQuote) 226 inSingleQuote = !inSingleQuote; 227 else if (c == '"' && c != open && !inSingleQuote) 228 inDoubleQuote = !inDoubleQuote; 229 if (inSingleQuote || inDoubleQuote || inRegexQE) { 230 accum.append(c); 231 last = c; 232 continue; 233 } 234 235 if (c == open) { 236 depth++; 237 if (depth > 1) accum.append(c); // don't include the outer match pair in the return 238 } 239 else if (c == close) { 240 depth--; 241 if (depth > 0) accum.append(c); // don't include the outer match pair in the return 242 } else { 243 accum.append(c); 244 } 245 } else if (c == 'Q') { 246 inRegexQE = true; 247 accum.append(c); 248 } else if (c == 'E') { 249 inRegexQE = false; 250 accum.append(c); 251 } else { 252 accum.append(c); 253 } 254 255 last = c; 256 } while (depth > 0); 257 258 String out = StringUtil.releaseBuilder(accum); 259 if (depth > 0) {// ran out of queue before seeing enough ) 260 reader.rewindToMark(); // restore position if we don't have a balanced string 261 Validate.fail("Did not find balanced marker at '" + out + "'"); 262 } 263 return out; 264 } 265 266 /** 267 * Unescape a \ escaped string. 268 * @param in backslash escaped string 269 * @return unescaped string 270 */ 271 public static String unescape(String in) { 272 if (in.indexOf(Esc) == -1) return in; 273 274 StringBuilder out = StringUtil.borrowBuilder(); 275 char last = 0; 276 for (char c : in.toCharArray()) { 277 if (c == Esc) { 278 if (last == Esc) { 279 out.append(c); 280 c = 0; 281 } 282 } 283 else 284 out.append(c); 285 last = c; 286 } 287 return StringUtil.releaseBuilder(out); 288 } 289 290 /** 291 Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be 292 valid in a selector. 293 294 @see <a href="https://www.w3.org/TR/cssom-1/#serialize-an-identifier">CSS Object Model, serialize an identifier</a> 295 */ 296 public static String escapeCssIdentifier(String in) { 297 if (in.isEmpty()) return in; 298 299 StringBuilder out = StringUtil.borrowBuilder(); 300 TokenQueue q = new TokenQueue(in); 301 302 char firstChar = q.current(); 303 if (firstChar == Hyphen_Minus) { 304 q.advance(); 305 if (q.isEmpty()) { 306 // If the character is the first character and is a "-" (U+002D), and there is no second character, then 307 // the escaped character. 308 appendEscaped(out, Hyphen_Minus); 309 } else { 310 out.append(Hyphen_Minus); 311 312 char secondChar = q.current(); 313 if (StringUtil.isDigit(secondChar)) { 314 // If the character is the second character and is in the range [0-9] (U+0030 to U+0039) and the 315 // first character is a "-" (U+002D), then the character escaped as code point. 316 appendEscapedCodepoint(out, q.consume()); 317 } 318 } 319 } else if (StringUtil.isDigit(firstChar)) { 320 // If the character is the first character and is in the range [0-9] (U+0030 to U+0039), then the character 321 // escaped as code point. 322 appendEscapedCodepoint(out, q.consume()); 323 } 324 325 while (!q.isEmpty()) { 326 // Note: It's fine to iterate on chars because non-ASCII characters are never escaped. So surrogate pairs 327 // are kept intact. 328 char c = q.consume(); 329 if (c == Unicode_Null) { 330 // If the character is NULL (U+0000), then the REPLACEMENT CHARACTER (U+FFFD). 331 out.append(Replacement); 332 } else if (c <= '\u001F' || c == '\u007F') { 333 // If the character is in the range [\1-\1f] (U+0001 to U+001F) or is U+007F, then the character 334 // escaped as code point. 335 appendEscapedCodepoint(out, c); 336 } else if (isIdent(c)) { 337 // If the character is not handled by one of the above rules and is greater than or equal to U+0080, 338 // is "-" (U+002D) or "_" (U+005F), or is in one of the ranges [0-9] (U+0030 to U+0039), 339 // [A-Z] (U+0041 to U+005A), or [a-z] (U+0061 to U+007A), then the character itself. 340 out.append(c); 341 } else { 342 // Otherwise, the escaped character. 343 appendEscaped(out, c); 344 } 345 } 346 347 return StringUtil.releaseBuilder(out); 348 } 349 350 private static void appendEscaped(StringBuilder out, char c) { 351 out.append(Esc).append(c); 352 } 353 354 private static void appendEscapedCodepoint(StringBuilder out, char c) { 355 out.append(Esc).append(Integer.toHexString(c)).append(' '); 356 } 357 358 /** 359 * Pulls the next run of whitespace characters of the queue. 360 * @return Whether consuming whitespace or not 361 */ 362 public boolean consumeWhitespace() { 363 boolean seen = false; 364 while (matchesWhitespace()) { 365 advance(); 366 seen = true; 367 } 368 return seen; 369 } 370 371 /** 372 * Retrieves the next run of word type (letter or digit) off the queue. 373 * @return String of word characters from queue, or empty string if none. 374 @deprecated will be removed in 1.21.1 375 */ 376 @Deprecated public String consumeWord() { 377 return reader.consumeMatching(Character::isLetterOrDigit); 378 } 379 380 /** 381 * Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects). 382 * 383 * @return tag name 384 */ 385 public String consumeElementSelector() { 386 return consumeEscapedCssIdentifier(ElementSelectorChars); 387 } 388 private static final char[] ElementSelectorChars = {'*', '|', '_', '-'}; 389 390 /** 391 Consume a CSS identifier (ID or class) off the queue. 392 <p>Note: For backwards compatibility this method supports improperly formatted CSS identifiers, e.g. {@code 1} instead 393 of {@code \31}.</p> 394 395 @return The unescaped identifier. 396 @throws IllegalArgumentException if an invalid escape sequence was found. Afterward, the state of the TokenQueue 397 is undefined. 398 @see <a href="https://www.w3.org/TR/css-syntax-3/#consume-name">CSS Syntax Module Level 3, Consume an ident sequence</a> 399 @see <a href="https://www.w3.org/TR/css-syntax-3/#typedef-ident-token">CSS Syntax Module Level 3, ident-token</a> 400 */ 401 public String consumeCssIdentifier() { 402 if (isEmpty()) throw new IllegalArgumentException("CSS identifier expected, but end of input found"); 403 404 // Fast path for CSS identifiers that don't contain escape sequences. 405 String identifier = reader.consumeMatching(TokenQueue::isIdent); 406 char c = current(); 407 if (c != Esc && c != Unicode_Null) { 408 // If we didn't end on an Esc or a Null, we consumed the whole identifier 409 return identifier; 410 } 411 412 // An escape sequence was found. Use a StringBuilder to store the decoded CSS identifier. 413 StringBuilder out = StringUtil.borrowBuilder(); 414 if (!identifier.isEmpty()) { 415 // Copy the CSS identifier up to the first escape sequence. 416 out.append(identifier); 417 } 418 419 while (!isEmpty()) { 420 c = current(); 421 if (isIdent(c)) { 422 out.append(consume()); 423 } else if (c == Unicode_Null) { 424 // https://www.w3.org/TR/css-syntax-3/#input-preprocessing 425 advance(); 426 out.append(Replacement); 427 } else if (c == Esc) { 428 advance(); 429 if (!isEmpty() && isNewline(current())) { 430 // Not a valid escape sequence. This is treated as the end of the CSS identifier. 431 reader.unconsume(); 432 break; 433 } else { 434 consumeCssEscapeSequenceInto(out); 435 } 436 } else { 437 break; 438 } 439 } 440 return StringUtil.releaseBuilder(out); 441 } 442 443 private void consumeCssEscapeSequenceInto(StringBuilder out) { 444 if (isEmpty()) { 445 out.append(Replacement); 446 return; 447 } 448 449 char firstEscaped = consume(); 450 if (!StringUtil.isHexDigit(firstEscaped)) { 451 out.append(firstEscaped); 452 } else { 453 reader.unconsume(); // put back the first hex digit 454 String hexString = reader.consumeMatching(StringUtil::isHexDigit, 6); // consume up to 6 hex digits 455 int codePoint; 456 try { 457 codePoint = Integer.parseInt(hexString, 16); 458 } catch (NumberFormatException e) { 459 throw new IllegalArgumentException("Invalid escape sequence: " + hexString, e); 460 } 461 if (isValidCodePoint(codePoint)) { 462 out.appendCodePoint(codePoint); 463 } else { 464 out.append(Replacement); 465 } 466 467 if (!isEmpty()) { 468 char c = current(); 469 if (c == '\r') { 470 // Since there's currently no input preprocessing, check for CRLF here. 471 // https://www.w3.org/TR/css-syntax-3/#input-preprocessing 472 advance(); 473 if (!isEmpty() && current() == '\n') advance(); 474 } else if (c == ' ' || c == '\t' || isNewline(c)) { 475 advance(); 476 } 477 } 478 } 479 } 480 481 // statics below specifically for CSS identifiers: 482 483 // https://www.w3.org/TR/css-syntax-3/#non-ascii-code-point 484 private static boolean isNonAscii(char c) { 485 return c >= '\u0080'; 486 } 487 488 // https://www.w3.org/TR/css-syntax-3/#ident-start-code-point 489 private static boolean isIdentStart(char c) { 490 return c == '_' || StringUtil.isAsciiLetter(c) || isNonAscii(c); 491 } 492 493 // https://www.w3.org/TR/css-syntax-3/#ident-code-point 494 private static boolean isIdent(char c) { 495 return c == Hyphen_Minus || StringUtil.isDigit(c) || isIdentStart(c); 496 } 497 498 // https://www.w3.org/TR/css-syntax-3/#newline 499 // Note: currently there's no preprocessing happening. 500 private static boolean isNewline(char c) { 501 return c == '\n' || c == '\r' || c == '\f'; 502 } 503 504 // https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point 505 private static boolean isValidCodePoint(int codePoint) { 506 return codePoint != 0 && Character.isValidCodePoint(codePoint) && !Character.isSurrogate((char) codePoint); 507 } 508 509 private static final char[] CssIdentifierChars = {'-', '_'}; 510 511 private String consumeEscapedCssIdentifier(char... matches) { 512 StringBuilder sb = StringUtil.borrowBuilder(); 513 while (!isEmpty()) { 514 char c = current(); 515 if (c == Esc) { 516 advance(); 517 if (!isEmpty()) sb.append(consume()); 518 else break; 519 } else if (matchesCssIdentifier(matches)) { 520 sb.append(c); 521 advance(); 522 } else { 523 break; 524 } 525 } 526 return StringUtil.releaseBuilder(sb); 527 } 528 529 private boolean matchesCssIdentifier(char... matches) { 530 return matchesWord() || reader.matchesAny(matches); 531 } 532 533 /** 534 Consume and return whatever is left on the queue. 535 @return remainder of queue. 536 */ 537 public String remainder() { 538 return reader.consumeToEnd(); 539 } 540 541 @Override 542 public String toString() { 543 return reader.toString(); 544 } 545}