001package org.jsoup.parser;
002
003import org.jsoup.internal.StringUtil;
004import org.jsoup.helper.Validate;
005
006/**
007 A character reader with helpers focusing on parsing CSS selectors. Used internally by jsoup. API subject to changes.
008 */
009
010public class TokenQueue {
    // Backslash: the escape character honoured by chompBalanced, unescape, and the CSS identifier helpers.
    private static final char Esc = '\\';
    // "-" (U+002D); gets special treatment at the start of a CSS identifier when escaping.
    private static final char Hyphen_Minus = '-';
    // NULL (U+0000); swapped for Replacement when escaping or consuming CSS identifiers.
    private static final char Unicode_Null = '\u0000';
    // REPLACEMENT CHARACTER (U+FFFD); substituted for NULL and for invalid escaped code points.
    private static final char Replacement = '\uFFFD';

    // The character source that every queue operation in this class delegates to.
    private final CharacterReader reader;
017
    /**
     Create a new TokenQueue.
     @param data string of data to back queue; it is wrapped in a new {@link CharacterReader}.
     */
    public TokenQueue(String data) {
        reader = new CharacterReader(data);
    }
025
    /**
     Is the queue empty? Delegates to the underlying reader.
     @return true if no data left in queue.
     */
    public boolean isEmpty() {
        return reader.isEmpty();
    }
033
    /**
     Consume one character off queue, by delegating to the underlying reader.
     @return first character on queue.
     */
    public char consume() {
        return reader.consume();
    }
041
042    /**
043     Drops the next character off the queue.
044     */
045    public void advance() {
046        if (!isEmpty()) reader.advance();
047    }
048
    /**
     Package-private accessor for the underlying reader's current character. Callers in this class read it and then
     separately call {@link #advance()} or {@link #consume()} to move past it.
     @return the reader's current character
     */
    char current() {
        return reader.current();
    }
052
    /**
     Internal method, no longer supported. Always throws.
     @param seq ignored.
     @throws UnsupportedOperationException always.
     @deprecated will be removed in 1.21.1.
     */
    @Deprecated public void addFirst(String seq) {
        // only left in for API compat; could not find any public uses
        throw new UnsupportedOperationException("addFirst() not supported");
    }
062
    /**
     Tests if the next characters on the queue match the sequence, case-insensitively. Delegates to the reader's
     {@code matchesIgnoreCase}.
     @param seq String to check queue for.
     @return true if the next characters match.
     */
    public boolean matches(String seq) {
        return reader.matchesIgnoreCase(seq);
    }
071
    /**
     Tests if the next character on the queue matches the character, case-sensitively.
     @param c the character to check for.
     @return true if the next character matches.
     */
    public boolean matches(char c) {
        return reader.matches(c);
    }
076
077    /**
078     @deprecated will be removed in 1.21.1.
079     */
080    @Deprecated public boolean matchesAny(String... seq) {
081        for (String s : seq) {
082            if (matches(s))
083                return true;
084        }
085        return false;
086    }
087
    /**
     Tests if the next character matches any of the supplied characters, case-<b>sensitively</b>.
     @param seq list of chars to case-sensitively check for
     @return true if any matched, false if none did
     */
    public boolean matchesAny(char... seq) {
        return reader.matchesAny(seq);
    }
096
    /**
     If the queue case-insensitively matches the supplied string, consume it off the queue. Delegates to the reader's
     {@code matchConsumeIgnoreCase}.
     @param seq String to search for, and if found, remove from queue.
     @return true if found and removed, false if not found.
     */
    public boolean matchChomp(String seq) {
        return reader.matchConsumeIgnoreCase(seq);
    }
105
106    /** If the queue matches the supplied (case-sensitive) character, consume it off the queue. */
107    public boolean matchChomp(char c) {
108        if (reader.matches(c)) {
109            consume();
110            return true;
111        }
112        return false;
113    }
114
    /**
     Tests if queue starts with a whitespace character, as judged by {@code StringUtil.isWhitespace} on the reader's
     current character.
     @return if starts with whitespace
     */
    public boolean matchesWhitespace() {
        return StringUtil.isWhitespace(reader.current());
    }
122
    /**
     Test if the queue matches a tag word character (letter or digit), as judged by
     {@link Character#isLetterOrDigit(char)} on the reader's current character.
     @return if matches a word character
     */
    public boolean matchesWord() {
        return Character.isLetterOrDigit(reader.current());
    }
130
131    /**
132     Consumes the supplied sequence of the queue, case-insensitively. If the queue does not start with the supplied
133     sequence, will throw an illegal state exception -- but you should be running match() against that condition.
134
135     @param seq sequence to remove from head of queue.
136     */
137    public void consume(String seq) {
138        boolean found = reader.matchConsumeIgnoreCase(seq);
139        if (!found) throw new IllegalStateException("Queue did not match expected sequence");
140    }
141
    /**
     Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out. Delegates to
     the reader's {@code consumeTo}.
     @param seq String to end on (and not include in return, but leave on queue). <b>Case-sensitive.</b>
     @return The matched data consumed from queue.
     */
    public String consumeTo(String seq) {
        return reader.consumeTo(seq);
    }
150
151    /*
152     @deprecated will be removed in 1.21.1
153     */
154    @Deprecated public String consumeToIgnoreCase(String seq) {
155        StringBuilder sb = StringUtil.borrowBuilder();
156        while (!isEmpty() && !reader.matchesIgnoreCase(seq)) {
157            sb.append(consume());
158        }
159        return StringUtil.releaseBuilder(sb);
160    }
161
162    /**
163     Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue.
164     @param seq any number of terminators to consume to. <b>Case-insensitive.</b>
165     @return consumed string
166     */
167    public String consumeToAny(String... seq) {
168        StringBuilder sb = StringUtil.borrowBuilder();
169        OUT: while (!isEmpty()) {
170            for (String s : seq) {
171                if (reader.matchesIgnoreCase(s)) break OUT;
172            }
173            sb.append(consume());
174        }
175        return StringUtil.releaseBuilder(sb);
176    }
177
    /**
     * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it).
     * <p>
     * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go
     * isEmpty() == true).
     * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case-sensitive.</b>
     * @return Data matched from queue.
     * @deprecated will be removed in 1.21.1
     */
    @Deprecated public String chompTo(String seq) {
        String data = reader.consumeTo(seq);
        // drop the terminator itself; the result is ignored because the queue may have run out before seq was found
        matchChomp(seq);
        return data;
    }
192
    /**
     Pulls a string off the queue up to the supplied sequence (compared case-insensitively), then pulls the sequence
     itself off the queue without returning it.
     @param seq String to match up to, and not include in return, and to pull off queue.
     @return the data consumed before the sequence.
     @deprecated will be removed in 1.21.1.
     */
    @Deprecated public String chompToIgnoreCase(String seq) {
        String data = consumeToIgnoreCase(seq); // case insensitive scan
        // drop the terminator itself; the result is ignored because the queue may have run out before seq was found
        matchChomp(seq);
        return data;
    }
201
    /**
     Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three",
     and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \).
     Those escapes will be left in the returned string, which is suitable for regexes (where we need to preserve the
     escape), but unsuitable for contains text strings; use unescape for that.
     <p>Openers and closers inside a regex {@code \Q .. \E} section are likewise not counted. If the queue runs out
     while the markers are still unbalanced, the reader is rewound to where it started and {@code Validate.fail} is
     called with a message containing the text gathered so far.</p>

     @param open opener
     @param close closer
     @return data matched from the queue
     */
    public String chompBalanced(char open, char close) {
        StringBuilder accum = StringUtil.borrowBuilder();
        int depth = 0; // number of openers seen that have not yet been closed
        char last = 0; // the previously consumed character; used to detect a preceding backslash
        boolean inSingleQuote = false;
        boolean inDoubleQuote = false;
        boolean inRegexQE = false; // regex \Q .. \E escapes from Pattern.quote()
        reader.mark(); // mark the initial position to restore if needed

        do {
            if (isEmpty()) break;
            char c = consume();
            if (last != Esc) {
                // a quote character only toggles quoting when it is not itself the opener, and when we are not
                // already inside the other kind of quote
                if (c == '\'' && c != open && !inDoubleQuote)
                    inSingleQuote = !inSingleQuote;
                else if (c == '"' && c != open && !inSingleQuote)
                    inDoubleQuote = !inDoubleQuote;
                if (inSingleQuote || inDoubleQuote || inRegexQE) {
                    // quoted content is copied verbatim and does not affect depth; note that continue re-tests the
                    // loop condition, so this also ends the loop when no opener has been seen yet (depth == 0)
                    accum.append(c);
                    last = c;
                    continue;
                }

                if (c == open) {
                    depth++;
                    if (depth > 1) accum.append(c); // don't include the outer match pair in the return
                }
                else if (c == close) {
                    depth--;
                    if (depth > 0) accum.append(c); // don't include the outer match pair in the return
                } else {
                    accum.append(c);
                }
            } else if (c == 'Q') {
                // previous char was a backslash: \Q opens a regex-quoted section
                inRegexQE = true;
                accum.append(c);
            } else if (c == 'E') {
                // previous char was a backslash: \E closes a regex-quoted section
                inRegexQE = false;
                accum.append(c);
            } else {
                // any other backslash-escaped character is kept as-is and never counted as an opener or closer
                accum.append(c);
            }

            last = c;
        } while (depth > 0);

        String out = StringUtil.releaseBuilder(accum);
        if (depth > 0) {// ran out of queue before seeing enough )
            reader.rewindToMark(); // restore position if we don't have a balanced string
            Validate.fail("Did not find balanced marker at '" + out + "'");
        }
        return out;
    }
265    
266    /**
267     * Unescape a \ escaped string.
268     * @param in backslash escaped string
269     * @return unescaped string
270     */
271    public static String unescape(String in) {
272        if (in.indexOf(Esc) == -1) return in;
273
274        StringBuilder out = StringUtil.borrowBuilder();
275        char last = 0;
276        for (char c : in.toCharArray()) {
277            if (c == Esc) {
278                if (last == Esc) {
279                    out.append(c);
280                    c = 0;
281                }
282            }
283            else 
284                out.append(c);
285            last = c;
286        }
287        return StringUtil.releaseBuilder(out);
288    }
289
290    /**
291     Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be
292     valid in a selector.
293
294     @see <a href="https://www.w3.org/TR/cssom-1/#serialize-an-identifier">CSS Object Model, serialize an identifier</a>
295     */
296    public static String escapeCssIdentifier(String in) {
297        if (in.isEmpty()) return in;
298
299        StringBuilder out = StringUtil.borrowBuilder();
300        TokenQueue q = new TokenQueue(in);
301
302        char firstChar = q.current();
303        if (firstChar == Hyphen_Minus) {
304            q.advance();
305            if (q.isEmpty()) {
306                // If the character is the first character and is a "-" (U+002D), and there is no second character, then
307                // the escaped character.
308                appendEscaped(out, Hyphen_Minus);
309            } else {
310                out.append(Hyphen_Minus);
311
312                char secondChar = q.current();
313                if (StringUtil.isDigit(secondChar)) {
314                    // If the character is the second character and is in the range [0-9] (U+0030 to U+0039) and the
315                    // first character is a "-" (U+002D), then the character escaped as code point.
316                    appendEscapedCodepoint(out, q.consume());
317                }
318            }
319        } else if (StringUtil.isDigit(firstChar)) {
320            // If the character is the first character and is in the range [0-9] (U+0030 to U+0039), then the character
321            // escaped as code point.
322            appendEscapedCodepoint(out, q.consume());
323        }
324
325        while (!q.isEmpty()) {
326            // Note: It's fine to iterate on chars because non-ASCII characters are never escaped. So surrogate pairs
327            // are kept intact.
328            char c = q.consume();
329            if (c == Unicode_Null) {
330                // If the character is NULL (U+0000), then the REPLACEMENT CHARACTER (U+FFFD).
331                out.append(Replacement);
332            } else if (c <= '\u001F' || c == '\u007F') {
333                // If the character is in the range [\1-\1f] (U+0001 to U+001F) or is U+007F, then the character
334                // escaped as code point.
335                appendEscapedCodepoint(out, c);
336            } else if (isIdent(c)) {
337                // If the character is not handled by one of the above rules and is greater than or equal to U+0080,
338                // is "-" (U+002D) or "_" (U+005F), or is in one of the ranges [0-9] (U+0030 to U+0039),
339                // [A-Z] (U+0041 to U+005A), or [a-z] (U+0061 to U+007A), then the character itself.
340                out.append(c);
341            } else {
342                // Otherwise, the escaped character.
343                appendEscaped(out, c);
344            }
345        }
346
347        return StringUtil.releaseBuilder(out);
348    }
349
    /** Appends a backslash followed by the character itself. */
    private static void appendEscaped(StringBuilder out, char c) {
        out.append(Esc).append(c);
    }
353
    /** Appends a backslash, the character's value in hexadecimal (via Integer.toHexString), and a single space. */
    private static void appendEscapedCodepoint(StringBuilder out, char c) {
        out.append(Esc).append(Integer.toHexString(c)).append(' ');
    }
357
358    /**
359     * Pulls the next run of whitespace characters of the queue.
360     * @return Whether consuming whitespace or not
361     */
362    public boolean consumeWhitespace() {
363        boolean seen = false;
364        while (matchesWhitespace()) {
365            advance();
366            seen = true;
367        }
368        return seen;
369    }
370
    /**
     * Retrieves the next run of word type (letter or digit) off the queue, using
     * {@link Character#isLetterOrDigit(char)} as the matching predicate.
     * @return String of word characters from queue, or empty string if none.
     * @deprecated will be removed in 1.21.1
     */
    @Deprecated public String consumeWord() {
        return reader.consumeMatching(Character::isLetterOrDigit);
    }
379
    /**
     * Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects).
     * Backslash-escaped characters are accepted and returned without the backslash.
     *
     * @return tag name
     */
    public String consumeElementSelector() {
        return consumeEscapedCssIdentifier(ElementSelectorChars);
    }
    // Characters accepted in an element selector in addition to letters and digits.
    private static final char[] ElementSelectorChars = {'*', '|', '_', '-'};
389
    /**
     Consume a CSS identifier (ID or class) off the queue.
     <p>Note: For backwards compatibility this method supports improperly formatted CSS identifiers, e.g. {@code 1} instead
     of {@code \31}.</p>
     <p>Escape sequences are decoded, and NULL characters are replaced with U+FFFD. A backslash directly followed by a
     newline is not an escape: it ends the identifier and is left on the queue.</p>

     @return The unescaped identifier.
     @throws IllegalArgumentException if the queue is empty, or if an invalid escape sequence was found. Afterward, the
     state of the TokenQueue is undefined.
     @see <a href="https://www.w3.org/TR/css-syntax-3/#consume-name">CSS Syntax Module Level 3, Consume an ident sequence</a>
     @see <a href="https://www.w3.org/TR/css-syntax-3/#typedef-ident-token">CSS Syntax Module Level 3, ident-token</a>
     */
    public String consumeCssIdentifier() {
        if (isEmpty()) throw new IllegalArgumentException("CSS identifier expected, but end of input found");

        // Fast path for CSS identifiers that don't contain escape sequences.
        String identifier = reader.consumeMatching(TokenQueue::isIdent);
        char c = current(); // the character that stopped the fast-path scan
        if (c != Esc && c != Unicode_Null) {
            // If we didn't end on an Esc or a Null, we consumed the whole identifier
            return identifier;
        }

        // An escape sequence was found. Use a StringBuilder to store the decoded CSS identifier.
        StringBuilder out = StringUtil.borrowBuilder();
        if (!identifier.isEmpty()) {
            // Copy the CSS identifier up to the first escape sequence.
            out.append(identifier);
        }

        while (!isEmpty()) {
            c = current();
            if (isIdent(c)) {
                // plain identifier character: copy as-is
                out.append(consume());
            } else if (c == Unicode_Null) {
                // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
                advance();
                out.append(Replacement);
            } else if (c == Esc) {
                advance(); // step past the backslash to inspect what it escapes
                if (!isEmpty() && isNewline(current())) {
                    // Not a valid escape sequence. This is treated as the end of the CSS identifier.
                    reader.unconsume(); // put the backslash back on the queue
                    break;
                } else {
                    consumeCssEscapeSequenceInto(out);
                }
            } else {
                // any other character ends the identifier and stays on the queue
                break;
            }
        }
        return StringUtil.releaseBuilder(out);
    }
442
443    private void consumeCssEscapeSequenceInto(StringBuilder out) {
444        if (isEmpty()) {
445            out.append(Replacement);
446            return;
447        }
448
449        char firstEscaped = consume();
450        if (!StringUtil.isHexDigit(firstEscaped)) {
451            out.append(firstEscaped);
452        } else {
453            reader.unconsume(); // put back the first hex digit
454            String hexString = reader.consumeMatching(StringUtil::isHexDigit, 6); // consume up to 6 hex digits
455            int codePoint;
456            try {
457                codePoint = Integer.parseInt(hexString, 16);
458            } catch (NumberFormatException e) {
459                throw new IllegalArgumentException("Invalid escape sequence: " + hexString, e);
460            }
461            if (isValidCodePoint(codePoint)) {
462                out.appendCodePoint(codePoint);
463            } else {
464                out.append(Replacement);
465            }
466
467            if (!isEmpty()) {
468                char c = current();
469                if (c == '\r') {
470                    // Since there's currently no input preprocessing, check for CRLF here.
471                    // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
472                    advance();
473                    if (!isEmpty() && current() == '\n') advance();
474                } else if (c == ' ' || c == '\t' || isNewline(c)) {
475                    advance();
476                }
477            }
478        }
479    }
480
481    // statics below specifically for CSS identifiers:
482
483    // https://www.w3.org/TR/css-syntax-3/#non-ascii-code-point
484    private static boolean isNonAscii(char c) {
485        return c >= '\u0080';
486    }
487
    // https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
    // True for an underscore, an ASCII letter (per StringUtil.isAsciiLetter), or any non-ASCII char.
    private static boolean isIdentStart(char c) {
        return c == '_' || StringUtil.isAsciiLetter(c) || isNonAscii(c);
    }
492
    // https://www.w3.org/TR/css-syntax-3/#ident-code-point
    // True for a hyphen-minus, a digit (per StringUtil.isDigit), or any ident-start char.
    private static boolean isIdent(char c) {
        return c == Hyphen_Minus || StringUtil.isDigit(c) || isIdentStart(c);
    }
497
498    // https://www.w3.org/TR/css-syntax-3/#newline
499    // Note: currently there's no preprocessing happening.
500    private static boolean isNewline(char c) {
501        return c == '\n' || c == '\r' || c == '\f';
502    }
503
504    // https://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
505    private static boolean isValidCodePoint(int codePoint) {
506        return codePoint != 0 && Character.isValidCodePoint(codePoint) && !Character.isSurrogate((char) codePoint);
507    }
508
    // NOTE(review): private and not referenced anywhere else in this class; looks like a leftover and a candidate
    // for removal.
    private static final char[] CssIdentifierChars = {'-', '_'};
510
511    private String consumeEscapedCssIdentifier(char... matches) {
512        StringBuilder sb = StringUtil.borrowBuilder();
513        while (!isEmpty()) {
514            char c = current();
515            if (c == Esc) {
516                advance();
517                if (!isEmpty()) sb.append(consume());
518                else break;
519            } else if (matchesCssIdentifier(matches)) {
520                sb.append(c);
521                advance();
522            } else {
523                break;
524            }
525        }
526        return StringUtil.releaseBuilder(sb);
527    }
528
    /** Tests if the head of the queue is a letter or digit (see matchesWord), or one of the supplied characters. */
    private boolean matchesCssIdentifier(char... matches) {
        return matchesWord() || reader.matchesAny(matches);
    }
532
    /**
     Consume and return whatever is left on the queue, by delegating to the reader's {@code consumeToEnd}.
     @return remainder of queue.
     */
    public String remainder() {
        return reader.consumeToEnd();
    }
540
    /** Returns the underlying reader's string representation. */
    @Override
    public String toString() {
        return reader.toString();
    }
545}