001package org.jsoup.select;
002
003import org.jsoup.internal.StringUtil;
004import org.jsoup.helper.Validate;
005import org.jsoup.parser.TokenQueue;
006import org.jspecify.annotations.Nullable;
007
008import java.util.function.Function;
009import java.util.regex.Matcher;
010import java.util.regex.Pattern;
011
012import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
013import static org.jsoup.internal.Normalizer.normalize;
014
015/**
016 * Parses a CSS selector into an Evaluator tree.
017 */
018public class QueryParser {
019    private final static char[] Combinators = {'>', '+', '~'}; // ' ' is also a combinator, but found implicitly
020    private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};
021    private final static char[] SequenceEnders = {',', ')'};
022
023    private final TokenQueue tq;
024    private final String query;
025
026    /**
027     * Create a new QueryParser.
028     * @param query CSS query
029     */
030    private QueryParser(String query) {
031        Validate.notEmpty(query);
032        query = query.trim();
033        this.query = query;
034        this.tq = new TokenQueue(query);
035    }
036
037    /**
038     Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to
039     parse it once and reuse the Evaluator.
040
041     @param query CSS query
042     @return Evaluator
043     @see Selector selector query syntax
044     */
045    public static Evaluator parse(String query) {
046        try {
047            QueryParser p = new QueryParser(query);
048            return p.parse();
049        } catch (IllegalArgumentException e) {
050            throw new Selector.SelectorParseException(e.getMessage());
051        }
052    }
053
054    /**
055     Parse the query. We use this simplified expression of the grammar:
056     <pre>
057     SelectorGroup   ::= Selector (',' Selector)*
058     Selector        ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
059     SimpleSequence  ::= [ TypeSelector ] ( ID | Class | Attribute | Pseudo )*
060     Pseudo           ::= ':' Name [ '(' SelectorGroup ')' ]
061     Combinator      ::= S+         // descendant (whitespace)
062     | '>'       // child
063     | '+'       // adjacent sibling
064     | '~'       // general sibling
065     </pre>
066
067     See <a href="https://www.w3.org/TR/selectors-4/#grammar">selectors-4</a> for the real thing
068     */
069    Evaluator parse() {
070        Evaluator eval = parseSelectorGroup();
071        tq.consumeWhitespace();
072        if (!tq.isEmpty())
073            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
074        return eval;
075    }
076
077    Evaluator parseSelectorGroup() {
078        // SelectorGroup. Into an Or if > 1 Selector
079        Evaluator left = parseSelector();
080        while (tq.matchChomp(',')) {
081            Evaluator right = parseSelector();
082            left = or(left, right);
083        }
084        return left;
085    }
086
087    Evaluator parseSelector() {
088        // Selector ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
089        tq.consumeWhitespace();
090
091        Evaluator left;
092        if (tq.matchesAny(Combinators)) {
093            // e.g. query is "> div"; left side is root element
094            left = new StructuralEvaluator.Root();
095        } else {
096            left = parseSimpleSequence();
097        }
098
099        while (true) {
100            char combinator = 0;
101            if (tq.consumeWhitespace())
102                combinator = ' ';            // maybe descendant?
103            if (tq.matchesAny(Combinators)) // no, explicit
104                combinator = tq.consume();
105            else if (tq.matchesAny(SequenceEnders)) // , - space after simple like "foo , bar"; ) - close of :has()
106                break;
107
108            if (combinator != 0) {
109                Evaluator right = parseSimpleSequence();
110                left = combinator(left, combinator, right);
111            } else {
112                break;
113            }
114        }
115        return left;
116    }
117
118    Evaluator parseSimpleSequence() {
119        // SimpleSequence ::= TypeSelector? ( Hash | Class | Pseudo )*
120        Evaluator left = null;
121        tq.consumeWhitespace();
122
123        // one optional type selector
124        if (tq.matchesWord() || tq.matches("*|"))
125            left = byTag();
126        else if (tq.matchChomp('*'))
127            left = new Evaluator.AllElements();
128
129        // zero or more subclasses (#, ., [)
130        while(true) {
131            Evaluator right = parseSubclass();
132            if (right != null)
133                left = and(left, right);
134            else break; // no more simple tokens
135        }
136
137        if (left == null)
138            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
139        return left;
140    }
141
142    static Evaluator combinator(Evaluator left, char combinator, Evaluator right) {
143        switch (combinator) {
144            case '>':
145                ImmediateParentRun run = left instanceof ImmediateParentRun ?
146                    (ImmediateParentRun) left : new ImmediateParentRun(left);
147                run.add(right);
148                return run;
149            case ' ':
150                return and(new StructuralEvaluator.Ancestor(left), right);
151            case '+':
152                return and(new StructuralEvaluator.ImmediatePreviousSibling(left), right);
153            case '~':
154                return and(new StructuralEvaluator.PreviousSibling(left), right);
155            default:
156                throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
157        }
158    }
159
160    @Nullable Evaluator parseSubclass() {
161        //  Subclass: ID | Class | Attribute | Pseudo
162        if      (tq.matchChomp('#'))    return byId();
163        else if (tq.matchChomp('.'))    return byClass();
164        else if (tq.matches('['))       return byAttribute();
165        else if (tq.matchChomp(':'))    return parsePseudoSelector();
166        else                            return null;
167    }
168
169    /** Merge two evals into an Or. */
170    static Evaluator or(Evaluator left, Evaluator right) {
171        if (left instanceof CombiningEvaluator.Or) {
172            ((CombiningEvaluator.Or) left).add(right);
173            return left;
174        }
175        return new CombiningEvaluator.Or(left, right);
176    }
177
178    /** Merge two evals into an And. */
179    static Evaluator and(@Nullable Evaluator left, Evaluator right) {
180        if (left == null) return right;
181        if (left instanceof CombiningEvaluator.And) {
182            ((CombiningEvaluator.And) left).add(right);
183            return left;
184        }
185        return new CombiningEvaluator.And(left, right);
186    }
187
188    private Evaluator parsePseudoSelector() {
189        final String pseudo = tq.consumeCssIdentifier();
190        switch (pseudo) {
191            case "lt":
192                return new Evaluator.IndexLessThan(consumeIndex());
193            case "gt":
194                return new Evaluator.IndexGreaterThan(consumeIndex());
195            case "eq":
196                return new Evaluator.IndexEquals(consumeIndex());
197            case "has":
198                return has();
199            case "is":
200                return is();
201            case "contains":
202                return contains(false);
203            case "containsOwn":
204                return contains(true);
205            case "containsWholeText":
206                return containsWholeText(false);
207            case "containsWholeOwnText":
208                return containsWholeText(true);
209            case "containsData":
210                return containsData();
211            case "matches":
212                return matches(false);
213            case "matchesOwn":
214                return matches(true);
215            case "matchesWholeText":
216                return matchesWholeText(false);
217            case "matchesWholeOwnText":
218                return matchesWholeText(true);
219            case "not":
220                return not();
221            case "nth-child":
222                return cssNthChild(false, false);
223            case "nth-last-child":
224                return cssNthChild(true, false);
225            case "nth-of-type":
226                return cssNthChild(false, true);
227            case "nth-last-of-type":
228                return cssNthChild(true, true);
229            case "first-child":
230                return new Evaluator.IsFirstChild();
231            case "last-child":
232                return new Evaluator.IsLastChild();
233            case "first-of-type":
234                return new Evaluator.IsFirstOfType();
235            case "last-of-type":
236                return new Evaluator.IsLastOfType();
237            case "only-child":
238                return new Evaluator.IsOnlyChild();
239            case "only-of-type":
240                return new Evaluator.IsOnlyOfType();
241            case "empty":
242                return new Evaluator.IsEmpty();
243            case "root":
244                return new Evaluator.IsRoot();
245            case "matchText":
246                return new Evaluator.MatchText();
247            default:
248                throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
249        }
250    }
251
252    private Evaluator byId() {
253        String id = tq.consumeCssIdentifier();
254        Validate.notEmpty(id);
255        return new Evaluator.Id(id);
256    }
257
258    private Evaluator byClass() {
259        String className = tq.consumeCssIdentifier();
260        Validate.notEmpty(className);
261        return new Evaluator.Class(className.trim());
262    }
263
264    private Evaluator byTag() {
265        // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make
266        // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for
267        // consistency - both the selector and the element tag
268        String tagName = normalize(tq.consumeElementSelector());
269        Validate.notEmpty(tagName);
270
271        // namespaces:
272        if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName
273            String plainTag = tagName.substring(2); // strip *|
274            return new CombiningEvaluator.Or(
275                new Evaluator.Tag(plainTag),
276                new Evaluator.TagEndsWith(":" + plainTag)
277            );
278        } else if (tagName.endsWith("|*")) { // ns|*
279            String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns:
280            return new Evaluator.TagStartsWith(ns);
281        } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def"
282            tagName = tagName.replace("|", ":");
283        }
284
285        return new Evaluator.Tag(tagName);
286    }
287
288    private Evaluator byAttribute() {
289        TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue
290        String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
291        Validate.notEmpty(key);
292        cq.consumeWhitespace();
293        final Evaluator eval;
294
295        if (cq.isEmpty()) {
296            if (key.startsWith("^"))
297                eval = new Evaluator.AttributeStarting(key.substring(1));
298            else if (key.equals("*")) // any attribute
299                eval = new Evaluator.AttributeStarting("");
300            else
301                eval = new Evaluator.Attribute(key);
302        } else {
303            if (cq.matchChomp('='))
304                eval = new Evaluator.AttributeWithValue(key, cq.remainder());
305            else if (cq.matchChomp("!="))
306                eval = new Evaluator.AttributeWithValueNot(key, cq.remainder());
307            else if (cq.matchChomp("^="))
308                eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder());
309            else if (cq.matchChomp("$="))
310                eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder());
311            else if (cq.matchChomp("*="))
312                eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder());
313            else if (cq.matchChomp("~="))
314                eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()));
315            else
316                throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
317        }
318        return eval;
319    }
320
321    //pseudo selectors :first-child, :last-child, :nth-child, ...
322    private static final Pattern NthStepOffset = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
323    private static final Pattern NthOffset = Pattern.compile("([+-])?(\\d+)");
324
325    private Evaluator cssNthChild(boolean last, boolean ofType) {
326        String arg = normalize(consumeParens()); // arg is like "odd", or "-n+2", within nth-child(odd)
327        final int step, offset;
328        if ("odd".equals(arg)) {
329            step = 2;
330            offset = 1;
331        } else if ("even".equals(arg)) {
332            step = 2;
333            offset = 0;
334        } else {
335            Matcher stepOffsetM, stepM;
336            if ((stepOffsetM = NthStepOffset.matcher(arg)).matches()) {
337                if (stepOffsetM.group(3) != null) // has digits, like 3n+2 or -3n+2
338                    step = Integer.parseInt(stepOffsetM.group(1).replaceFirst("^\\+", ""));
339                else // no digits, might be like n+2, or -n+2. if group(2) == "-", it’s -1;
340                    step = "-".equals(stepOffsetM.group(2)) ? -1 : 1;
341                offset =
342                    stepOffsetM.group(4) != null ? Integer.parseInt(stepOffsetM.group(4).replaceFirst("^\\+", "")) : 0;
343            } else if ((stepM = NthOffset.matcher(arg)).matches()) {
344                step = 0;
345                offset = Integer.parseInt(stepM.group().replaceFirst("^\\+", ""));
346            } else {
347                throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg);
348            }
349        }
350
351        return ofType
352            ? (last ? new Evaluator.IsNthLastOfType(step, offset) : new Evaluator.IsNthOfType(step, offset))
353            : (last ? new Evaluator.IsNthLastChild(step, offset) : new Evaluator.IsNthChild(step, offset));
354    }
355
356    private String consumeParens() {
357        return tq.chompBalanced('(', ')');
358    }
359
360    private int consumeIndex() {
361        String index = consumeParens().trim();
362        Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric");
363        return Integer.parseInt(index);
364    }
365
366    // pseudo selector :has(el)
367    private Evaluator has() {
368        return parseNested(StructuralEvaluator.Has::new, ":has() must have a selector");
369    }
370
371    // pseudo selector :is()
372    private Evaluator is() {
373        return parseNested(StructuralEvaluator.Is::new, ":is() must have a selector");
374    }
375
376    private Evaluator parseNested(Function<Evaluator, Evaluator> func, String err) {
377        Validate.isTrue(tq.matchChomp('('), err);
378        Evaluator eval = parseSelectorGroup();
379        Validate.isTrue(tq.matchChomp(')'), err);
380        return func.apply(eval);
381    }
382
383    // pseudo selector :contains(text), containsOwn(text)
384    private Evaluator contains(boolean own) {
385        String query = own ? ":containsOwn" : ":contains";
386        String searchText = TokenQueue.unescape(consumeParens());
387        Validate.notEmpty(searchText, query + "(text) query must not be empty");
388        return own
389            ? new Evaluator.ContainsOwnText(searchText)
390            : new Evaluator.ContainsText(searchText);
391    }
392
393    private Evaluator containsWholeText(boolean own) {
394        String query = own ? ":containsWholeOwnText" : ":containsWholeText";
395        String searchText = TokenQueue.unescape(consumeParens());
396        Validate.notEmpty(searchText, query + "(text) query must not be empty");
397        return own
398            ? new Evaluator.ContainsWholeOwnText(searchText)
399            : new Evaluator.ContainsWholeText(searchText);
400    }
401
402    // pseudo selector :containsData(data)
403    private Evaluator containsData() {
404        String searchText = TokenQueue.unescape(consumeParens());
405        Validate.notEmpty(searchText, ":containsData(text) query must not be empty");
406        return new Evaluator.ContainsData(searchText);
407    }
408
409    // :matches(regex), matchesOwn(regex)
410    private Evaluator matches(boolean own) {
411        String query = own ? ":matchesOwn" : ":matches";
412        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
413        Validate.notEmpty(regex, query + "(regex) query must not be empty");
414
415        return own
416            ? new Evaluator.MatchesOwn(Pattern.compile(regex))
417            : new Evaluator.Matches(Pattern.compile(regex));
418    }
419
420    // :matches(regex), matchesOwn(regex)
421    private Evaluator matchesWholeText(boolean own) {
422        String query = own ? ":matchesWholeOwnText" : ":matchesWholeText";
423        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
424        Validate.notEmpty(regex, query + "(regex) query must not be empty");
425
426        return own
427            ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex))
428            : new Evaluator.MatchesWholeText(Pattern.compile(regex));
429    }
430
431    // :not(selector)
432    private Evaluator not() {
433        String subQuery = consumeParens();
434        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");
435
436        return new StructuralEvaluator.Not(parse(subQuery));
437    }
438
439    @Override
440    public String toString() {
441        return query;
442    }
443}