001package org.jsoup.select;
002
003import org.jsoup.internal.StringUtil;
004import org.jsoup.helper.Validate;
005import org.jsoup.parser.TokenQueue;
006
007import java.util.ArrayList;
008import java.util.List;
009import java.util.regex.Matcher;
010import java.util.regex.Pattern;
011
012import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
013import static org.jsoup.internal.Normalizer.normalize;
014
015/**
016 * Parses a CSS selector into an Evaluator tree.
017 */
018public class QueryParser {
019    private final static char[] Combinators = {',', '>', '+', '~', ' '};
020    private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};
021
022    private final TokenQueue tq;
023    private final String query;
024    private final List<Evaluator> evals = new ArrayList<>();
025
026    /**
027     * Create a new QueryParser.
028     * @param query CSS query
029     */
030    private QueryParser(String query) {
031        Validate.notEmpty(query);
032        query = query.trim();
033        this.query = query;
034        this.tq = new TokenQueue(query);
035    }
036
037    /**
038     * Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to
039     * parse it once and reuse the Evaluator.
040     * @param query CSS query
041     * @return Evaluator
042     * @see Selector selector query syntax
043     */
044    public static Evaluator parse(String query) {
045        try {
046            QueryParser p = new QueryParser(query);
047            return p.parse();
048        } catch (IllegalArgumentException e) {
049            throw new Selector.SelectorParseException(e.getMessage());
050        }
051    }
052
053    /**
054     * Parse the query
055     * @return Evaluator
056     */
057    Evaluator parse() {
058        tq.consumeWhitespace();
059
060        if (tq.matchesAny(Combinators)) { // if starts with a combinator, use root as elements
061            evals.add(new StructuralEvaluator.Root());
062            combinator(tq.consume());
063        } else {
064            evals.add(consumeEvaluator());
065        }
066
067        while (!tq.isEmpty()) {
068            // hierarchy and extras
069            boolean seenWhite = tq.consumeWhitespace();
070
071            if (tq.matchesAny(Combinators)) {
072                combinator(tq.consume());
073            } else if (seenWhite) {
074                combinator(' ');
075            } else { // E.class, E#id, E[attr] etc. AND
076                evals.add(consumeEvaluator()); // take next el, #. etc off queue
077            }
078        }
079
080        if (evals.size() == 1)
081            return evals.get(0);
082
083        return new CombiningEvaluator.And(evals);
084    }
085
086    private void combinator(char combinator) {
087        tq.consumeWhitespace();
088        String subQuery = consumeSubQuery(); // support multi > childs
089
090        Evaluator rootEval; // the new topmost evaluator
091        Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or.
092        Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator
093        boolean replaceRightMost = false;
094
095        if (evals.size() == 1) {
096            rootEval = currentEval = evals.get(0);
097            // make sure OR (,) has precedence:
098            if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') {
099                currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator();
100                assert currentEval != null; // rightMost signature can return null (if none set), but always will have one by this point
101                replaceRightMost = true;
102            }
103        }
104        else {
105            rootEval = currentEval = new CombiningEvaluator.And(evals);
106        }
107        evals.clear();
108
109        // for most combinators: change the current eval into an AND of the current eval and the new eval
110        switch (combinator) {
111            case '>':
112                ImmediateParentRun run = currentEval instanceof ImmediateParentRun ?
113                        (ImmediateParentRun) currentEval : new ImmediateParentRun(currentEval);
114                run.add(newEval);
115                currentEval = run;
116                break;
117            case ' ':
118                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.Ancestor(currentEval), newEval);
119                break;
120            case '+':
121                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.ImmediatePreviousSibling(currentEval), newEval);
122                break;
123            case '~':
124                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.PreviousSibling(currentEval), newEval);
125                break;
126            case ',':
127                CombiningEvaluator.Or or;
128                if (currentEval instanceof CombiningEvaluator.Or) {
129                    or = (CombiningEvaluator.Or) currentEval;
130                } else {
131                    or = new CombiningEvaluator.Or();
132                    or.add(currentEval);
133                }
134                or.add(newEval);
135                currentEval = or;
136                break;
137            default:
138                throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
139        }
140
141        if (replaceRightMost)
142            ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval);
143        else rootEval = currentEval;
144        evals.add(rootEval);
145    }
146
147    private String consumeSubQuery() {
148        StringBuilder sq = StringUtil.borrowBuilder();
149        boolean seenClause = false; // eat until we hit a combinator after eating something else
150        while (!tq.isEmpty()) {
151            if (tq.matchesAny(Combinators)) {
152                if (seenClause)
153                    break;
154                sq.append(tq.consume());
155                continue;
156            }
157            seenClause = true;
158            if (tq.matches("("))
159                sq.append("(").append(tq.chompBalanced('(', ')')).append(")");
160            else if (tq.matches("["))
161                sq.append("[").append(tq.chompBalanced('[', ']')).append("]");
162            else if (tq.matches("\\")) { // bounce over escapes
163                sq.append(tq.consume());
164                if (!tq.isEmpty()) sq.append(tq.consume());
165            } else
166                sq.append(tq.consume());
167        }
168        return StringUtil.releaseBuilder(sq);
169    }
170
171    private Evaluator consumeEvaluator() {
172        if (tq.matchChomp("#"))
173            return byId();
174        else if (tq.matchChomp("."))
175            return byClass();
176        else if (tq.matchesWord() || tq.matches("*|"))
177            return byTag();
178        else if (tq.matches("["))
179            return byAttribute();
180        else if (tq.matchChomp("*"))
181            return new Evaluator.AllElements();
182        else if (tq.matchChomp(":"))
183            return parsePseudoSelector();
184                else // unhandled
185            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
186    }
187
188    private Evaluator parsePseudoSelector() {
189        final String pseudo = tq.consumeCssIdentifier();
190        switch (pseudo) {
191            case "lt":
192                return new Evaluator.IndexLessThan(consumeIndex());
193            case "gt":
194                return new Evaluator.IndexGreaterThan(consumeIndex());
195            case "eq":
196                return new Evaluator.IndexEquals(consumeIndex());
197            case "has":
198                return has();
199            case "is":
200                return is();
201            case "contains":
202                return contains(false);
203            case "containsOwn":
204                return contains(true);
205            case "containsWholeText":
206                return containsWholeText(false);
207            case "containsWholeOwnText":
208                return containsWholeText(true);
209            case "containsData":
210                return containsData();
211            case "matches":
212                return matches(false);
213            case "matchesOwn":
214                return matches(true);
215            case "matchesWholeText":
216                return matchesWholeText(false);
217            case "matchesWholeOwnText":
218                return matchesWholeText(true);
219            case "not":
220                return not();
221            case "nth-child":
222                return cssNthChild(false, false);
223            case "nth-last-child":
224                return cssNthChild(true, false);
225            case "nth-of-type":
226                return cssNthChild(false, true);
227            case "nth-last-of-type":
228                return cssNthChild(true, true);
229            case "first-child":
230                return new Evaluator.IsFirstChild();
231            case "last-child":
232                return new Evaluator.IsLastChild();
233            case "first-of-type":
234                return new Evaluator.IsFirstOfType();
235            case "last-of-type":
236                return new Evaluator.IsLastOfType();
237            case "only-child":
238                return new Evaluator.IsOnlyChild();
239            case "only-of-type":
240                return new Evaluator.IsOnlyOfType();
241            case "empty":
242                return new Evaluator.IsEmpty();
243            case "root":
244                return new Evaluator.IsRoot();
245            case "matchText":
246                return new Evaluator.MatchText();
247            default:
248                throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
249        }
250    }
251
252    private Evaluator byId() {
253        String id = tq.consumeCssIdentifier();
254        Validate.notEmpty(id);
255        return new Evaluator.Id(id);
256    }
257
258    private Evaluator byClass() {
259        String className = tq.consumeCssIdentifier();
260        Validate.notEmpty(className);
261        return new Evaluator.Class(className.trim());
262    }
263
264    private Evaluator byTag() {
265        // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make
266        // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for
267        // consistency - both the selector and the element tag
268        String tagName = normalize(tq.consumeElementSelector());
269        Validate.notEmpty(tagName);
270
271        // namespaces:
272        if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName
273            String plainTag = tagName.substring(2); // strip *|
274            return new CombiningEvaluator.Or(
275                new Evaluator.Tag(plainTag),
276                new Evaluator.TagEndsWith(":" + plainTag)
277            );
278        } else if (tagName.endsWith("|*")) { // ns|*
279            String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns:
280            return new Evaluator.TagStartsWith(ns);
281        } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def"
282            tagName = tagName.replace("|", ":");
283        }
284
285        return new Evaluator.Tag(tagName);
286    }
287
288    private Evaluator byAttribute() {
289        TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue
290        String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
291        Validate.notEmpty(key);
292        cq.consumeWhitespace();
293        final Evaluator eval;
294
295        if (cq.isEmpty()) {
296            if (key.startsWith("^"))
297                eval = new Evaluator.AttributeStarting(key.substring(1));
298            else if (key.equals("*")) // any attribute
299                eval = new Evaluator.AttributeStarting("");
300            else
301                eval = new Evaluator.Attribute(key);
302        } else {
303            if (cq.matchChomp("="))
304                eval = new Evaluator.AttributeWithValue(key, cq.remainder());
305            else if (cq.matchChomp("!="))
306                eval = new Evaluator.AttributeWithValueNot(key, cq.remainder());
307            else if (cq.matchChomp("^="))
308                eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder());
309            else if (cq.matchChomp("$="))
310                eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder());
311            else if (cq.matchChomp("*="))
312                eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder());
313            else if (cq.matchChomp("~="))
314                eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()));
315            else
316                throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
317        }
318        return eval;
319    }
320
321    //pseudo selectors :first-child, :last-child, :nth-child, ...
322    private static final Pattern NthStepOffset = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
323    private static final Pattern NthOffset = Pattern.compile("([+-])?(\\d+)");
324
325    private Evaluator cssNthChild(boolean last, boolean ofType) {
326        String arg = normalize(consumeParens()); // arg is like "odd", or "-n+2", within nth-child(odd)
327        final int step, offset;
328        if ("odd".equals(arg)) {
329            step = 2;
330            offset = 1;
331        } else if ("even".equals(arg)) {
332            step = 2;
333            offset = 0;
334        } else {
335            Matcher stepOffsetM, stepM;
336            if ((stepOffsetM = NthStepOffset.matcher(arg)).matches()) {
337                if (stepOffsetM.group(3) != null) // has digits, like 3n+2 or -3n+2
338                    step = Integer.parseInt(stepOffsetM.group(1).replaceFirst("^\\+", ""));
339                else // no digits, might be like n+2, or -n+2. if group(2) == "-", it’s -1;
340                    step = "-".equals(stepOffsetM.group(2)) ? -1 : 1;
341                offset =
342                    stepOffsetM.group(4) != null ? Integer.parseInt(stepOffsetM.group(4).replaceFirst("^\\+", "")) : 0;
343            } else if ((stepM = NthOffset.matcher(arg)).matches()) {
344                step = 0;
345                offset = Integer.parseInt(stepM.group().replaceFirst("^\\+", ""));
346            } else {
347                throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg);
348            }
349        }
350
351        return ofType
352            ? (last ? new Evaluator.IsNthLastOfType(step, offset) : new Evaluator.IsNthOfType(step, offset))
353            : (last ? new Evaluator.IsNthLastChild(step, offset) : new Evaluator.IsNthChild(step, offset));
354    }
355
356    private String consumeParens() {
357        return tq.chompBalanced('(', ')');
358    }
359
360    private int consumeIndex() {
361        String index = consumeParens().trim();
362        Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric");
363        return Integer.parseInt(index);
364    }
365
366    // pseudo selector :has(el)
367    private Evaluator has() {
368        String subQuery = consumeParens();
369        Validate.notEmpty(subQuery, ":has(selector) sub-select must not be empty");
370        return new StructuralEvaluator.Has(parse(subQuery));
371    }
372
373    // psuedo selector :is()
374    private Evaluator is() {
375        String subQuery = consumeParens();
376        Validate.notEmpty(subQuery, ":is(selector) sub-select must not be empty");
377        return new StructuralEvaluator.Is(parse(subQuery));
378    }
379
380    // pseudo selector :contains(text), containsOwn(text)
381    private Evaluator contains(boolean own) {
382        String query = own ? ":containsOwn" : ":contains";
383        String searchText = TokenQueue.unescape(consumeParens());
384        Validate.notEmpty(searchText, query + "(text) query must not be empty");
385        return own
386            ? new Evaluator.ContainsOwnText(searchText)
387            : new Evaluator.ContainsText(searchText);
388    }
389
390    private Evaluator containsWholeText(boolean own) {
391        String query = own ? ":containsWholeOwnText" : ":containsWholeText";
392        String searchText = TokenQueue.unescape(consumeParens());
393        Validate.notEmpty(searchText, query + "(text) query must not be empty");
394        return own
395            ? new Evaluator.ContainsWholeOwnText(searchText)
396            : new Evaluator.ContainsWholeText(searchText);
397    }
398
399    // pseudo selector :containsData(data)
400    private Evaluator containsData() {
401        String searchText = TokenQueue.unescape(consumeParens());
402        Validate.notEmpty(searchText, ":containsData(text) query must not be empty");
403        return new Evaluator.ContainsData(searchText);
404    }
405
406    // :matches(regex), matchesOwn(regex)
407    private Evaluator matches(boolean own) {
408        String query = own ? ":matchesOwn" : ":matches";
409        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
410        Validate.notEmpty(regex, query + "(regex) query must not be empty");
411
412        return own
413            ? new Evaluator.MatchesOwn(Pattern.compile(regex))
414            : new Evaluator.Matches(Pattern.compile(regex));
415    }
416
417    // :matches(regex), matchesOwn(regex)
418    private Evaluator matchesWholeText(boolean own) {
419        String query = own ? ":matchesWholeOwnText" : ":matchesWholeText";
420        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
421        Validate.notEmpty(regex, query + "(regex) query must not be empty");
422
423        return own
424            ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex))
425            : new Evaluator.MatchesWholeText(Pattern.compile(regex));
426    }
427
428    // :not(selector)
429    private Evaluator not() {
430        String subQuery = consumeParens();
431        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");
432
433        return new StructuralEvaluator.Not(parse(subQuery));
434    }
435
436    @Override
437    public String toString() {
438        return query;
439    }
440}