001package org.jsoup.select;
002
003import org.jsoup.internal.StringUtil;
004import org.jsoup.helper.Validate;
005import org.jsoup.parser.TokenQueue;
006
007import java.util.ArrayList;
008import java.util.List;
009import java.util.regex.Matcher;
010import java.util.regex.Pattern;
011
012import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
013import static org.jsoup.internal.Normalizer.normalize;
014
015/**
016 * Parses a CSS selector into an Evaluator tree.
017 */
018public class QueryParser {
019    private final static char[] Combinators = {',', '>', '+', '~', ' '};
020    private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};
021
022    private final TokenQueue tq;
023    private final String query;
024    private final List<Evaluator> evals = new ArrayList<>();
025
026    /**
027     * Create a new QueryParser.
028     * @param query CSS query
029     */
030    private QueryParser(String query) {
031        Validate.notEmpty(query);
032        query = query.trim();
033        this.query = query;
034        this.tq = new TokenQueue(query);
035    }
036
037    /**
038     * Parse a CSS query into an Evaluator.
039     * @param query CSS query
040     * @return Evaluator
041     * @see Selector selector query syntax
042     */
043    public static Evaluator parse(String query) {
044        try {
045            QueryParser p = new QueryParser(query);
046            return p.parse();
047        } catch (IllegalArgumentException e) {
048            throw new Selector.SelectorParseException(e.getMessage());
049        }
050    }
051
052    /**
053     * Parse the query
054     * @return Evaluator
055     */
056    Evaluator parse() {
057        tq.consumeWhitespace();
058
059        if (tq.matchesAny(Combinators)) { // if starts with a combinator, use root as elements
060            evals.add(new StructuralEvaluator.Root());
061            combinator(tq.consume());
062        } else {
063            evals.add(consumeEvaluator());
064        }
065
066        while (!tq.isEmpty()) {
067            // hierarchy and extras
068            boolean seenWhite = tq.consumeWhitespace();
069
070            if (tq.matchesAny(Combinators)) {
071                combinator(tq.consume());
072            } else if (seenWhite) {
073                combinator(' ');
074            } else { // E.class, E#id, E[attr] etc. AND
075                evals.add(consumeEvaluator()); // take next el, #. etc off queue
076            }
077        }
078
079        if (evals.size() == 1)
080            return evals.get(0);
081
082        return new CombiningEvaluator.And(evals);
083    }
084
085    private void combinator(char combinator) {
086        tq.consumeWhitespace();
087        String subQuery = consumeSubQuery(); // support multi > childs
088
089        Evaluator rootEval; // the new topmost evaluator
090        Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or.
091        Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator
092        boolean replaceRightMost = false;
093
094        if (evals.size() == 1) {
095            rootEval = currentEval = evals.get(0);
096            // make sure OR (,) has precedence:
097            if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') {
098                currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator();
099                assert currentEval != null; // rightMost signature can return null (if none set), but always will have one by this point
100                replaceRightMost = true;
101            }
102        }
103        else {
104            rootEval = currentEval = new CombiningEvaluator.And(evals);
105        }
106        evals.clear();
107
108        // for most combinators: change the current eval into an AND of the current eval and the new eval
109        switch (combinator) {
110            case '>':
111                ImmediateParentRun run = currentEval instanceof ImmediateParentRun ?
112                        (ImmediateParentRun) currentEval : new ImmediateParentRun(currentEval);
113                run.add(newEval);
114                currentEval = run;
115                break;
116            case ' ':
117                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.Parent(currentEval), newEval);
118                break;
119            case '+':
120                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.ImmediatePreviousSibling(currentEval), newEval);
121                break;
122            case '~':
123                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.PreviousSibling(currentEval), newEval);
124                break;
125            case ',':
126                CombiningEvaluator.Or or;
127                if (currentEval instanceof CombiningEvaluator.Or) {
128                    or = (CombiningEvaluator.Or) currentEval;
129                } else {
130                    or = new CombiningEvaluator.Or();
131                    or.add(currentEval);
132                }
133                or.add(newEval);
134                currentEval = or;
135                break;
136            default:
137                throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
138        }
139
140        if (replaceRightMost)
141            ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval);
142        else rootEval = currentEval;
143        evals.add(rootEval);
144    }
145
146    private String consumeSubQuery() {
147        StringBuilder sq = StringUtil.borrowBuilder();
148        boolean seenClause = false; // eat until we hit a combinator after eating something else
149        while (!tq.isEmpty()) {
150            if (tq.matchesAny(Combinators)) {
151                if (seenClause)
152                    break;
153                sq.append(tq.consume());
154                continue;
155            }
156            seenClause = true;
157            if (tq.matches("("))
158                sq.append("(").append(tq.chompBalanced('(', ')')).append(")");
159            else if (tq.matches("["))
160                sq.append("[").append(tq.chompBalanced('[', ']')).append("]");
161            else if (tq.matches("\\")) { // bounce over escapes
162                sq.append(tq.consume());
163                if (!tq.isEmpty()) sq.append(tq.consume());
164            } else
165                sq.append(tq.consume());
166        }
167        return StringUtil.releaseBuilder(sq);
168    }
169
170    private Evaluator consumeEvaluator() {
171        if (tq.matchChomp("#"))
172            return byId();
173        else if (tq.matchChomp("."))
174            return byClass();
175        else if (tq.matchesWord() || tq.matches("*|"))
176            return byTag();
177        else if (tq.matches("["))
178            return byAttribute();
179        else if (tq.matchChomp("*"))
180            return new Evaluator.AllElements();
181        else if (tq.matchChomp(":"))
182            return parsePseudoSelector();
183                else // unhandled
184            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
185    }
186
187    private Evaluator parsePseudoSelector() {
188        final String pseudo = tq.consumeCssIdentifier();
189        switch (pseudo) {
190            case "lt":
191                return new Evaluator.IndexLessThan(consumeIndex());
192            case "gt":
193                return new Evaluator.IndexGreaterThan(consumeIndex());
194            case "eq":
195                return new Evaluator.IndexEquals(consumeIndex());
196            case "has":
197                return has();
198            case "is":
199                return is();
200            case "contains":
201                return contains(false);
202            case "containsOwn":
203                return contains(true);
204            case "containsWholeText":
205                return containsWholeText(false);
206            case "containsWholeOwnText":
207                return containsWholeText(true);
208            case "containsData":
209                return containsData();
210            case "matches":
211                return matches(false);
212            case "matchesOwn":
213                return matches(true);
214            case "matchesWholeText":
215                return matchesWholeText(false);
216            case "matchesWholeOwnText":
217                return matchesWholeText(true);
218            case "not":
219                return not();
220            case "nth-child":
221                return cssNthChild(false, false);
222            case "nth-last-child":
223                return cssNthChild(true, false);
224            case "nth-of-type":
225                return cssNthChild(false, true);
226            case "nth-last-of-type":
227                return cssNthChild(true, true);
228            case "first-child":
229                return new Evaluator.IsFirstChild();
230            case "last-child":
231                return new Evaluator.IsLastChild();
232            case "first-of-type":
233                return new Evaluator.IsFirstOfType();
234            case "last-of-type":
235                return new Evaluator.IsLastOfType();
236            case "only-child":
237                return new Evaluator.IsOnlyChild();
238            case "only-of-type":
239                return new Evaluator.IsOnlyOfType();
240            case "empty":
241                return new Evaluator.IsEmpty();
242            case "root":
243                return new Evaluator.IsRoot();
244            case "matchText":
245                return new Evaluator.MatchText();
246            default:
247                throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
248        }
249    }
250
251    private Evaluator byId() {
252        String id = tq.consumeCssIdentifier();
253        Validate.notEmpty(id);
254        return new Evaluator.Id(id);
255    }
256
257    private Evaluator byClass() {
258        String className = tq.consumeCssIdentifier();
259        Validate.notEmpty(className);
260        return new Evaluator.Class(className.trim());
261    }
262
263    private Evaluator byTag() {
264        // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make
265        // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for
266        // consistency - both the selector and the element tag
267        String tagName = normalize(tq.consumeElementSelector());
268        Validate.notEmpty(tagName);
269
270        // namespaces:
271        if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName
272            String plainTag = tagName.substring(2); // strip *|
273            return new CombiningEvaluator.Or(
274                new Evaluator.Tag(plainTag),
275                new Evaluator.TagEndsWith(":" + plainTag)
276            );
277        } else if (tagName.endsWith("|*")) { // ns|*
278            String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns:
279            return new Evaluator.TagStartsWith(ns);
280        } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def"
281            tagName = tagName.replace("|", ":");
282        }
283
284        return new Evaluator.Tag(tagName);
285    }
286
287    private Evaluator byAttribute() {
288        TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue
289        String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
290        Validate.notEmpty(key);
291        cq.consumeWhitespace();
292        final Evaluator eval;
293
294        if (cq.isEmpty()) {
295            if (key.startsWith("^"))
296                eval = new Evaluator.AttributeStarting(key.substring(1));
297            else if (key.equals("*")) // any attribute
298                eval = new Evaluator.AttributeStarting("");
299            else
300                eval = new Evaluator.Attribute(key);
301        } else {
302            if (cq.matchChomp("="))
303                eval = new Evaluator.AttributeWithValue(key, cq.remainder());
304            else if (cq.matchChomp("!="))
305                eval = new Evaluator.AttributeWithValueNot(key, cq.remainder());
306            else if (cq.matchChomp("^="))
307                eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder());
308            else if (cq.matchChomp("$="))
309                eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder());
310            else if (cq.matchChomp("*="))
311                eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder());
312            else if (cq.matchChomp("~="))
313                eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()));
314            else
315                throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
316        }
317        return eval;
318    }
319
320    //pseudo selectors :first-child, :last-child, :nth-child, ...
321    private static final Pattern NTH_AB = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
322    private static final Pattern NTH_B  = Pattern.compile("([+-])?(\\d+)");
323
324        private Evaluator cssNthChild(boolean backwards, boolean ofType) {
325                String arg = normalize(consumeParens());
326                Matcher mAB = NTH_AB.matcher(arg);
327                Matcher mB = NTH_B.matcher(arg);
328                final int a, b;
329                if ("odd".equals(arg)) {
330                        a = 2;
331                        b = 1;
332                } else if ("even".equals(arg)) {
333                        a = 2;
334                        b = 0;
335                } else if (mAB.matches()) {
336                        a = mAB.group(3) != null ? Integer.parseInt(mAB.group(1).replaceFirst("^\\+", "")) : 1;
337                        b = mAB.group(4) != null ? Integer.parseInt(mAB.group(4).replaceFirst("^\\+", "")) : 0;
338                } else if (mB.matches()) {
339                        a = 0;
340                        b = Integer.parseInt(mB.group().replaceFirst("^\\+", ""));
341                } else {
342                        throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg);
343                }
344
345        final Evaluator eval;
346                if (ofType)
347                        if (backwards)
348                                eval = new Evaluator.IsNthLastOfType(a, b);
349                        else
350                                eval = new Evaluator.IsNthOfType(a, b);
351                else {
352                        if (backwards)
353                                eval = (new Evaluator.IsNthLastChild(a, b));
354                        else
355                                eval = new Evaluator.IsNthChild(a, b);
356                }
357        return eval;
358        }
359
360    private String consumeParens() {
361        return tq.chompBalanced('(', ')');
362    }
363
364    private int consumeIndex() {
365        String index = consumeParens().trim();
366        Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric");
367        return Integer.parseInt(index);
368    }
369
370    // pseudo selector :has(el)
371    private Evaluator has() {
372        String subQuery = consumeParens();
373        Validate.notEmpty(subQuery, ":has(selector) sub-select must not be empty");
374        return new StructuralEvaluator.Has(parse(subQuery));
375    }
376
377    // psuedo selector :is()
378    private Evaluator is() {
379        String subQuery = consumeParens();
380        Validate.notEmpty(subQuery, ":is(selector) sub-select must not be empty");
381        return new StructuralEvaluator.Is(parse(subQuery));
382    }
383
384    // pseudo selector :contains(text), containsOwn(text)
385    private Evaluator contains(boolean own) {
386        String query = own ? ":containsOwn" : ":contains";
387        String searchText = TokenQueue.unescape(consumeParens());
388        Validate.notEmpty(searchText, query + "(text) query must not be empty");
389        return own
390            ? new Evaluator.ContainsOwnText(searchText)
391            : new Evaluator.ContainsText(searchText);
392    }
393
394    private Evaluator containsWholeText(boolean own) {
395        String query = own ? ":containsWholeOwnText" : ":containsWholeText";
396        String searchText = TokenQueue.unescape(consumeParens());
397        Validate.notEmpty(searchText, query + "(text) query must not be empty");
398        return own
399            ? new Evaluator.ContainsWholeOwnText(searchText)
400            : new Evaluator.ContainsWholeText(searchText);
401    }
402
403    // pseudo selector :containsData(data)
404    private Evaluator containsData() {
405        String searchText = TokenQueue.unescape(consumeParens());
406        Validate.notEmpty(searchText, ":containsData(text) query must not be empty");
407        return new Evaluator.ContainsData(searchText);
408    }
409
410    // :matches(regex), matchesOwn(regex)
411    private Evaluator matches(boolean own) {
412        String query = own ? ":matchesOwn" : ":matches";
413        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
414        Validate.notEmpty(regex, query + "(regex) query must not be empty");
415
416        return own
417            ? new Evaluator.MatchesOwn(Pattern.compile(regex))
418            : new Evaluator.Matches(Pattern.compile(regex));
419    }
420
421    // :matches(regex), matchesOwn(regex)
422    private Evaluator matchesWholeText(boolean own) {
423        String query = own ? ":matchesWholeOwnText" : ":matchesWholeText";
424        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
425        Validate.notEmpty(regex, query + "(regex) query must not be empty");
426
427        return own
428            ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex))
429            : new Evaluator.MatchesWholeText(Pattern.compile(regex));
430    }
431
432    // :not(selector)
433    private Evaluator not() {
434        String subQuery = consumeParens();
435        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");
436
437        return new StructuralEvaluator.Not(parse(subQuery));
438    }
439
440    @Override
441    public String toString() {
442        return query;
443    }
444}