package org.jsoup.select;

import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.parser.TokenQueue;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
import static org.jsoup.internal.Normalizer.normalize;

/**
 * Parses a CSS selector into an Evaluator tree.
 * <p>
 * A parser instance is single-use: it wraps one query string in a {@link TokenQueue} and accumulates the evaluators
 * parsed so far in {@code evals}. Sub-queries (the right hand side of a combinator, or the argument of
 * {@code :has()}, {@code :is()}, {@code :not()}) are parsed by recursively calling {@link #parse(String)}.
 */
public class QueryParser {
    /** Characters that separate one simple selector sequence from the next. */
    private final static char[] Combinators = {',', '>', '+', '~', ' '};
    /** Attribute comparison operators; the attribute key is everything before the first of these. */
    private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};

    private final TokenQueue tq;
    private final String query;
    private final List<Evaluator> evals = new ArrayList<>();

    /**
     * Create a new QueryParser.
     * @param query CSS query; must pass {@code Validate.notEmpty}. It is trimmed before being queued.
     */
    private QueryParser(String query) {
        Validate.notEmpty(query);
        query = query.trim();
        this.query = query;
        this.tq = new TokenQueue(query);
    }

    /**
     * Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to
     * parse it once and reuse the Evaluator.
     * @param query CSS query
     * @return Evaluator
     * @throws Selector.SelectorParseException if the query cannot be parsed. Any IllegalArgumentException raised
     * while parsing (validation failures, bad numbers, bad regular expressions) is rethrown as this type, with the
     * same message and with the original exception attached as the cause.
     * @see Selector selector query syntax
     */
    public static Evaluator parse(String query) {
        try {
            QueryParser p = new QueryParser(query);
            return p.parse();
        } catch (IllegalArgumentException e) {
            Selector.SelectorParseException parseException = new Selector.SelectorParseException(e.getMessage());
            // keep the original stack trace for diagnosis; assumes the constructor above does not itself set a cause
            parseException.initCause(e);
            throw parseException;
        }
    }

    /**
     * Parse the query
     * @return Evaluator: the single parsed evaluator, or an AND of all accumulated evaluators if there are several
     */
    Evaluator parse() {
        tq.consumeWhitespace();

        if (tq.matchesAny(Combinators)) { // if starts with a combinator, use root as elements
            evals.add(new StructuralEvaluator.Root());
            combinator(tq.consume());
        } else {
            evals.add(consumeEvaluator());
        }

        while (!tq.isEmpty()) {
            // hierarchy and extras
            boolean seenWhite = tq.consumeWhitespace();

            if (tq.matchesAny(Combinators)) {
                combinator(tq.consume());
            } else if (seenWhite) {
                // whitespace followed by a non-combinator: treat the whitespace as the descendant combinator
                combinator(' ');
            } else { // E.class, E#id, E[attr] etc. AND
                evals.add(consumeEvaluator()); // take next el, #. etc off queue
            }
        }

        if (evals.size() == 1)
            return evals.get(0);

        return new CombiningEvaluator.And(evals);
    }

    /**
     * Consume the sub-query that follows a combinator, parse it, and merge it with the evaluators collected so far.
     * On return, {@code evals} holds exactly one evaluator: the new root.
     * @param combinator one of {@code , > + ~} or a space
     * @throws Selector.SelectorParseException if the combinator is not recognised
     */
    private void combinator(char combinator) {
        tq.consumeWhitespace();
        String subQuery = consumeSubQuery(); // support multi > childs

        Evaluator rootEval; // the new topmost evaluator
        Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or.
        Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator
        boolean replaceRightMost = false;

        if (evals.size() == 1) {
            rootEval = currentEval = evals.get(0);
            // make sure OR (,) has precedence:
            if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') {
                currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator();
                assert currentEval != null; // rightMost signature can return null (if none set), but always will have one by this point
                replaceRightMost = true;
            }
        }
        else {
            rootEval = currentEval = new CombiningEvaluator.And(evals);
        }
        evals.clear();

        // for most combinators: change the current eval into an AND of the current eval and the new eval
        switch (combinator) {
            case '>':
                // consecutive child combinators are collected into one ImmediateParentRun
                ImmediateParentRun run = currentEval instanceof ImmediateParentRun ?
                    (ImmediateParentRun) currentEval : new ImmediateParentRun(currentEval);
                run.add(newEval);
                currentEval = run;
                break;
            case ' ':
                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.Ancestor(currentEval), newEval);
                break;
            case '+':
                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.ImmediatePreviousSibling(currentEval), newEval);
                break;
            case '~':
                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.PreviousSibling(currentEval), newEval);
                break;
            case ',':
                CombiningEvaluator.Or or;
                if (currentEval instanceof CombiningEvaluator.Or) {
                    or = (CombiningEvaluator.Or) currentEval;
                } else {
                    or = new CombiningEvaluator.Or();
                    or.add(currentEval);
                }
                or.add(newEval);
                currentEval = or;
                break;
            default:
                throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
        }

        if (replaceRightMost)
            ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval);
        else rootEval = currentEval;
        evals.add(rootEval);
    }

    /**
     * Consume the text of the next sub-query from the queue: leading combinator characters are kept, then everything
     * up to (but not including) the next combinator that follows a non-combinator token. Parenthesised and bracketed
     * groups are taken whole, and a backslash escape is taken together with the character after it, so combinator
     * characters inside those do not end the sub-query.
     * @return the sub-query text
     */
    private String consumeSubQuery() {
        StringBuilder sq = StringUtil.borrowBuilder();
        boolean seenClause = false; // eat until we hit a combinator after eating something else
        while (!tq.isEmpty()) {
            if (tq.matchesAny(Combinators)) {
                if (seenClause)
                    break;
                sq.append(tq.consume());
                continue;
            }
            seenClause = true;
            if (tq.matches("("))
                sq.append("(").append(tq.chompBalanced('(', ')')).append(")");
            else if (tq.matches("["))
                sq.append("[").append(tq.chompBalanced('[', ']')).append("]");
            else if (tq.matches("\\")) { // bounce over escapes
                sq.append(tq.consume());
                if (!tq.isEmpty()) sq.append(tq.consume());
            } else
                sq.append(tq.consume());
        }
        return StringUtil.releaseBuilder(sq);
    }

    /**
     * Consume one simple selector (id, class, tag, attribute, universal, or pseudo selector) from the queue.
     * @return the evaluator for that selector
     * @throws Selector.SelectorParseException if the next token does not start a known selector
     */
    private Evaluator consumeEvaluator() {
        if (tq.matchChomp("#"))
            return byId();
        else if (tq.matchChomp("."))
            return byClass();
        else if (tq.matchesWord() || tq.matches("*|"))
            return byTag();
        else if (tq.matches("["))
            return byAttribute();
        else if (tq.matchChomp("*"))
            return new Evaluator.AllElements();
        else if (tq.matchChomp(":"))
            return parsePseudoSelector();
        else // unhandled
            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
    }

    /**
     * Parse a pseudo selector; the leading {@code :} has already been consumed by the caller.
     * @return the evaluator for the pseudo selector
     * @throws Selector.SelectorParseException if the pseudo selector name is not recognised
     */
    private Evaluator parsePseudoSelector() {
        final String pseudo = tq.consumeCssIdentifier();
        switch (pseudo) {
            case "lt":
                return new Evaluator.IndexLessThan(consumeIndex());
            case "gt":
                return new Evaluator.IndexGreaterThan(consumeIndex());
            case "eq":
                return new Evaluator.IndexEquals(consumeIndex());
            case "has":
                return has();
            case "is":
                return is();
            case "contains":
                return contains(false);
            case "containsOwn":
                return contains(true);
            case "containsWholeText":
                return containsWholeText(false);
            case "containsWholeOwnText":
                return containsWholeText(true);
            case "containsData":
                return containsData();
            case "matches":
                return matches(false);
            case "matchesOwn":
                return matches(true);
            case "matchesWholeText":
                return matchesWholeText(false);
            case "matchesWholeOwnText":
                return matchesWholeText(true);
            case "not":
                return not();
            case "nth-child":
                return cssNthChild(false, false);
            case "nth-last-child":
                return cssNthChild(true, false);
            case "nth-of-type":
                return cssNthChild(false, true);
            case "nth-last-of-type":
                return cssNthChild(true, true);
            case "first-child":
                return new Evaluator.IsFirstChild();
            case "last-child":
                return new Evaluator.IsLastChild();
            case "first-of-type":
                return new Evaluator.IsFirstOfType();
            case "last-of-type":
                return new Evaluator.IsLastOfType();
            case "only-child":
                return new Evaluator.IsOnlyChild();
            case "only-of-type":
                return new Evaluator.IsOnlyOfType();
            case "empty":
                return new Evaluator.IsEmpty();
            case "root":
                return new Evaluator.IsRoot();
            case "matchText":
                return new Evaluator.MatchText();
            default:
                throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
        }
    }

    // #id; the '#' has already been consumed
    private Evaluator byId() {
        String id = tq.consumeCssIdentifier();
        Validate.notEmpty(id);
        return new Evaluator.Id(id);
    }

    // .class; the '.' has already been consumed
    private Evaluator byClass() {
        String className = tq.consumeCssIdentifier();
        Validate.notEmpty(className);
        return new Evaluator.Class(className.trim());
    }

    // tag, ns|tag, *|tag, ns|*
    private Evaluator byTag() {
        // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make
        // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for
        // consistency - both the selector and the element tag
        String tagName = normalize(tq.consumeElementSelector());
        Validate.notEmpty(tagName);

        // namespaces:
        if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName
            String plainTag = tagName.substring(2); // strip *|
            return new CombiningEvaluator.Or(
                new Evaluator.Tag(plainTag),
                new Evaluator.TagEndsWith(":" + plainTag)
            );
        } else if (tagName.endsWith("|*")) { // ns|*
            String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns:
            return new Evaluator.TagStartsWith(ns);
        } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def"
            tagName = tagName.replace("|", ":");
        }

        return new Evaluator.Tag(tagName);
    }

    /**
     * Parse an attribute selector: {@code [attr]}, {@code [^attrPrefix]}, {@code [*]}, or {@code [attr OP value]}
     * where OP is one of {@code = != ^= $= *= ~=}.
     * @return the attribute evaluator
     * @throws Selector.SelectorParseException if text follows the key but does not start with a known operator
     */
    private Evaluator byAttribute() {
        TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue
        String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
        Validate.notEmpty(key);
        cq.consumeWhitespace();
        final Evaluator eval;

        if (cq.isEmpty()) {
            // no operator / value: presence checks only
            if (key.startsWith("^"))
                eval = new Evaluator.AttributeStarting(key.substring(1));
            else if (key.equals("*")) // any attribute
                eval = new Evaluator.AttributeStarting("");
            else
                eval = new Evaluator.Attribute(key);
        } else {
            if (cq.matchChomp("="))
                eval = new Evaluator.AttributeWithValue(key, cq.remainder());
            else if (cq.matchChomp("!="))
                eval = new Evaluator.AttributeWithValueNot(key, cq.remainder());
            else if (cq.matchChomp("^="))
                eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder());
            else if (cq.matchChomp("$="))
                eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder());
            else if (cq.matchChomp("*="))
                eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder());
            else if (cq.matchChomp("~="))
                eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()));
            else
                throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
        }
        return eval;
    }

    //pseudo selectors :first-child, :last-child, :nth-child, ...
    // "An+B" form: group 1 = signed step, group 2 = step sign, group 3 = step digits,
    // group 4 = signed offset, which may contain whitespace around its sign (e.g. " + 1")
    private static final Pattern NthStepOffset = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
    // plain "B" form: an optionally signed integer
    private static final Pattern NthOffset = Pattern.compile("([+-])?(\\d+)");

    /**
     * Parse the argument of {@code :nth-child()}, {@code :nth-last-child()}, {@code :nth-of-type()} or
     * {@code :nth-last-of-type()}. Accepts {@code odd}, {@code even}, a plain index such as {@code 3}, or a
     * step/offset expression such as {@code 2n+1}, {@code -n+3} or {@code 2n + 1} (whitespace around the offset sign
     * is allowed).
     * @param last true for the {@code -last-} variants
     * @param ofType true for the {@code -of-type} variants
     * @return the matching nth evaluator
     * @throws Selector.SelectorParseException if the argument is in none of the accepted forms
     */
    private Evaluator cssNthChild(boolean last, boolean ofType) {
        String arg = normalize(consumeParens()); // arg is like "odd", or "-n+2", within nth-child(odd)
        final int step, offset;
        if ("odd".equals(arg)) {
            step = 2;
            offset = 1;
        } else if ("even".equals(arg)) {
            step = 2;
            offset = 0;
        } else {
            Matcher stepOffsetM, stepM;
            if ((stepOffsetM = NthStepOffset.matcher(arg)).matches()) {
                if (stepOffsetM.group(3) != null) // has digits, like 3n+2 or -3n+2
                    step = parseSignedInt(stepOffsetM.group(1));
                else // no digits, might be like n+2, or -n+2. if group(2) == "-", it's -1;
                    step = "-".equals(stepOffsetM.group(2)) ? -1 : 1;
                // group(4) can carry whitespace (e.g. " + 1"), which Integer.parseInt rejects; parseSignedInt strips it
                offset = stepOffsetM.group(4) != null ? parseSignedInt(stepOffsetM.group(4)) : 0;
            } else if ((stepM = NthOffset.matcher(arg)).matches()) {
                step = 0;
                offset = parseSignedInt(stepM.group());
            } else {
                throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg);
            }
        }

        return ofType
            ? (last ? new Evaluator.IsNthLastOfType(step, offset) : new Evaluator.IsNthOfType(step, offset))
            : (last ? new Evaluator.IsNthLastChild(step, offset) : new Evaluator.IsNthChild(step, offset));
    }

    /**
     * Parse an optionally signed integer, ignoring any whitespace and an optional leading {@code +}.
     * @param text text such as {@code "3"}, {@code "+3"}, {@code "-3"} or {@code " + 3"}
     * @return the parsed value
     * @throws NumberFormatException if what remains is not a valid int
     */
    private static int parseSignedInt(String text) {
        String compact = text.replaceAll("\\s+", "");
        if (compact.startsWith("+"))
            compact = compact.substring(1);
        return Integer.parseInt(compact);
    }

    // consume a balanced "(...)" group from the queue and return what chompBalanced yields for it
    private String consumeParens() {
        return tq.chompBalanced('(', ')');
    }

    // numeric argument of :lt(n), :gt(n), :eq(n)
    private int consumeIndex() {
        String index = consumeParens().trim();
        Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric");
        return Integer.parseInt(index);
    }

    // pseudo selector :has(el)
    private Evaluator has() {
        String subQuery = consumeParens();
        Validate.notEmpty(subQuery, ":has(selector) sub-select must not be empty");
        return new StructuralEvaluator.Has(parse(subQuery));
    }

    // pseudo selector :is()
    private Evaluator is() {
        String subQuery = consumeParens();
        Validate.notEmpty(subQuery, ":is(selector) sub-select must not be empty");
        return new StructuralEvaluator.Is(parse(subQuery));
    }

    // pseudo selector :contains(text), containsOwn(text)
    private Evaluator contains(boolean own) {
        String selectorName = own ? ":containsOwn" : ":contains";
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, selectorName + "(text) query must not be empty");
        return own
            ? new Evaluator.ContainsOwnText(searchText)
            : new Evaluator.ContainsText(searchText);
    }

    // pseudo selector :containsWholeText(text), containsWholeOwnText(text)
    private Evaluator containsWholeText(boolean own) {
        String selectorName = own ? ":containsWholeOwnText" : ":containsWholeText";
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, selectorName + "(text) query must not be empty");
        return own
            ? new Evaluator.ContainsWholeOwnText(searchText)
            : new Evaluator.ContainsWholeText(searchText);
    }

    // pseudo selector :containsData(data)
    private Evaluator containsData() {
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, ":containsData(text) query must not be empty");
        return new Evaluator.ContainsData(searchText);
    }

    // :matches(regex), matchesOwn(regex)
    private Evaluator matches(boolean own) {
        String selectorName = own ? ":matchesOwn" : ":matches";
        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
        Validate.notEmpty(regex, selectorName + "(regex) query must not be empty");

        return own
            ? new Evaluator.MatchesOwn(Pattern.compile(regex))
            : new Evaluator.Matches(Pattern.compile(regex));
    }

    // :matchesWholeText(regex), matchesWholeOwnText(regex)
    private Evaluator matchesWholeText(boolean own) {
        String selectorName = own ? ":matchesWholeOwnText" : ":matchesWholeText";
        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
        Validate.notEmpty(regex, selectorName + "(regex) query must not be empty");

        return own
            ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex))
            : new Evaluator.MatchesWholeText(Pattern.compile(regex));
    }

    // :not(selector)
    private Evaluator not() {
        String subQuery = consumeParens();
        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");

        return new StructuralEvaluator.Not(parse(subQuery));
    }

    /**
     * @return the (trimmed) query string this parser was created with
     */
    @Override
    public String toString() {
        return query;
    }
}