package org.jsoup.select;

import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.parser.TokenQueue;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
import static org.jsoup.internal.Normalizer.normalize;

/**
 * Parses a CSS selector into an Evaluator tree.
 * <p>
 * The parser walks the query with a {@link TokenQueue}, collecting simple evaluators (tag, id, class, attribute,
 * pseudo selectors) into a list that is ANDed together, and restructuring that list whenever a combinator
 * ({@code ,} {@code >} {@code +} {@code ~} or whitespace) is met.
 */
public class QueryParser {
    /** Characters that separate compound selectors; a space is the descendant combinator. */
    private final static char[] Combinators = {',', '>', '+', '~', ' '};
    /** Operators that may follow an attribute key inside {@code [...]}. */
    private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};

    private final TokenQueue tq;
    private final String query;
    private final List<Evaluator> evals = new ArrayList<>();

    /**
     * Create a new QueryParser.
     * @param query CSS query; must not be empty. It is trimmed before parsing.
     */
    private QueryParser(String query) {
        Validate.notEmpty(query);
        query = query.trim();
        this.query = query;
        this.tq = new TokenQueue(query);
    }

    /**
     * Parse a CSS query into an Evaluator.
     * @param query CSS query
     * @return Evaluator
     * @throws Selector.SelectorParseException if an {@link IllegalArgumentException} (which includes the JDK's
     * {@code NumberFormatException} and {@code PatternSyntaxException}) is raised while parsing
     * @see Selector selector query syntax
     */
    public static Evaluator parse(String query) {
        try {
            QueryParser p = new QueryParser(query);
            return p.parse();
        } catch (IllegalArgumentException e) {
            throw new Selector.SelectorParseException(e.getMessage());
        }
    }

    /**
     * Parse the query
     * @return Evaluator; the single collected evaluator, or an AND of all collected evaluators
     */
    Evaluator parse() {
        tq.consumeWhitespace();

        if (tq.matchesAny(Combinators)) { // if starts with a combinator, use root as elements
            evals.add(new StructuralEvaluator.Root());
            combinator(tq.consume());
        } else {
            evals.add(consumeEvaluator());
        }

        while (!tq.isEmpty()) {
            // hierarchy and extras
            boolean seenWhite = tq.consumeWhitespace();

            if (tq.matchesAny(Combinators)) {
                combinator(tq.consume());
            } else if (seenWhite) {
                combinator(' ');
            } else { // E.class, E#id, E[attr] etc. AND
                evals.add(consumeEvaluator()); // take next el, #. etc off queue
            }
        }

        if (evals.size() == 1)
            return evals.get(0);

        return new CombiningEvaluator.And(evals);
    }

    /**
     * Consumes the sub query that follows a combinator, and folds it together with the evaluators collected so far
     * into a single evaluator, which becomes the only entry in {@link #evals}.
     * @param combinator one of {@code ,} {@code >} {@code +} {@code ~} or a space
     */
    private void combinator(char combinator) {
        tq.consumeWhitespace();
        String subQuery = consumeSubQuery(); // support multi > childs

        Evaluator rootEval; // the new topmost evaluator
        Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or.
        Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator
        boolean replaceRightMost = false;

        if (evals.size() == 1) {
            rootEval = currentEval = evals.get(0);
            // make sure OR (,) has precedence:
            if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') {
                currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator();
                assert currentEval != null; // rightMost signature can return null (if none set), but always will have one by this point
                replaceRightMost = true;
            }
        }
        else {
            rootEval = currentEval = new CombiningEvaluator.And(evals);
        }
        evals.clear();

        // for most combinators: change the current eval into an AND of the current eval and the new eval
        switch (combinator) {
            case '>':
                ImmediateParentRun run = currentEval instanceof ImmediateParentRun ?
                    (ImmediateParentRun) currentEval : new ImmediateParentRun(currentEval);
                run.add(newEval);
                currentEval = run;
                break;
            case ' ':
                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.Parent(currentEval), newEval);
                break;
            case '+':
                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.ImmediatePreviousSibling(currentEval), newEval);
                break;
            case '~':
                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.PreviousSibling(currentEval), newEval);
                break;
            case ',':
                CombiningEvaluator.Or or;
                if (currentEval instanceof CombiningEvaluator.Or) {
                    or = (CombiningEvaluator.Or) currentEval;
                } else {
                    or = new CombiningEvaluator.Or();
                    or.add(currentEval);
                }
                or.add(newEval);
                currentEval = or;
                break;
            default:
                throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
        }

        if (replaceRightMost)
            ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval);
        else rootEval = currentEval;
        evals.add(rootEval);
    }

    /**
     * Consumes from the queue up to (but not including) the first combinator that follows a non-combinator
     * character. Leading combinators are kept, and parenthesised / bracketed groups and backslash escapes are copied
     * through whole so that combinator characters inside them do not end the sub query.
     * @return the sub query text
     */
    private String consumeSubQuery() {
        StringBuilder sq = StringUtil.borrowBuilder();
        boolean seenClause = false; // eat until we hit a combinator after eating something else
        while (!tq.isEmpty()) {
            if (tq.matchesAny(Combinators)) {
                if (seenClause)
                    break;
                sq.append(tq.consume());
                continue;
            }
            seenClause = true;
            if (tq.matches("("))
                sq.append("(").append(tq.chompBalanced('(', ')')).append(")");
            else if (tq.matches("["))
                sq.append("[").append(tq.chompBalanced('[', ']')).append("]");
            else if (tq.matches("\\")) { // bounce over escapes
                sq.append(tq.consume());
                if (!tq.isEmpty()) sq.append(tq.consume());
            } else
                sq.append(tq.consume());
        }
        return StringUtil.releaseBuilder(sq);
    }

    /**
     * Consumes one simple selector from the queue, dispatching on its leading token.
     * @return the evaluator for that selector
     * @throws Selector.SelectorParseException if the next token does not start a known selector
     */
    private Evaluator consumeEvaluator() {
        if (tq.matchChomp("#"))
            return byId();
        else if (tq.matchChomp("."))
            return byClass();
        else if (tq.matchesWord() || tq.matches("*|"))
            return byTag();
        else if (tq.matches("["))
            return byAttribute();
        else if (tq.matchChomp("*"))
            return new Evaluator.AllElements();
        else if (tq.matchChomp(":"))
            return parsePseudoSelector();
        else // unhandled
            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
    }

    /**
     * Parses a pseudo selector; the leading {@code :} has already been consumed by the caller.
     * @return the evaluator for the named pseudo selector
     * @throws Selector.SelectorParseException if the name is not a supported pseudo selector
     */
    private Evaluator parsePseudoSelector() {
        final String pseudo = tq.consumeCssIdentifier();
        switch (pseudo) {
            case "lt":
                return new Evaluator.IndexLessThan(consumeIndex());
            case "gt":
                return new Evaluator.IndexGreaterThan(consumeIndex());
            case "eq":
                return new Evaluator.IndexEquals(consumeIndex());
            case "has":
                return has();
            case "is":
                return is();
            case "contains":
                return contains(false);
            case "containsOwn":
                return contains(true);
            case "containsWholeText":
                return containsWholeText(false);
            case "containsWholeOwnText":
                return containsWholeText(true);
            case "containsData":
                return containsData();
            case "matches":
                return matches(false);
            case "matchesOwn":
                return matches(true);
            case "matchesWholeText":
                return matchesWholeText(false);
            case "matchesWholeOwnText":
                return matchesWholeText(true);
            case "not":
                return not();
            case "nth-child":
                return cssNthChild(false, false);
            case "nth-last-child":
                return cssNthChild(true, false);
            case "nth-of-type":
                return cssNthChild(false, true);
            case "nth-last-of-type":
                return cssNthChild(true, true);
            case "first-child":
                return new Evaluator.IsFirstChild();
            case "last-child":
                return new Evaluator.IsLastChild();
            case "first-of-type":
                return new Evaluator.IsFirstOfType();
            case "last-of-type":
                return new Evaluator.IsLastOfType();
            case "only-child":
                return new Evaluator.IsOnlyChild();
            case "only-of-type":
                return new Evaluator.IsOnlyOfType();
            case "empty":
                return new Evaluator.IsEmpty();
            case "root":
                return new Evaluator.IsRoot();
            case "matchText":
                return new Evaluator.MatchText();
            default:
                throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
        }
    }

    // #id; the leading # has already been consumed
    private Evaluator byId() {
        String id = tq.consumeCssIdentifier();
        Validate.notEmpty(id);
        return new Evaluator.Id(id);
    }

    // .class; the leading . has already been consumed
    private Evaluator byClass() {
        String className = tq.consumeCssIdentifier();
        Validate.notEmpty(className);
        return new Evaluator.Class(className.trim());
    }

    // tag, *|tag, ns|*, ns|tag
    private Evaluator byTag() {
        // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make
        // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for
        // consistency - both the selector and the element tag
        String tagName = normalize(tq.consumeElementSelector());
        Validate.notEmpty(tagName);

        // namespaces:
        if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName
            String plainTag = tagName.substring(2); // strip *|
            return new CombiningEvaluator.Or(
                new Evaluator.Tag(plainTag),
                new Evaluator.TagEndsWith(":" + plainTag)
            );
        } else if (tagName.endsWith("|*")) { // ns|*
            String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns:
            return new Evaluator.TagStartsWith(ns);
        } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def"
            tagName = tagName.replace("|", ":");
        }

        return new Evaluator.Tag(tagName);
    }

    // [attr], [^attrPrefix], [*], [attr=val], [attr!=val], [attr^=val], [attr$=val], [attr*=val], [attr~=regex]
    private Evaluator byAttribute() {
        TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue
        String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
        Validate.notEmpty(key);
        cq.consumeWhitespace();
        final Evaluator eval;

        if (cq.isEmpty()) {
            if (key.startsWith("^"))
                eval = new Evaluator.AttributeStarting(key.substring(1));
            else if (key.equals("*")) // any attribute
                eval = new Evaluator.AttributeStarting("");
            else
                eval = new Evaluator.Attribute(key);
        } else {
            if (cq.matchChomp("="))
                eval = new Evaluator.AttributeWithValue(key, cq.remainder());
            else if (cq.matchChomp("!="))
                eval = new Evaluator.AttributeWithValueNot(key, cq.remainder());
            else if (cq.matchChomp("^="))
                eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder());
            else if (cq.matchChomp("$="))
                eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder());
            else if (cq.matchChomp("*="))
                eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder());
            else if (cq.matchChomp("~="))
                eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()));
            else
                throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
        }
        return eval;
    }

    //pseudo selectors :first-child, :last-child, :nth-child, ...
    // An+B form. group(1) = signed step, group(2) = step sign, group(3) = step digits, group(4) = signed offset
    // (which may contain whitespace around its sign)
    private static final Pattern NTH_AB = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
    // plain B form: an optionally signed integer
    private static final Pattern NTH_B = Pattern.compile("([+-])?(\\d+)");

    /**
     * Parses the argument of {@code :nth-child()}, {@code :nth-last-child()}, {@code :nth-of-type()} or
     * {@code :nth-last-of-type()}. Accepts {@code odd}, {@code even}, the {@code An+B} form (such as {@code 2n+1},
     * {@code -n+3}, {@code 2n + 1}) and a plain index {@code B}.
     * @param backwards if the index counts from the last sibling
     * @param ofType if only siblings of the same type are counted
     * @return the matching nth evaluator
     * @throws Selector.SelectorParseException if the argument is in none of the accepted forms
     */
    private Evaluator cssNthChild(boolean backwards, boolean ofType) {
        String arg = normalize(consumeParens());
        Matcher mAB = NTH_AB.matcher(arg);
        Matcher mB = NTH_B.matcher(arg);
        final int a, b;
        if ("odd".equals(arg)) {
            a = 2;
            b = 1;
        } else if ("even".equals(arg)) {
            a = 2;
            b = 0;
        } else if (mAB.matches()) {
            if (mAB.group(3) != null) // has step digits, like 3n+2 or -3n+2
                a = parseNthInt(mAB.group(1));
            else // bare n: the step is 1, or -1 when written as -n (e.g. -n+3 selects the first three)
                a = "-".equals(mAB.group(2)) ? -1 : 1;
            b = mAB.group(4) != null ? parseNthInt(mAB.group(4)) : 0;
        } else if (mB.matches()) {
            a = 0;
            b = parseNthInt(mB.group());
        } else {
            throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg);
        }

        final Evaluator eval;
        if (ofType)
            if (backwards)
                eval = new Evaluator.IsNthLastOfType(a, b);
            else
                eval = new Evaluator.IsNthOfType(a, b);
        else {
            if (backwards)
                eval = (new Evaluator.IsNthLastChild(a, b));
            else
                eval = new Evaluator.IsNthChild(a, b);
        }
        return eval;
    }

    /**
     * Parses an optionally signed integer captured by {@link #NTH_AB} or {@link #NTH_B}. Whitespace is removed
     * first because NTH_AB allows it around the offset sign (" + 1"), which Integer.parseInt would reject.
     */
    private static int parseNthInt(String text) {
        return Integer.parseInt(text.replaceAll("\\s+", "").replaceFirst("^\\+", ""));
    }

    // consumes a balanced (...) group from the queue and returns its content
    private String consumeParens() {
        return tq.chompBalanced('(', ')');
    }

    // consumes the numeric argument of :lt(n), :gt(n), :eq(n)
    private int consumeIndex() {
        String index = consumeParens().trim();
        Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric");
        return Integer.parseInt(index);
    }

    // pseudo selector :has(el)
    private Evaluator has() {
        String subQuery = consumeParens();
        Validate.notEmpty(subQuery, ":has(selector) sub-select must not be empty");
        return new StructuralEvaluator.Has(parse(subQuery));
    }

    // pseudo selector :is()
    private Evaluator is() {
        String subQuery = consumeParens();
        Validate.notEmpty(subQuery, ":is(selector) sub-select must not be empty");
        return new StructuralEvaluator.Is(parse(subQuery));
    }

    // pseudo selector :contains(text), containsOwn(text)
    private Evaluator contains(boolean own) {
        String selectorName = own ? ":containsOwn" : ":contains";
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, selectorName + "(text) query must not be empty");
        return own
            ? new Evaluator.ContainsOwnText(searchText)
            : new Evaluator.ContainsText(searchText);
    }

    // pseudo selector :containsWholeText(text), containsWholeOwnText(text)
    private Evaluator containsWholeText(boolean own) {
        String selectorName = own ? ":containsWholeOwnText" : ":containsWholeText";
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, selectorName + "(text) query must not be empty");
        return own
            ? new Evaluator.ContainsWholeOwnText(searchText)
            : new Evaluator.ContainsWholeText(searchText);
    }

    // pseudo selector :containsData(data)
    private Evaluator containsData() {
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, ":containsData(text) query must not be empty");
        return new Evaluator.ContainsData(searchText);
    }

    // :matches(regex), matchesOwn(regex)
    private Evaluator matches(boolean own) {
        String selectorName = own ? ":matchesOwn" : ":matches";
        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
        Validate.notEmpty(regex, selectorName + "(regex) query must not be empty");

        return own
            ? new Evaluator.MatchesOwn(Pattern.compile(regex))
            : new Evaluator.Matches(Pattern.compile(regex));
    }

    // :matchesWholeText(regex), matchesWholeOwnText(regex)
    private Evaluator matchesWholeText(boolean own) {
        String selectorName = own ? ":matchesWholeOwnText" : ":matchesWholeText";
        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
        Validate.notEmpty(regex, selectorName + "(regex) query must not be empty");

        return own
            ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex))
            : new Evaluator.MatchesWholeText(Pattern.compile(regex));
    }

    // :not(selector)
    private Evaluator not() {
        String subQuery = consumeParens();
        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");

        return new StructuralEvaluator.Not(parse(subQuery));
    }

    /**
     * @return the trimmed query this parser was created with
     */
    @Override
    public String toString() {
        return query;
    }
}