package org.jsoup.select;

import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.parser.TokenQueue;
import org.jspecify.annotations.Nullable;

import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
import static org.jsoup.internal.Normalizer.normalize;

/**
 * Parses a CSS selector into an Evaluator tree.
 */
public class QueryParser {
    private static final char[] Combinators = {'>', '+', '~'}; // ' ' is also a combinator, but found implicitly
    private static final String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};
    private static final char[] SequenceEnders = {',', ')'};

    private final TokenQueue tq;
    private final String query;

    /**
     * Create a new QueryParser.
     * @param query CSS query; validated as non-empty, then trimmed before being queued for parsing
     */
    private QueryParser(String query) {
        Validate.notEmpty(query);
        query = query.trim();
        this.query = query;
        this.tq = new TokenQueue(query);
    }

    /**
     Parse a CSS query into an Evaluator. If you are evaluating the same query repeatedly, it may be more efficient to
     parse it once and reuse the Evaluator.

     @param query CSS query
     @return Evaluator
     @throws Selector.SelectorParseException if the query could not be parsed. Any IllegalArgumentException raised
     while parsing (e.g. a failed validation, a bad regex, or a bad number) is rethrown as this type, with the
     original exception attached as its cause.
     @see Selector selector query syntax
     */
    public static Evaluator parse(String query) {
        try {
            QueryParser p = new QueryParser(query);
            return p.parse();
        } catch (IllegalArgumentException e) {
            // same type and message as before; the cause is attached so the original stack trace is not lost
            Selector.SelectorParseException wrapped = new Selector.SelectorParseException(e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /**
     Parse the query. We use this simplified expression of the grammar:
     <pre>
     SelectorGroup  ::= Selector (',' Selector)*
     Selector       ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
     SimpleSequence ::= [ TypeSelector ] ( ID | Class | Attribute | Pseudo )*
     Pseudo         ::= ':' Name [ '(' SelectorGroup ')' ]
     Combinator     ::= S+         // descendant (whitespace)
                      | '>'        // child
                      | '+'        // adjacent sibling
                      | '~'        // general sibling
     </pre>

     See <a href="https://www.w3.org/TR/selectors-4/#grammar">selectors-4</a> for the real thing

     @return the Evaluator for the whole query
     @throws Selector.SelectorParseException if input remains after the selector group has been parsed
     */
    Evaluator parse() {
        Evaluator eval = parseSelectorGroup();
        tq.consumeWhitespace();
        if (!tq.isEmpty())
            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
        return eval;
    }

    /** Parse a comma separated group of selectors; more than one selector is combined into an Or. */
    Evaluator parseSelectorGroup() {
        // SelectorGroup. Into an Or if > 1 Selector
        Evaluator left = parseSelector();
        while (tq.matchChomp(',')) {
            Evaluator right = parseSelector();
            left = or(left, right);
        }
        return left;
    }

    /** Parse one selector: simple sequences joined left-to-right by combinators. */
    Evaluator parseSelector() {
        // Selector ::= [ Combinator ] SimpleSequence ( Combinator SimpleSequence )*
        tq.consumeWhitespace();

        Evaluator left;
        if (tq.matchesAny(Combinators)) {
            // e.g. query is "> div"; left side is root element
            left = new StructuralEvaluator.Root();
        } else {
            left = parseSimpleSequence();
        }

        while (true) {
            char combinator = 0;
            if (tq.consumeWhitespace())
                combinator = ' ';            // maybe descendant?
            if (tq.matchesAny(Combinators)) // no, explicit
                combinator = tq.consume();
            else if (tq.matchesAny(SequenceEnders)) // , - space after simple like "foo , bar"; ) - close of :has()
                break;

            if (combinator != 0) {
                Evaluator right = parseSimpleSequence();
                left = combinator(left, combinator, right);
            } else {
                break;
            }
        }
        return left;
    }

    /**
     Parse a simple sequence: an optional type selector followed by any number of subclass selectors.
     @throws Selector.SelectorParseException if nothing could be parsed at the current position
     */
    Evaluator parseSimpleSequence() {
        // SimpleSequence ::= TypeSelector? ( ID | Class | Attribute | Pseudo )*
        Evaluator left = null;
        tq.consumeWhitespace();

        // one optional type selector
        if (tq.matchesWord() || tq.matches("*|"))
            left = byTag();
        else if (tq.matchChomp('*'))
            left = new Evaluator.AllElements();

        // zero or more subclasses (#, ., [, :)
        while (true) {
            Evaluator right = parseSubclass();
            if (right != null)
                left = and(left, right);
            else break; // no more simple tokens
        }

        if (left == null)
            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
        return left;
    }

    /**
     Combine the left and right evaluators according to the combinator character.
     @param left evaluator for the left-hand side of the combinator
     @param combinator one of {@code '>'}, {@code ' '}, {@code '+'}, {@code '~'}
     @param right evaluator for the right-hand side of the combinator
     @return the combined evaluator
     @throws Selector.SelectorParseException if the combinator is not one of the above
     */
    static Evaluator combinator(Evaluator left, char combinator, Evaluator right) {
        switch (combinator) {
            case '>':
                // extend an existing run of child combinators rather than nesting a new one
                ImmediateParentRun run = left instanceof ImmediateParentRun ?
                    (ImmediateParentRun) left : new ImmediateParentRun(left);
                run.add(right);
                return run;
            case ' ':
                return and(new StructuralEvaluator.Ancestor(left), right);
            case '+':
                return and(new StructuralEvaluator.ImmediatePreviousSibling(left), right);
            case '~':
                return and(new StructuralEvaluator.PreviousSibling(left), right);
            default:
                throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
        }
    }

    /** Parse one subclass selector at the current position, or return null if there is none. */
    @Nullable Evaluator parseSubclass() {
        //  Subclass: ID | Class | Attribute | Pseudo
        if      (tq.matchChomp('#'))    return byId();
        else if (tq.matchChomp('.'))    return byClass();
        else if (tq.matches('['))       return byAttribute(); // '[' is left in place; byAttribute chomps the balanced [...]
        else if (tq.matchChomp(':'))    return parsePseudoSelector();
        else                            return null;
    }

    /** Merge two evals into an Or. If left is already an Or, right is added to it. */
    static Evaluator or(Evaluator left, Evaluator right) {
        if (left instanceof CombiningEvaluator.Or) {
            ((CombiningEvaluator.Or) left).add(right);
            return left;
        }
        return new CombiningEvaluator.Or(left, right);
    }

    /** Merge two evals into an And. A null left yields right; if left is already an And, right is added to it. */
    static Evaluator and(@Nullable Evaluator left, Evaluator right) {
        if (left == null) return right;
        if (left instanceof CombiningEvaluator.And) {
            ((CombiningEvaluator.And) left).add(right);
            return left;
        }
        return new CombiningEvaluator.And(left, right);
    }

    /**
     Parse a pseudo selector; the leading ':' has already been consumed by the caller.
     @throws Selector.SelectorParseException if the pseudo selector name is not recognized
     */
    private Evaluator parsePseudoSelector() {
        final String pseudo = tq.consumeCssIdentifier();
        switch (pseudo) {
            case "lt":
                return new Evaluator.IndexLessThan(consumeIndex());
            case "gt":
                return new Evaluator.IndexGreaterThan(consumeIndex());
            case "eq":
                return new Evaluator.IndexEquals(consumeIndex());
            case "has":
                return has();
            case "is":
                return is();
            case "contains":
                return contains(false);
            case "containsOwn":
                return contains(true);
            case "containsWholeText":
                return containsWholeText(false);
            case "containsWholeOwnText":
                return containsWholeText(true);
            case "containsData":
                return containsData();
            case "matches":
                return matches(false);
            case "matchesOwn":
                return matches(true);
            case "matchesWholeText":
                return matchesWholeText(false);
            case "matchesWholeOwnText":
                return matchesWholeText(true);
            case "not":
                return not();
            case "nth-child":
                return cssNthChild(false, false);
            case "nth-last-child":
                return cssNthChild(true, false);
            case "nth-of-type":
                return cssNthChild(false, true);
            case "nth-last-of-type":
                return cssNthChild(true, true);
            case "first-child":
                return new Evaluator.IsFirstChild();
            case "last-child":
                return new Evaluator.IsLastChild();
            case "first-of-type":
                return new Evaluator.IsFirstOfType();
            case "last-of-type":
                return new Evaluator.IsLastOfType();
            case "only-child":
                return new Evaluator.IsOnlyChild();
            case "only-of-type":
                return new Evaluator.IsOnlyOfType();
            case "empty":
                return new Evaluator.IsEmpty();
            case "root":
                return new Evaluator.IsRoot();
            case "matchText":
                return new Evaluator.MatchText();
            default:
                throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
        }
    }

    // #id; the '#' has already been consumed
    private Evaluator byId() {
        String id = tq.consumeCssIdentifier();
        Validate.notEmpty(id);
        return new Evaluator.Id(id);
    }

    // .class; the '.' has already been consumed
    private Evaluator byClass() {
        String className = tq.consumeCssIdentifier();
        Validate.notEmpty(className);
        return new Evaluator.Class(className.trim());
    }

    // type selector, including the namespace forms *|tag, ns|*, and ns|tag
    private Evaluator byTag() {
        // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make
        // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for
        // consistency - both the selector and the element tag
        String tagName = normalize(tq.consumeElementSelector());
        Validate.notEmpty(tagName);

        // namespaces:
        if (tagName.startsWith("*|")) { // namespaces: wildcard match equals(tagName) or ending in ":"+tagName
            String plainTag = tagName.substring(2); // strip *|
            return new CombiningEvaluator.Or(
                new Evaluator.Tag(plainTag),
                new Evaluator.TagEndsWith(":" + plainTag)
            );
        } else if (tagName.endsWith("|*")) { // ns|*
            String ns = tagName.substring(0, tagName.length() - 2) + ":"; // strip |*, to ns:
            return new Evaluator.TagStartsWith(ns);
        } else if (tagName.contains("|")) { // flip "abc|def" to "abc:def"
            tagName = tagName.replace("|", ":");
        }

        return new Evaluator.Tag(tagName);
    }

    // [attr], [^attrPrefix], [*], and [attr OP value] where OP is one of AttributeEvals
    private Evaluator byAttribute() {
        TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue
        String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
        Validate.notEmpty(key);
        cq.consumeWhitespace();
        final Evaluator eval;

        if (cq.isEmpty()) {
            // no operator or value: a key-only test
            if (key.startsWith("^"))
                eval = new Evaluator.AttributeStarting(key.substring(1));
            else if (key.equals("*")) // any attribute
                eval = new Evaluator.AttributeStarting("");
            else
                eval = new Evaluator.Attribute(key);
        } else {
            if (cq.matchChomp('='))
                eval = new Evaluator.AttributeWithValue(key, cq.remainder());
            else if (cq.matchChomp("!="))
                eval = new Evaluator.AttributeWithValueNot(key, cq.remainder());
            else if (cq.matchChomp("^="))
                eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder());
            else if (cq.matchChomp("$="))
                eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder());
            else if (cq.matchChomp("*="))
                eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder());
            else if (cq.matchChomp("~="))
                eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()));
            else
                throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
        }
        return eval;
    }

    //pseudo selectors :first-child, :last-child, :nth-child, ...
    private static final Pattern NthStepOffset = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
    private static final Pattern NthOffset = Pattern.compile("([+-])?(\\d+)");
    private static final Pattern NthWhitespace = Pattern.compile("\\s+");

    /**
     Parse the argument of an nth-style pseudo selector into a step and offset.
     @param last true for the nth-last-* variants
     @param ofType true for the *-of-type variants
     @throws Selector.SelectorParseException if the argument is not odd, even, a step/offset expression, or an integer
     */
    private Evaluator cssNthChild(boolean last, boolean ofType) {
        String arg = normalize(consumeParens()); // arg is like "odd", or "-n+2", within nth-child(odd)
        final int step, offset;
        if ("odd".equals(arg)) {
            step = 2;
            offset = 1;
        } else if ("even".equals(arg)) {
            step = 2;
            offset = 0;
        } else {
            Matcher stepOffsetM, stepM;
            if ((stepOffsetM = NthStepOffset.matcher(arg)).matches()) {
                if (stepOffsetM.group(3) != null) // has digits, like 3n+2 or -3n+2
                    step = parseNthInt(stepOffsetM.group(1));
                else // no digits, might be like n+2, or -n+2. if group(2) == "-", it's -1
                    step = "-".equals(stepOffsetM.group(2)) ? -1 : 1;
                // group(4) may carry whitespace around the sign, e.g. " + 1" from "2n + 1"
                offset = stepOffsetM.group(4) != null ? parseNthInt(stepOffsetM.group(4)) : 0;
            } else if ((stepM = NthOffset.matcher(arg)).matches()) {
                step = 0;
                offset = parseNthInt(stepM.group());
            } else {
                throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg);
            }
        }

        return ofType
            ? (last ? new Evaluator.IsNthLastOfType(step, offset) : new Evaluator.IsNthOfType(step, offset))
            : (last ? new Evaluator.IsNthLastChild(step, offset) : new Evaluator.IsNthChild(step, offset));
    }

    /**
     Parse a signed integer captured from an nth-index expression. Whitespace is removed first, because the
     NthStepOffset pattern admits it around the offset's sign and Integer.parseInt does not accept it; a leading '+'
     is then dropped. The pattern is lenient about a missing sign, so an unsigned offset is read as positive.
     */
    private static int parseNthInt(String text) {
        String compact = NthWhitespace.matcher(text).replaceAll("");
        return Integer.parseInt(compact.replaceFirst("^\\+", ""));
    }

    // the balanced content of the (...) at the current position
    private String consumeParens() {
        return tq.chompBalanced('(', ')');
    }

    // numeric argument of :lt(n), :gt(n), :eq(n)
    private int consumeIndex() {
        String index = consumeParens().trim();
        Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric");
        return Integer.parseInt(index);
    }

    // pseudo selector :has(el)
    private Evaluator has() {
        return parseNested(StructuralEvaluator.Has::new, ":has() must have a selector");
    }

    // pseudo selector :is()
    private Evaluator is() {
        return parseNested(StructuralEvaluator.Is::new, ":is() must have a selector");
    }

    // parses "(" SelectorGroup ")" on this parser's own queue, and wraps the result with func
    private Evaluator parseNested(Function<Evaluator, Evaluator> func, String err) {
        Validate.isTrue(tq.matchChomp('('), err);
        Evaluator eval = parseSelectorGroup();
        Validate.isTrue(tq.matchChomp(')'), err);
        return func.apply(eval);
    }

    // pseudo selector :contains(text), containsOwn(text)
    private Evaluator contains(boolean own) {
        String name = own ? ":containsOwn" : ":contains";
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, name + "(text) query must not be empty");
        return own
            ? new Evaluator.ContainsOwnText(searchText)
            : new Evaluator.ContainsText(searchText);
    }

    // pseudo selector :containsWholeText(text), :containsWholeOwnText(text)
    private Evaluator containsWholeText(boolean own) {
        String name = own ? ":containsWholeOwnText" : ":containsWholeText";
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, name + "(text) query must not be empty");
        return own
            ? new Evaluator.ContainsWholeOwnText(searchText)
            : new Evaluator.ContainsWholeText(searchText);
    }

    // pseudo selector :containsData(data)
    private Evaluator containsData() {
        String searchText = TokenQueue.unescape(consumeParens());
        Validate.notEmpty(searchText, ":containsData(text) query must not be empty");
        return new Evaluator.ContainsData(searchText);
    }

    // :matches(regex), matchesOwn(regex)
    private Evaluator matches(boolean own) {
        String name = own ? ":matchesOwn" : ":matches";
        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
        Validate.notEmpty(regex, name + "(regex) query must not be empty");

        return own
            ? new Evaluator.MatchesOwn(Pattern.compile(regex))
            : new Evaluator.Matches(Pattern.compile(regex));
    }

    // :matchesWholeText(regex), :matchesWholeOwnText(regex)
    private Evaluator matchesWholeText(boolean own) {
        String name = own ? ":matchesWholeOwnText" : ":matchesWholeText";
        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
        Validate.notEmpty(regex, name + "(regex) query must not be empty");

        return own
            ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex))
            : new Evaluator.MatchesWholeText(Pattern.compile(regex));
    }

    // :not(selector)
    private Evaluator not() {
        String subQuery = consumeParens();
        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");

        // the sub-query goes through the static parse(String), i.e. a separate parser over just the sub-query
        return new StructuralEvaluator.Not(parse(subQuery));
    }

    /** Returns the (trimmed) query this parser was created with. */
    @Override
    public String toString() {
        return query;
    }
}