001package org.jsoup.select; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.nodes.Element; 005import org.jsoup.parser.TokenQueue; 006import org.jspecify.annotations.Nullable; 007 008import java.util.Collection; 009import java.util.HashSet; 010import java.util.stream.Stream; 011 012/** 013 * CSS-like element selector, that finds elements matching a query. 014 * 015 * <h2>Selector syntax</h2> 016 * <p> 017 * A selector is a chain of simple selectors, separated by combinators. Selectors are <b>case insensitive</b> (including against 018 * elements, attributes, and attribute values). 019 * </p> 020 * <p> 021 * The universal selector {@code *} is implicit when no element selector is supplied (i.e. {@code .header} and {@code *.header} 022 * are equivalent). 023 * </p> 024 * <style>table.syntax tr td {vertical-align: top; padding-right: 2em; padding-top:0.5em; padding-bottom:0.5em; } table.syntax tr:hover{background-color: #eee;} table.syntax {border-spacing: 0px 0px;}</style> 025 * <table summary="" class="syntax"><colgroup><col span="1" style="width: 20%;"><col span="1" style="width: 40%;"><col span="1" style="width: 40%;"></colgroup> 026 * <tr><th align="left">Pattern</th><th align="left">Matches</th><th align="left">Example</th></tr> 027 * <tr><td><code>*</code></td><td>any element</td><td><code>*</code></td></tr> 028 * <tr><td><code>tag</code></td><td>elements with the given tag name</td><td><code>div</code></td></tr> 029 * <tr><td><code>*|E</code></td><td>elements of type E in any namespace (including non-namespaced)</td><td><code>*|name</code> finds <code><dc:name></code> and <code><name></code> elements</td></tr> 030 * <tr><td><code>ns|E</code></td><td>elements of type E in the namespace <i>ns</i></td><td><code>dc|name</code> finds <code><dc:name></code> elements</td></tr> 031 * <tr><td><code>ns|*</code></td><td>all elements in the namespace <i>ns</i></td><td><code>dc|*</code> finds <code><dc:p></code> and <code><dc:img></code>elements</td></tr> 032 * <tr><td><code>#id</code></td><td>elements with attribute ID of "id"</td><td><code>div#wrap</code>, <code>#logo</code></td></tr> 033 * <tr><td><code>.class</code></td><td>elements with a class name of "class"</td><td><code>div.left</code>, <code>.result</code></td></tr> 034 * <tr><td><code>[attr]</code></td><td>elements with an attribute named "attr" (with any value)</td><td><code>a[href]</code>, <code>[title]</code></td></tr> 035 * <tr><td><code>[^attrPrefix]</code></td><td>elements with an attribute name starting with "attrPrefix". Use to find elements with HTML5 datasets</td><td><code>[^data-]</code>, <code>div[^data-]</code></td></tr> 036 * <tr><td><code>[attr=val]</code></td><td>elements with an attribute named "attr", and value equal to "val"</td><td><code>img[width=500]</code>, <code>a[rel=nofollow]</code></td></tr> 037 * <tr><td><code>[attr="val"]</code></td><td>elements with an attribute named "attr", and value equal to "val"</td><td><code>span[hello="Cleveland"][goodbye="Columbus"]</code>, <code>a[rel="nofollow"]</code></td></tr> 038 * <tr><td><code>[attr^=valPrefix]</code></td><td>elements with an attribute named "attr", and value starting with "valPrefix"</td><td><code>a[href^=http:]</code></td></tr> 039 * <tr><td><code>[attr$=valSuffix]</code></td><td>elements with an attribute named "attr", and value ending with "valSuffix"</td><td><code>img[src$=.png]</code></td></tr> 040 * <tr><td><code>[attr*=valContaining]</code></td><td>elements with an attribute named "attr", and value containing "valContaining"</td><td><code>a[href*=/search/]</code></td></tr> 041 * <tr><td><code>[attr~=<em>regex</em>]</code></td><td>elements with an attribute named "attr", and value matching the regular expression</td><td><code>img[src~=(?i)\\.(png|jpe?g)]</code></td></tr> 042 * <tr><td><code>[*]</code></td><td>elements with any attribute</td><td><code>p[*]</code> finds <code>p</code> elements that have at least one attribute; <code>p:not([*])</code> finds those with no attributes</td></tr> 043 * <tr><td></td><td>The above may be combined in any order</td><td><code>div.header[title]</code></td></tr> 044 * <tr><td colspan="3"><h3>Combinators</h3></td></tr> 045 * <tr><td><code>E F</code></td><td>an F element descended from an E element</td><td><code>div a</code>, <code>.logo h1</code></td></tr> 046 * <tr><td><code>E {@literal >} F</code></td><td>an F direct child of E</td><td><code>ol {@literal >} li</code></td></tr> 047 * <tr><td><code>E + F</code></td><td>an F element immediately preceded by sibling E</td><td><code>li + li</code>, <code>div.head + div</code></td></tr> 048 * <tr><td><code>E ~ F</code></td><td>an F element preceded by sibling E</td><td><code>h1 ~ p</code></td></tr> 049 * <tr><td><code>E, F, G</code></td><td>all matching elements E, F, or G</td><td><code>a[href], div, h3</code></td></tr> 050 * <tr><td colspan="3"><h3>Pseudo selectors</h3></td></tr> 051 * <tr><td><code>:lt(<em>n</em>)</code></td><td>elements whose sibling index is less than <em>n</em></td><td><code>td:lt(3)</code> finds the first 3 cells of each row</td></tr> 052 * <tr><td><code>:gt(<em>n</em>)</code></td><td>elements whose sibling index is greater than <em>n</em></td><td><code>td:gt(1)</code> finds cells after skipping the first two</td></tr> 053 * <tr><td><code>:eq(<em>n</em>)</code></td><td>elements whose sibling index is equal to <em>n</em></td><td><code>td:eq(0)</code> finds the first cell of each row</td></tr> 054 * <tr><td><code>:has(<em>selector</em>)</code></td><td>elements that contains at least one element matching the <em>selector</em></td><td><code>div:has(p)</code> finds <code>div</code>s that contain <code>p</code> elements.<br><code>div:has(> a)</code> selects <code>div</code> elements that have at least one direct child <code>a</code> element.<br><code>section:has(h1, h2)</code> finds <code>section</code> elements that contain a <code>h1</code> or a <code>h2</code> element</td></tr> 055 * <tr><td><code>:is(<em>selector list</em>)</code></td><td>elements that match any of the selectors in the selector list</td><td><code>:is(h1, h2, h3, h4, h5, h6)</code> finds any heading element.<br><code>:is(section, article) > :is(h1, h2)</code> finds a <code>h1</code> or <code>h2</code> that is a direct child of a <code>section</code> or an <code>article</code></td></tr> 056 * <tr><td><code>:not(<em>selector</em>)</code></td><td>elements that do not match the <em>selector</em>. See also {@link Elements#not(String)}</td><td><code>div:not(.logo)</code> finds all divs that do not have the "logo" class.<p><code>div:not(:has(div))</code> finds divs that do not contain divs.</p></td></tr> 057 * <tr><td><code>:contains(<em>text</em>)</code></td><td>elements that contains the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants. The text is whitespace normalized. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:contains(jsoup)</code> finds p elements containing the text "jsoup".<p>{@code p:contains(hello \(there\) finds p elements containing the text "Hello (There)"}</p></td></tr> 058 * <tr><td><code>:containsOwn(<em>text</em>)</code></td><td>elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants.</td><td><code>p:containsOwn(jsoup)</code> finds p elements with own text "jsoup".</td></tr> 059 * <tr><td><code>:containsData(<em>data</em>)</code></td><td>elements that contains the specified <em>data</em>. The contents of {@code script} and {@code style} elements, and {@code comment} nodes (etc) are considered data nodes, not text nodes. The search is case insensitive. The data may appear in the found element, or any of its descendants.</td><td><code>script:contains(jsoup)</code> finds script elements containing the data "jsoup".</td></tr> 060 * <tr><td><code>:containsWholeText(<em>text</em>)</code></td><td>elements that contains the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, or any of its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeText(jsoup\nThe Java HTML Parser)</code> finds p elements containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would. Note that {@code br} elements are presented as a newline.</p></td></tr> 061 * <tr><td><code>:containsWholeOwnText(<em>text</em>)</code></td><td>elements that <b>directly</b> contain the specified <b>non-normalized</b> text. The search is case sensitive, and will match exactly against spaces and newlines found in the original input. The text may appear in the found element, but not in its descendants. <p>To find content that includes parentheses, escape those with a {@code \}.</p></td><td><code>p:containsWholeOwnText(jsoup\nThe Java HTML Parser)</code> finds p elements directly containing the text <code>"jsoup\nThe Java HTML Parser"</code> (and not other variations of whitespace or casing, as <code>:contains()</code> would. Note that {@code br} elements are presented as a newline.</p></td></tr> 062 * <tr><td><code>:matches(<em>regex</em>)</code></td><td>elements containing <b>whitespace normalized</b> text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matches(\\d+)</code> finds table cells containing digits. <code>div:matches((?i)login)</code> finds divs containing the text, case insensitively.</td></tr> 063 * <tr><td><code>:matchesWholeText(<em>regex</em>)</code></td><td>elements containing <b>non-normalized</b> whole text that matches the specified regular expression. The text may appear in the found element, or any of its descendants.</td><td><code>td:matchesWholeText(\\s{2,})</code> finds table cells a run of at least two space characters.</td></tr> 064 * <tr><td><code>:matchesWholeOwnText(<em>regex</em>)</code></td><td>elements whose own <b>non-normalized</b> whole text matches the specified regular expression. The text must appear in the found element, not any of its descendants.</td><td><code>td:matchesWholeOwnText(\n\\d+)</code> finds table cells directly containing digits following a neewline.</td></tr> 065 * <tr><td></td><td>The above may be combined in any order and with other selectors</td><td><code>.light:contains(name):eq(0)</code></td></tr> 066 * <tr><td><code>:matchText</code></td><td>treats text nodes as elements, and so allows you to match against and select text nodes.<p><b>Note</b> that using this selector will modify the DOM, so you may want to {@code clone} your document before using.</td><td>{@code p:matchText:firstChild} with input {@code <p>One<br />Two</p>} will return one {@link org.jsoup.nodes.PseudoTextElement} with text "{@code One}".</td></tr> 067 * <tr><td colspan="3"><h3>Structural pseudo selectors</h3></td></tr> 068 * <tr><td><code>:root</code></td><td>The element that is the root of the document. In HTML, this is the <code>html</code> element</td><td><code>:root</code></td></tr> 069 * <tr><td><code>:nth-child(<em>a</em>n+<em>b</em>)</code></td><td><p>elements that have <code><em>a</em>n+<em>b</em>-1</code> siblings <b>before</b> it in the document tree, for any positive integer or zero value of <code>n</code>, and has a parent element. For values of <code>a</code> and <code>b</code> greater than zero, this effectively divides the element's children into groups of a elements (the last group taking the remainder), and selecting the <em>b</em>th element of each group. For example, this allows the selectors to address every other row in a table, and could be used to alternate the color of paragraph text in a cycle of four. The <code>a</code> and <code>b</code> values must be integers (positive, negative, or zero). The index of the first child of an element is 1.</p> 070 * Additionally, <code>:nth-child()</code> supports <code>odd</code> and <code>even</code> as arguments. <code>odd</code> is the same as <code>2n+1</code>, and <code>even</code> is the same as <code>2n</code>.</td><td><code>tr:nth-child(2n+1)</code> finds every odd row of a table. <code>:nth-child(10n-1)</code> the 9th, 19th, 29th, etc, element. <code>li:nth-child(5)</code> the 5h li</td></tr> 071 * <tr><td><code>:nth-last-child(<em>a</em>n+<em>b</em>)</code></td><td>elements that have <code><em>a</em>n+<em>b</em>-1</code> siblings <b>after</b> it in the document tree. Otherwise like <code>:nth-child()</code></td><td><code>tr:nth-last-child(-n+2)</code> the last two rows of a table</td></tr> 072 * <tr><td><code>:nth-of-type(<em>a</em>n+<em>b</em>)</code></td><td>pseudo-class notation represents an element that has <code><em>a</em>n+<em>b</em>-1</code> siblings with the same expanded element name <em>before</em> it in the document tree, for any zero or positive integer value of n, and has a parent element</td><td><code>img:nth-of-type(2n+1)</code></td></tr> 073 * <tr><td><code>:nth-last-of-type(<em>a</em>n+<em>b</em>)</code></td><td>pseudo-class notation represents an element that has <code><em>a</em>n+<em>b</em>-1</code> siblings with the same expanded element name <em>after</em> it in the document tree, for any zero or positive integer value of n, and has a parent element</td><td><code>img:nth-last-of-type(2n+1)</code></td></tr> 074 * <tr><td><code>:first-child</code></td><td>elements that are the first child of some other element.</td><td><code>div {@literal >} p:first-child</code></td></tr> 075 * <tr><td><code>:last-child</code></td><td>elements that are the last child of some other element.</td><td><code>ol {@literal >} li:last-child</code></td></tr> 076 * <tr><td><code>:first-of-type</code></td><td>elements that are the first sibling of its type in the list of children of its parent element</td><td><code>dl dt:first-of-type</code></td></tr> 077 * <tr><td><code>:last-of-type</code></td><td>elements that are the last sibling of its type in the list of children of its parent element</td><td><code>tr {@literal >} td:last-of-type</code></td></tr> 078 * <tr><td><code>:only-child</code></td><td>elements that have a parent element and whose parent element have no other element children</td><td></td></tr> 079 * <tr><td><code>:only-of-type</code></td><td> an element that has a parent element and whose parent element has no other element children with the same expanded element name</td><td></td></tr> 080 * <tr><td><code>:empty</code></td><td>elements that contain no child elements or nodes, with the exception of blank text nodes, comments, XML declarations, and doctype declarations. In other words, it matches elements that are effectively empty of meaningful content.</td><td><code>li:not(:empty)</code></td></tr> 081 * </table> 082 * 083 * <p>A word on using regular expressions in these selectors: depending on the content of the regex, you will need to quote the pattern using <b><code>Pattern.quote("regex")</code></b> for it to parse correctly through both the selector parser and the regex parser. E.g. <code>String query = "div:matches(" + Pattern.quote(regex) + ");"</code>.</p> 084 * <p><b>Escaping special characters:</b> to match a tag, ID, or other selector that does not follow the regular CSS syntax, the query must be escaped with the <code>\</code> character. For example, to match by ID {@code <p id="i.d">}, use {@code document.select("#i\\.d")}.</p> 085 * 086 * @see Element#select(String css) 087 * @see Elements#select(String css) 088 * @see Element#selectXpath(String xpath) 089 */ 090public class Selector { 091 // not instantiable 092 private Selector() {} 093 094 /** 095 Find Elements matching the CSS query. 096 097 @param query CSS selector 098 @param root root element to descend into 099 @return matching elements, empty if none 100 @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. 101 */ 102 public static Elements select(String query, Element root) { 103 Validate.notEmpty(query); 104 return select(QueryParser.parse(query), root); 105 } 106 107 /** 108 Find Elements matching the Evaluator. 109 110 @param evaluator CSS Evaluator 111 @param root root (context) element to start from 112 @return matching elements, empty if none 113 */ 114 public static Elements select(Evaluator evaluator, Element root) { 115 Validate.notNull(evaluator); 116 Validate.notNull(root); 117 return Collector.collect(evaluator, root); 118 } 119 120 /** 121 Finds a Stream of elements matching the CSS query. 122 123 @param query CSS selector 124 @param root root element to descend into 125 @return a Stream of matching elements, empty if none 126 @throws Selector.SelectorParseException (unchecked) on an invalid CSS query. 127 @since 1.19.1 128 */ 129 public static Stream<Element> selectStream(String query, Element root) { 130 Validate.notEmpty(query); 131 return selectStream(QueryParser.parse(query), root); 132 } 133 134 /** 135 Finds a Stream of elements matching the evaluator. 136 137 @param evaluator CSS selector 138 @param root root element to descend into 139 @return matching elements, empty if none 140 @since 1.19.1 141 */ 142 public static Stream<Element> selectStream(Evaluator evaluator, Element root) { 143 Validate.notNull(evaluator); 144 Validate.notNull(root); 145 return Collector.stream(evaluator, root); 146 } 147 148 /** 149 Find elements matching the query, across multiple roots. Elements will be deduplicated (in the case of 150 overlapping hierarchies). 151 152 @param query CSS selector 153 @param roots root elements to descend into 154 @return matching elements, empty if none 155 */ 156 public static Elements select(String query, Iterable<Element> roots) { 157 Validate.notEmpty(query); 158 Validate.notNull(roots); 159 Evaluator evaluator = QueryParser.parse(query); 160 Elements elements = new Elements(); 161 HashSet<Element> seenElements = new HashSet<>(); // dedupe elements by identity, as .equals is == 162 163 for (Element root : roots) { 164 selectStream(evaluator, root) 165 .filter(seenElements::add) 166 .forEach(elements::add); 167 } 168 169 return elements; 170 } 171 172 // exclude set. package open so that Elements can implement .not() selector. 173 static Elements filterOut(Collection<Element> elements, Collection<Element> outs) { 174 Elements output = new Elements(); 175 for (Element el : elements) { 176 boolean found = false; 177 for (Element out : outs) { 178 if (el.equals(out)) { 179 found = true; 180 break; 181 } 182 } 183 if (!found) 184 output.add(el); 185 } 186 return output; 187 } 188 189 /** 190 Find the first Element that matches the query. 191 192 @param cssQuery CSS selector 193 @param root root element to descend into 194 @return the matching element, or <b>null</b> if none. 195 */ 196 public static @Nullable Element selectFirst(String cssQuery, Element root) { 197 Validate.notEmpty(cssQuery); 198 return Collector.findFirst(QueryParser.parse(cssQuery), root); 199 } 200 201 /** 202 Find the first element matching the query, across multiple roots. 203 204 @param cssQuery CSS selector 205 @param roots root elements to descend into 206 @return the first matching element, or {@code null} if none 207 @since 1.19.1 208 */ 209 public static @Nullable Element selectFirst(String cssQuery, Iterable<Element> roots) { 210 Validate.notEmpty(cssQuery); 211 Validate.notNull(roots); 212 Evaluator evaluator = QueryParser.parse(cssQuery); 213 214 for (Element root : roots) { 215 Element first = Collector.findFirst(evaluator, root); 216 if (first != null) return first; 217 } 218 219 return null; 220 } 221 222 /** 223 Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be 224 valid in a selector. 225 226 @see <a href="https://www.w3.org/TR/cssom-1/#serialize-an-identifier">CSS Object Model, serialize an identifier</a> 227 @since 1.20.1 228 */ 229 public static String escapeCssIdentifier(String in) { 230 return TokenQueue.escapeCssIdentifier(in); 231 } 232 233 /** 234 Consume a CSS identifier (ID or class) off the queue. 235 <p>Note: For backwards compatibility this method supports improperly formatted CSS identifiers, e.g. {@code 1} instead 236 of {@code \31}.</p> 237 238 @return The unescaped identifier. 239 @throws IllegalArgumentException if an invalid escape sequence was found. 240 @see <a href="https://www.w3.org/TR/css-syntax-3/#consume-name">CSS Syntax Module Level 3, Consume an ident sequence</a> 241 @see <a href="https://www.w3.org/TR/css-syntax-3/#typedef-ident-token">CSS Syntax Module Level 3, ident-token</a> 242 @since 1.20.1 243 */ 244 public static String unescapeCssIdentifier(String in) { 245 TokenQueue tq = new TokenQueue(in); 246 return tq.consumeCssIdentifier(); 247 } 248 249 public static class SelectorParseException extends IllegalStateException { 250 public SelectorParseException(String msg) { 251 super(msg); 252 } 253 254 public SelectorParseException(String msg, Object... msgArgs) { 255 super(String.format(msg, msgArgs)); 256 } 257 258 public SelectorParseException(Throwable cause, String msg, Object... msgArgs) { 259 super(String.format(msg, msgArgs), cause); 260 } 261 } 262}