001package org.jsoup.parser; 002 003import org.jsoup.Connection; 004import org.jsoup.helper.Validate; 005import org.jsoup.nodes.Document; 006import org.jsoup.nodes.Element; 007import org.jsoup.nodes.Node; 008import org.jsoup.select.Evaluator; 009import org.jsoup.select.NodeVisitor; 010import org.jsoup.select.QueryParser; 011import org.jspecify.annotations.Nullable; 012 013import java.io.Closeable; 014import java.io.IOException; 015import java.io.Reader; 016import java.io.StringReader; 017import java.io.UncheckedIOException; 018import java.util.Iterator; 019import java.util.LinkedList; 020import java.util.List; 021import java.util.NoSuchElementException; 022import java.util.Queue; 023import java.util.Spliterator; 024import java.util.Spliterators; 025import java.util.stream.Stream; 026import java.util.stream.StreamSupport; 027 028/** 029 A StreamParser provides a progressive parse of its input. As each Element is completed, it is emitted via a Stream or 030 Iterator interface. Elements returned will be complete with all their children, and an (empty) next sibling, if 031 applicable. 032 <p>Elements (or their children) may be removed from the DOM during the parse, for e.g. to conserve memory, providing a 033 mechanism to parse an input document that would otherwise be too large to fit into memory, yet still providing a DOM 034 interface to the document and its elements.</p> 035 <p> 036 Additionally, the parser provides a {@link #selectFirst(String query)} / {@link #selectNext(String query)}, which will 037 run the parser until a hit is found, at which point the parse is suspended. It can be resumed via another 038 {@code select()} call, or via the {@link #stream()} or {@link #iterator()} methods. 039 </p> 040 <p>Once the input has been fully read, the input Reader will be closed. Or, if the whole document does not need to be 041 read, call {@link #stop()} and {@link #close()}.</p> 042 <p>The {@link #document()} method will return the Document being parsed into, which will be only partially complete 043 until the input is fully consumed.</p> 044 <p>A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs. 045 New parsers should be used in each thread.</p> 046 <p>If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the iterator and 047 stream consumers will throw an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.</p> 048 <p>The StreamParser interface is currently in <b>beta</b> and may change in subsequent releases. Feedback on the 049 feature and how you're using it is very welcome via the <a href="https://jsoup.org/discussion">jsoup 050 discussions</a>.</p> 051 @since 1.18.1 052 */ 053public class StreamParser implements Closeable { 054 final private Parser parser; 055 final private TreeBuilder treeBuilder; 056 final private ElementIterator it = new ElementIterator(); 057 @Nullable private Document document; 058 private boolean stopped = false; 059 060 /** 061 Construct a new StreamParser, using the supplied base Parser. 062 @param parser the configured base parser 063 */ 064 public StreamParser(Parser parser) { 065 this.parser = parser; 066 treeBuilder = parser.getTreeBuilder(); 067 treeBuilder.nodeListener(it); 068 } 069 070 /** 071 Provide the input for a Document parse. The input is not read until a consuming operation is called. 072 @param input the input to be read. 073 @param baseUri the URL of this input, for absolute link resolution 074 @return this parser, for chaining 075 */ 076 public StreamParser parse(Reader input, String baseUri) { 077 close(); // probably a no-op, but ensures any previous reader is closed 078 it.reset(); 079 treeBuilder.initialiseParse(input, baseUri, parser); // reader is not read, so no chance of IO error 080 document = treeBuilder.doc; 081 return this; 082 } 083 084 /** 085 Provide the input for a Document parse. The input is not read until a consuming operation is called. 086 @param input the input to be read 087 @param baseUri the URL of this input, for absolute link resolution 088 @return this parser 089 */ 090 public StreamParser parse(String input, String baseUri) { 091 return parse(new StringReader(input), baseUri); 092 } 093 094 /** 095 Provide the input for a fragment parse. The input is not read until a consuming operation is called. 096 @param input the input to be read 097 @param context the optional fragment context element 098 @param baseUri the URL of this input, for absolute link resolution 099 @return this parser 100 @see #completeFragment() 101 */ 102 public StreamParser parseFragment(Reader input, @Nullable Element context, String baseUri) { 103 parse(input, baseUri); 104 treeBuilder.initialiseParseFragment(context); 105 return this; 106 } 107 108 /** 109 Provide the input for a fragment parse. The input is not read until a consuming operation is called. 110 @param input the input to be read 111 @param context the optional fragment context element 112 @param baseUri the URL of this input, for absolute link resolution 113 @return this parser 114 @see #completeFragment() 115 */ 116 public StreamParser parseFragment(String input, @Nullable Element context, String baseUri) { 117 return parseFragment(new StringReader(input), context, baseUri); 118 } 119 120 /** 121 Creates a {@link Stream} of {@link Element}s, with the input being parsed as each element is consumed. Each 122 Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that 123 (empty) sibling will exist at {@link Element#nextElementSibling()}). The stream will be emitted in document order as 124 each element is closed. That means that child elements will be returned prior to their parents. 125 <p>The stream will start from the current position of the backing iterator and the parse.</p> 126 <p>When consuming the stream, if the Reader that the Parser is reading throws an I/O exception (for example a 127 SocketTimeoutException), that will be emitted as an {@link UncheckedIOException}</p> 128 @return a stream of Element objects 129 @throws UncheckedIOException if the underlying Reader excepts during a read (in stream consuming methods) 130 */ 131 public Stream<Element> stream() { 132 return StreamSupport.stream( 133 Spliterators.spliteratorUnknownSize( 134 it, Spliterator.DISTINCT | Spliterator.NONNULL | Spliterator.ORDERED), 135 false); 136 } 137 138 /** 139 Returns an {@link Iterator} of {@link Element}s, with the input being parsed as each element is consumed. Each 140 Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that 141 (empty) sibling will exist at {@link Element#nextElementSibling()}). The elements will be emitted in document order as 142 each element is closed. That means that child elements will be returned prior to their parents. 143 <p>The iterator will start from the current position of the parse.</p> 144 <p>The iterator is backed by this StreamParser, and the resources it holds.</p> 145 @return a stream of Element objects 146 */ 147 public Iterator<Element> iterator() { 148 //noinspection ReturnOfInnerClass 149 return it; 150 } 151 152 /** 153 Flags that the parse should be stopped; the backing iterator will not return any more Elements. 154 @return this parser 155 */ 156 public StreamParser stop() { 157 stopped = true; 158 return this; 159 } 160 161 /** 162 Closes the input and releases resources including the underlying parser and reader. 163 <p>The parser will also be closed when the input is fully read.</p> 164 <p>The parser can be reused with another call to {@link #parse(Reader, String)}.</p> 165 */ 166 @Override public void close() { 167 treeBuilder.completeParse(); // closes the reader, frees resources 168 } 169 170 /** 171 Get the current {@link Document} as it is being parsed. It will be only partially complete until the input is fully 172 read. Structural changes (e.g. insert, remove) may be made to the Document contents. 173 @return the (partial) Document 174 */ 175 public Document document() { 176 document = treeBuilder.doc; 177 Validate.notNull(document, "Must run parse() before calling."); 178 return document; 179 } 180 181 /** 182 Runs the parser until the input is fully read, and returns the completed Document. 183 @return the completed Document 184 @throws IOException if an I/O error occurs 185 */ 186 public Document complete() throws IOException { 187 Document doc = document(); 188 treeBuilder.runParser(); 189 return doc; 190 } 191 192 /** 193 When initialized as a fragment parse, runs the parser until the input is fully read, and returns the completed 194 fragment child nodes. 195 @return the completed child nodes 196 @throws IOException if an I/O error occurs 197 @see #parseFragment(Reader, Element, String) 198 */ 199 public List<Node> completeFragment() throws IOException { 200 treeBuilder.runParser(); 201 return treeBuilder.completeParseFragment(); 202 } 203 204 /** 205 Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the 206 input will be parsed until the first match is found, or the input is completely read. 207 @param query the {@link org.jsoup.select.Selector} query. 208 @return the first matching {@link Element}, or {@code null} if there's no match 209 @throws IOException if an I/O error occurs 210 */ 211 public @Nullable Element selectFirst(String query) throws IOException { 212 return selectFirst(QueryParser.parse(query)); 213 } 214 215 /** 216 Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This 217 is useful if you want to simply abort processing on a failed match. 218 @param query the {@link org.jsoup.select.Selector} query. 219 @return the first matching element 220 @throws IllegalArgumentException if no match is found 221 @throws IOException if an I/O error occurs 222 */ 223 public Element expectFirst(String query) throws IOException { 224 return (Element) Validate.ensureNotNull( 225 selectFirst(query), 226 "No elements matched the query '%s' in the document." 227 , query 228 ); 229 } 230 231 /** 232 Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the 233 input will be parsed until the first match is found, or the input is completely read. 234 @param eval the {@link org.jsoup.select.Selector} evaluator. 235 @return the first matching {@link Element}, or {@code null} if there's no match 236 @throws IOException if an I/O error occurs 237 */ 238 public @Nullable Element selectFirst(Evaluator eval) throws IOException { 239 final Document doc = document(); 240 241 // run the query on the existing (partial) doc first, as there may be a hit already parsed 242 Element first = doc.selectFirst(eval); 243 if (first != null) return first; 244 245 return selectNext(eval); 246 } 247 248 /** 249 Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or 250 the input is completely read. 251 @param query the {@link org.jsoup.select.Selector} query. 252 @return the next matching {@link Element}, or {@code null} if there's no match 253 @throws IOException if an I/O error occurs 254 */ 255 public @Nullable Element selectNext(String query) throws IOException { 256 return selectNext(QueryParser.parse(query)); 257 } 258 259 /** 260 Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This 261 is useful if you want to simply abort processing on a failed match. 262 @param query the {@link org.jsoup.select.Selector} query. 263 @return the first matching element 264 @throws IllegalArgumentException if no match is found 265 @throws IOException if an I/O error occurs 266 */ 267 public Element expectNext(String query) throws IOException { 268 return (Element) Validate.ensureNotNull( 269 selectNext(query), 270 "No elements matched the query '%s' in the document." 271 , query 272 ); 273 } 274 275 /** 276 Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or 277 the input is completely read. 278 @param eval the {@link org.jsoup.select.Selector} evaluator. 279 @return the next matching {@link Element}, or {@code null} if there's no match 280 @throws IOException if an I/O error occurs 281 */ 282 public @Nullable Element selectNext(Evaluator eval) throws IOException { 283 try { 284 final Document doc = document(); // validates the parse was initialized, keeps stack trace out of stream 285 return stream() 286 .filter(eval.asPredicate(doc)) 287 .findFirst() 288 .orElse(null); 289 } catch (UncheckedIOException e) { 290 // Reader threw an IO exception emitted via Iterator's next() 291 throw e.getCause(); 292 } 293 } 294 295 final class ElementIterator implements Iterator<Element>, NodeVisitor { 296 // listeners add to a next emit queue, as a single token read step may yield multiple elements 297 final private Queue<Element> emitQueue = new LinkedList<>(); 298 private @Nullable Element current; // most recently emitted 299 private @Nullable Element next; // element waiting to be picked up 300 private @Nullable Element tail; // The last tailed element (</html>), on hold for final pop 301 302 void reset() { 303 emitQueue.clear(); 304 current = next = tail = null; 305 stopped = false; 306 } 307 308 // Iterator Interface: 309 /** 310 {@inheritDoc} 311 @throws UncheckedIOException if the underlying Reader errors during a read 312 */ 313 @Override public boolean hasNext() { 314 maybeFindNext(); 315 return next != null; 316 } 317 318 /** 319 {@inheritDoc} 320 @throws UncheckedIOException if the underlying Reader errors during a read 321 */ 322 @Override public Element next() { 323 maybeFindNext(); 324 if (next == null) throw new NoSuchElementException(); 325 current = next; 326 next = null; 327 return current; 328 } 329 330 private void maybeFindNext() { 331 if (stopped || next != null) return; 332 333 // drain the current queue before stepping to get more 334 if (!emitQueue.isEmpty()) { 335 next = emitQueue.remove(); 336 return; 337 } 338 339 // step the parser, which will hit the node listeners to add to the queue: 340 while (treeBuilder.stepParser()) { 341 if (!emitQueue.isEmpty()) { 342 next = emitQueue.remove(); 343 return; 344 } 345 } 346 stop(); 347 close(); 348 349 // send the final element out: 350 if (tail != null) { 351 next = tail; 352 tail = null; 353 } 354 } 355 356 @Override public void remove() { 357 if (current == null) throw new NoSuchElementException(); 358 current.remove(); 359 } 360 361 // NodeVisitor Interface: 362 @Override public void head(Node node, int depth) { 363 if (node instanceof Element) { 364 Element prev = ((Element) node).previousElementSibling(); 365 // We prefer to wait until an element has a next sibling before emitting it; otherwise, get it in tail 366 if (prev != null) emitQueue.add(prev); 367 } 368 } 369 370 @Override public void tail(Node node, int depth) { 371 if (node instanceof Element) { 372 tail = (Element) node; // kept for final hit 373 Element lastChild = tail.lastElementChild(); // won't get a nextsib, so emit that: 374 if (lastChild != null) emitQueue.add(lastChild); 375 } 376 } 377 } 378} 379 380 381