001package org.jsoup.parser; 002 003import org.jsoup.Connection; 004import org.jsoup.helper.Validate; 005import org.jsoup.nodes.Document; 006import org.jsoup.nodes.Element; 007import org.jsoup.nodes.Node; 008import org.jsoup.select.Evaluator; 009import org.jsoup.select.NodeVisitor; 010import org.jsoup.select.QueryParser; 011import org.jspecify.annotations.Nullable; 012 013import java.io.Closeable; 014import java.io.IOException; 015import java.io.Reader; 016import java.io.StringReader; 017import java.io.UncheckedIOException; 018import java.util.Iterator; 019import java.util.LinkedList; 020import java.util.List; 021import java.util.NoSuchElementException; 022import java.util.Queue; 023import java.util.Spliterator; 024import java.util.Spliterators; 025import java.util.stream.Stream; 026import java.util.stream.StreamSupport; 027 028/** 029 A StreamParser provides a progressive parse of its input. As each Element is completed, it is emitted via a Stream or 030 Iterator interface. Elements returned will be complete with all their children, and an (empty) next sibling, if 031 applicable. 032 <p>To conserve memory, you can {@link Node#remove() remove()} Elements (or their children) from the DOM during the 033 parse. This provides a mechanism to parse an input document that would otherwise be too large to fit into memory, yet 034 still providing a DOM interface to the document and its elements.</p> 035 <p> 036 Additionally, the parser provides a {@link #selectFirst(String query)} / {@link #selectNext(String query)}, which will 037 run the parser until a hit is found, at which point the parse is suspended. It can be resumed via another 038 {@code select()} call, or via the {@link #stream()} or {@link #iterator()} methods. 039 </p> 040 <p>Once the input has been fully read, the input Reader will be closed. Or, if the whole document does not need to be 041 read, call {@link #stop()} and {@link #close()}.</p> 042 <p>The {@link #document()} method will return the Document being parsed into, which will be only partially complete 043 until the input is fully consumed.</p> 044 <p>A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs. 045 New parsers should be used in each thread.</p> 046 <p>If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the iterator and 047 stream consumers will throw an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.</p> 048 <p>For examples, see the jsoup 049 <a href="https://jsoup.org/cookbook/input/streamparser-dom-sax">StreamParser cookbook.</a></p> 050 @since 1.18.1 051 */ 052public class StreamParser implements Closeable { 053 final private Parser parser; 054 final private TreeBuilder treeBuilder; 055 final private ElementIterator it = new ElementIterator(); 056 @Nullable private Document document; 057 private boolean stopped = false; 058 059 /** 060 Construct a new StreamParser, using the supplied base Parser. 061 @param parser the configured base parser 062 */ 063 public StreamParser(Parser parser) { 064 this.parser = parser; 065 treeBuilder = parser.getTreeBuilder(); 066 treeBuilder.nodeListener(it); 067 } 068 069 /** 070 Provide the input for a Document parse. The input is not read until a consuming operation is called. 071 @param input the input to be read. 072 @param baseUri the URL of this input, for absolute link resolution 073 @return this parser, for chaining 074 */ 075 public StreamParser parse(Reader input, String baseUri) { 076 close(); // probably a no-op, but ensures any previous reader is closed 077 it.reset(); 078 treeBuilder.initialiseParse(input, baseUri, parser); // reader is not read, so no chance of IO error 079 document = treeBuilder.doc; 080 return this; 081 } 082 083 /** 084 Provide the input for a Document parse. The input is not read until a consuming operation is called. 085 @param input the input to be read 086 @param baseUri the URL of this input, for absolute link resolution 087 @return this parser 088 */ 089 public StreamParser parse(String input, String baseUri) { 090 return parse(new StringReader(input), baseUri); 091 } 092 093 /** 094 Provide the input for a fragment parse. The input is not read until a consuming operation is called. 095 @param input the input to be read 096 @param context the optional fragment context element 097 @param baseUri the URL of this input, for absolute link resolution 098 @return this parser 099 @see #completeFragment() 100 */ 101 public StreamParser parseFragment(Reader input, @Nullable Element context, String baseUri) { 102 parse(input, baseUri); 103 treeBuilder.initialiseParseFragment(context); 104 return this; 105 } 106 107 /** 108 Provide the input for a fragment parse. The input is not read until a consuming operation is called. 109 @param input the input to be read 110 @param context the optional fragment context element 111 @param baseUri the URL of this input, for absolute link resolution 112 @return this parser 113 @see #completeFragment() 114 */ 115 public StreamParser parseFragment(String input, @Nullable Element context, String baseUri) { 116 return parseFragment(new StringReader(input), context, baseUri); 117 } 118 119 /** 120 Creates a {@link Stream} of {@link Element}s, with the input being parsed as each element is consumed. Each 121 Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that 122 (empty) sibling will exist at {@link Element#nextElementSibling()}). The stream will be emitted in document order as 123 each element is closed. That means that child elements will be returned prior to their parents. 124 <p>The stream will start from the current position of the backing iterator and the parse.</p> 125 <p>When consuming the stream, if the Reader that the Parser is reading throws an I/O exception (for example a 126 SocketTimeoutException), that will be emitted as an {@link UncheckedIOException}</p> 127 @return a stream of Element objects 128 @throws UncheckedIOException if the underlying Reader excepts during a read (in stream consuming methods) 129 */ 130 public Stream<Element> stream() { 131 return StreamSupport.stream( 132 Spliterators.spliteratorUnknownSize( 133 it, Spliterator.DISTINCT | Spliterator.NONNULL | Spliterator.ORDERED), 134 false); 135 } 136 137 /** 138 Returns an {@link Iterator} of {@link Element}s, with the input being parsed as each element is consumed. Each 139 Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that 140 (empty) sibling will exist at {@link Element#nextElementSibling()}). The elements will be emitted in document order as 141 each element is closed. That means that child elements will be returned prior to their parents. 142 <p>The iterator will start from the current position of the parse.</p> 143 <p>The iterator is backed by this StreamParser, and the resources it holds.</p> 144 @return a stream of Element objects 145 */ 146 public Iterator<Element> iterator() { 147 //noinspection ReturnOfInnerClass 148 return it; 149 } 150 151 /** 152 Flags that the parse should be stopped; the backing iterator will not return any more Elements. 153 @return this parser 154 */ 155 public StreamParser stop() { 156 stopped = true; 157 return this; 158 } 159 160 /** 161 Closes the input and releases resources including the underlying parser and reader. 162 <p>The parser will also be closed when the input is fully read.</p> 163 <p>The parser can be reused with another call to {@link #parse(Reader, String)}.</p> 164 */ 165 @Override public void close() { 166 treeBuilder.completeParse(); // closes the reader, frees resources 167 } 168 169 /** 170 Get the current {@link Document} as it is being parsed. It will be only partially complete until the input is fully 171 read. Structural changes (e.g. insert, remove) may be made to the Document contents. 172 @return the (partial) Document 173 */ 174 public Document document() { 175 document = treeBuilder.doc; 176 Validate.notNull(document, "Must run parse() before calling."); 177 return document; 178 } 179 180 /** 181 Runs the parser until the input is fully read, and returns the completed Document. 182 @return the completed Document 183 @throws IOException if an I/O error occurs 184 */ 185 public Document complete() throws IOException { 186 Document doc = document(); 187 treeBuilder.runParser(); 188 return doc; 189 } 190 191 /** 192 When initialized as a fragment parse, runs the parser until the input is fully read, and returns the completed 193 fragment child nodes. 194 @return the completed child nodes 195 @throws IOException if an I/O error occurs 196 @see #parseFragment(Reader, Element, String) 197 */ 198 public List<Node> completeFragment() throws IOException { 199 treeBuilder.runParser(); 200 return treeBuilder.completeParseFragment(); 201 } 202 203 /** 204 Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the 205 input will be parsed until the first match is found, or the input is completely read. 206 @param query the {@link org.jsoup.select.Selector} query. 207 @return the first matching {@link Element}, or {@code null} if there's no match 208 @throws IOException if an I/O error occurs 209 @see #selectFirst(Evaluator) 210 */ 211 public @Nullable Element selectFirst(String query) throws IOException { 212 return selectFirst(QueryParser.parse(query)); 213 } 214 215 /** 216 Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This 217 is useful if you want to simply abort processing on a failed match. 218 @param query the {@link org.jsoup.select.Selector} query. 219 @return the first matching element 220 @throws IllegalArgumentException if no match is found 221 @throws IOException if an I/O error occurs 222 */ 223 public Element expectFirst(String query) throws IOException { 224 return (Element) Validate.ensureNotNull( 225 selectFirst(query), 226 "No elements matched the query '%s' in the document." 227 , query 228 ); 229 } 230 231 /** 232 Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the 233 input will be parsed until the first match is found, or the input is completely read. 234 <p>By providing a compiled evaluator vs a CSS selector, this method may be more efficient when executing the same 235 query against multiple documents.</p> 236 @param eval the {@link org.jsoup.select.Selector} evaluator. 237 @return the first matching {@link Element}, or {@code null} if there's no match 238 @throws IOException if an I/O error occurs 239 @see QueryParser#parse(String) 240 */ 241 public @Nullable Element selectFirst(Evaluator eval) throws IOException { 242 final Document doc = document(); 243 244 // run the query on the existing (partial) doc first, as there may be a hit already parsed 245 Element first = doc.selectFirst(eval); 246 if (first != null) return first; 247 248 return selectNext(eval); 249 } 250 251 /** 252 Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or 253 the input is completely read. 254 @param query the {@link org.jsoup.select.Selector} query. 255 @return the next matching {@link Element}, or {@code null} if there's no match 256 @throws IOException if an I/O error occurs 257 @see #selectNext(Evaluator) 258 */ 259 public @Nullable Element selectNext(String query) throws IOException { 260 return selectNext(QueryParser.parse(query)); 261 } 262 263 /** 264 Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This 265 is useful if you want to simply abort processing on a failed match. 266 @param query the {@link org.jsoup.select.Selector} query. 267 @return the first matching element 268 @throws IllegalArgumentException if no match is found 269 @throws IOException if an I/O error occurs 270 */ 271 public Element expectNext(String query) throws IOException { 272 return (Element) Validate.ensureNotNull( 273 selectNext(query), 274 "No elements matched the query '%s' in the document." 275 , query 276 ); 277 } 278 279 /** 280 Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or 281 the input is completely read. 282 <p>By providing a compiled evaluator vs a CSS selector, this method may be more efficient when executing the same 283 query against multiple documents.</p> 284 @param eval the {@link org.jsoup.select.Selector} evaluator. 285 @return the next matching {@link Element}, or {@code null} if there's no match 286 @throws IOException if an I/O error occurs 287 @see QueryParser#parse(String) 288 */ 289 public @Nullable Element selectNext(Evaluator eval) throws IOException { 290 try { 291 final Document doc = document(); // validates the parse was initialized, keeps stack trace out of stream 292 return stream() 293 .filter(eval.asPredicate(doc)) 294 .findFirst() 295 .orElse(null); 296 } catch (UncheckedIOException e) { 297 // Reader threw an IO exception emitted via Iterator's next() 298 throw e.getCause(); 299 } 300 } 301 302 final class ElementIterator implements Iterator<Element>, NodeVisitor { 303 // listeners add to a next emit queue, as a single token read step may yield multiple elements 304 final private Queue<Element> emitQueue = new LinkedList<>(); 305 private @Nullable Element current; // most recently emitted 306 private @Nullable Element next; // element waiting to be picked up 307 private @Nullable Element tail; // The last tailed element (</html>), on hold for final pop 308 309 void reset() { 310 emitQueue.clear(); 311 current = next = tail = null; 312 stopped = false; 313 } 314 315 // Iterator Interface: 316 /** 317 {@inheritDoc} 318 @throws UncheckedIOException if the underlying Reader errors during a read 319 */ 320 @Override public boolean hasNext() { 321 maybeFindNext(); 322 return next != null; 323 } 324 325 /** 326 {@inheritDoc} 327 @throws UncheckedIOException if the underlying Reader errors during a read 328 */ 329 @Override public Element next() { 330 maybeFindNext(); 331 if (next == null) throw new NoSuchElementException(); 332 current = next; 333 next = null; 334 return current; 335 } 336 337 private void maybeFindNext() { 338 if (stopped || next != null) return; 339 340 // drain the current queue before stepping to get more 341 if (!emitQueue.isEmpty()) { 342 next = emitQueue.remove(); 343 return; 344 } 345 346 // step the parser, which will hit the node listeners to add to the queue: 347 while (treeBuilder.stepParser()) { 348 if (!emitQueue.isEmpty()) { 349 next = emitQueue.remove(); 350 return; 351 } 352 } 353 stop(); 354 close(); 355 356 // send the final element out: 357 if (tail != null) { 358 next = tail; 359 tail = null; 360 } 361 } 362 363 @Override public void remove() { 364 if (current == null) throw new NoSuchElementException(); 365 current.remove(); 366 } 367 368 // NodeVisitor Interface: 369 @Override public void head(Node node, int depth) { 370 if (node instanceof Element) { 371 Element prev = ((Element) node).previousElementSibling(); 372 // We prefer to wait until an element has a next sibling before emitting it; otherwise, get it in tail 373 if (prev != null) emitQueue.add(prev); 374 } 375 } 376 377 @Override public void tail(Node node, int depth) { 378 if (node instanceof Element) { 379 tail = (Element) node; // kept for final hit 380 Element lastChild = tail.lastElementChild(); // won't get a nextsib, so emit that: 381 if (lastChild != null) emitQueue.add(lastChild); 382 } 383 } 384 } 385}