001package org.jsoup.parser;
002
003import org.jsoup.Connection;
004import org.jsoup.helper.Validate;
005import org.jsoup.nodes.Document;
006import org.jsoup.nodes.Element;
007import org.jsoup.nodes.Node;
008import org.jsoup.select.Evaluator;
009import org.jsoup.select.NodeVisitor;
010import org.jsoup.select.Selector;
011import org.jspecify.annotations.Nullable;
012
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.UncheckedIOException;
import java.util.ArrayDeque;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
027
028/**
029 A StreamParser provides a progressive parse of its input. As each Element is completed, it is emitted via a Stream or
030 Iterator interface. Elements returned will be complete with all their children, and an (empty) next sibling, if
031 applicable.
032 <p>To conserve memory, you can {@link Node#remove() remove()} Elements (or their children) from the DOM during the
033 parse. This provides a mechanism to parse an input document that would otherwise be too large to fit into memory, yet
034 still providing a DOM interface to the document and its elements.</p>
035 <p>
036 Additionally, the parser provides a {@link #selectFirst(String query)} / {@link #selectNext(String query)}, which will
037 run the parser until a hit is found, at which point the parse is suspended. It can be resumed via another
038 {@code select()} call, or via the {@link #stream()} or {@link #iterator()} methods.
039 </p>
040 <p>Once the input has been fully read, the input Reader will be closed. Or, if the whole document does not need to be
041 read, call {@link #stop()} and {@link #close()}.</p>
042 <p>The {@link #document()} method will return the Document being parsed into, which will be only partially complete
043 until the input is fully consumed.</p>
044 <p>A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs.
045 New parsers should be used in each thread.</p>
046 <p>If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the iterator and
047 stream consumers will throw an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.</p>
048 <p>For examples, see the jsoup
049 <a href="https://jsoup.org/cookbook/input/streamparser-dom-sax">StreamParser cookbook.</a></p>
050 @since 1.18.1
051 */
052public class StreamParser implements Closeable {
053    final private Parser parser;
054    final private TreeBuilder treeBuilder;
055    final private ElementIterator it = new ElementIterator();
056    @Nullable private Document document;
057    private boolean stopped = false;
058
059    /**
060     Construct a new StreamParser, using the supplied base Parser.
061     @param parser the configured base parser
062     */
063    public StreamParser(Parser parser) {
064        this.parser = parser;
065        treeBuilder = parser.getTreeBuilder();
066        treeBuilder.nodeListener(it);
067    }
068
069    /**
070     Provide the input for a Document parse. The input is not read until a consuming operation is called.
071     @param input the input to be read.
072     @param baseUri the URL of this input, for absolute link resolution
073     @return this parser, for chaining
074     */
075    public StreamParser parse(Reader input, String baseUri) {
076        close(); // probably a no-op, but ensures any previous reader is closed
077        it.reset();
078        treeBuilder.initialiseParse(input, baseUri, parser); // reader is not read, so no chance of IO error
079        document = treeBuilder.doc;
080        return this;
081    }
082
083    /**
084     Provide the input for a Document parse. The input is not read until a consuming operation is called.
085     @param input the input to be read
086     @param baseUri the URL of this input, for absolute link resolution
087     @return this parser
088     */
089    public StreamParser parse(String input, String baseUri) {
090        return parse(new StringReader(input), baseUri);
091    }
092
093    /**
094     Provide the input for a fragment parse. The input is not read until a consuming operation is called.
095     @param input the input to be read
096     @param context the optional fragment context element
097     @param baseUri the URL of this input, for absolute link resolution
098     @return this parser
099     @see #completeFragment()
100     */
101    public StreamParser parseFragment(Reader input, @Nullable Element context, String baseUri) {
102        parse(input, baseUri);
103        treeBuilder.initialiseParseFragment(context);
104        return this;
105    }
106
107    /**
108     Provide the input for a fragment parse. The input is not read until a consuming operation is called.
109     @param input the input to be read
110     @param context the optional fragment context element
111     @param baseUri the URL of this input, for absolute link resolution
112     @return this parser
113     @see #completeFragment()
114     */
115    public StreamParser parseFragment(String input, @Nullable Element context, String baseUri) {
116        return parseFragment(new StringReader(input), context, baseUri);
117    }
118
119    /**
120     Creates a {@link Stream} of {@link Element}s, with the input being parsed as each element is consumed. Each
121     Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that
122     (empty) sibling will exist at {@link Element#nextElementSibling()}). The stream will be emitted in document order as
123     each element is closed. That means that child elements will be returned prior to their parents.
124     <p>The stream will start from the current position of the backing iterator and the parse.</p>
125     <p>When consuming the stream, if the Reader that the Parser is reading throws an I/O exception (for example a
126     SocketTimeoutException), that will be emitted as an {@link UncheckedIOException}</p>
127     @return a stream of Element objects
128     @throws UncheckedIOException if the underlying Reader excepts during a read (in stream consuming methods)
129     */
130    public Stream<Element> stream() {
131        return StreamSupport.stream(
132            Spliterators.spliteratorUnknownSize(
133                it, Spliterator.DISTINCT | Spliterator.NONNULL | Spliterator.ORDERED),
134            false);
135    }
136
137    /**
138     Returns an {@link Iterator} of {@link Element}s, with the input being parsed as each element is consumed. Each
139     Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that
140     (empty) sibling will exist at {@link Element#nextElementSibling()}). The elements will be emitted in document order as
141     each element is closed. That means that child elements will be returned prior to their parents.
142     <p>The iterator will start from the current position of the parse.</p>
143     <p>The iterator is backed by this StreamParser, and the resources it holds.</p>
144     @return a stream of Element objects
145     */
146    public Iterator<Element> iterator() {
147        //noinspection ReturnOfInnerClass
148        return it;
149    }
150
151    /**
152     Flags that the parse should be stopped; the backing iterator will not return any more Elements.
153     @return this parser
154     */
155    public StreamParser stop() {
156        stopped = true;
157        return this;
158    }
159
160    /**
161     Closes the input and releases resources including the underlying parser and reader.
162     <p>The parser will also be closed when the input is fully read.</p>
163     <p>The parser can be reused with another call to {@link #parse(Reader, String)}.</p>
164     */
165    @Override public void close() {
166        treeBuilder.completeParse(); // closes the reader, frees resources
167    }
168
169    /**
170     Get the current {@link Document} as it is being parsed. It will be only partially complete until the input is fully
171     read. Structural changes (e.g. insert, remove) may be made to the Document contents.
172     @return the (partial) Document
173     */
174    public Document document() {
175        document = treeBuilder.doc;
176        Validate.notNull(document, "Must run parse() before calling.");
177        return document;
178    }
179
180    /**
181     Runs the parser until the input is fully read, and returns the completed Document.
182     @return the completed Document
183     @throws IOException if an I/O error occurs
184     */
185    public Document complete() throws IOException {
186        Document doc = document();
187        treeBuilder.runParser();
188        return doc;
189    }
190
191    /**
192     When initialized as a fragment parse, runs the parser until the input is fully read, and returns the completed
193     fragment child nodes.
194     @return the completed child nodes
195     @throws IOException if an I/O error occurs
196     @see #parseFragment(Reader, Element, String)
197     */
198    public List<Node> completeFragment() throws IOException {
199        treeBuilder.runParser();
200        return treeBuilder.completeParseFragment();
201    }
202
203    /**
204     Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the
205     input will be parsed until the first match is found, or the input is completely read.
206     @param query the {@link org.jsoup.select.Selector} query.
207     @return the first matching {@link Element}, or {@code null} if there's no match
208     @throws IOException if an I/O error occurs
209     @see #selectFirst(Evaluator)
210     */
211    public @Nullable Element selectFirst(String query) throws IOException {
212        return selectFirst(Selector.evaluatorOf(query));
213    }
214
215    /**
216     Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This
217     is useful if you want to simply abort processing on a failed match.
218     @param query the {@link org.jsoup.select.Selector} query.
219     @return the first matching element
220     @throws IllegalArgumentException if no match is found
221     @throws IOException if an I/O error occurs
222     */
223    public Element expectFirst(String query) throws IOException {
224        return Validate.expectNotNull(
225            selectFirst(query),
226            "No elements matched the query '%s' in the document."
227            , query
228        );
229    }
230
231    /**
232     Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the
233     input will be parsed until the first match is found, or the input is completely read.
234     <p>By providing a compiled evaluator vs a CSS selector, this method may be more efficient when executing the same
235     query against multiple documents.</p>
236     @param eval the {@link org.jsoup.select.Selector} evaluator.
237     @return the first matching {@link Element}, or {@code null} if there's no match
238     @throws IOException if an I/O error occurs
239     @see Selector#evaluatorOf(String css)
240     */
241    public @Nullable Element selectFirst(Evaluator eval) throws IOException {
242        final Document doc = document();
243
244        // run the query on the existing (partial) doc first, as there may be a hit already parsed
245        Element first = doc.selectFirst(eval);
246        if (first != null) return first;
247
248        return selectNext(eval);
249    }
250
251    /**
252     Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or
253     the input is completely read.
254     @param query the {@link org.jsoup.select.Selector} query.
255     @return the next matching {@link Element}, or {@code null} if there's no match
256     @throws IOException if an I/O error occurs
257     @see #selectNext(Evaluator)
258     */
259    public @Nullable Element selectNext(String query) throws IOException {
260        return selectNext(Selector.evaluatorOf(query));
261    }
262
263    /**
264     Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This
265     is useful if you want to simply abort processing on a failed match.
266     @param query the {@link org.jsoup.select.Selector} query.
267     @return the first matching element
268     @throws IllegalArgumentException if no match is found
269     @throws IOException if an I/O error occurs
270     */
271    public Element expectNext(String query) throws IOException {
272        return Validate.expectNotNull(
273            selectNext(query),
274            "No elements matched the query '%s' in the document."
275            , query
276        );
277    }
278
279    /**
280     Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or
281     the input is completely read.
282     <p>By providing a compiled evaluator vs a CSS selector, this method may be more efficient when executing the same
283     query against multiple documents.</p>
284     @param eval the {@link org.jsoup.select.Selector} evaluator.
285     @return the next matching {@link Element}, or {@code null} if there's no match
286     @throws IOException if an I/O error occurs
287     @see Selector#evaluatorOf(String css)
288     */
289    public @Nullable Element selectNext(Evaluator eval) throws IOException {
290        try {
291            final Document doc = document(); // validates the parse was initialized, keeps stack trace out of stream
292            return stream()
293                .filter(eval.asPredicate(doc))
294                .findFirst()
295                .orElse(null);
296        } catch (UncheckedIOException e) {
297            // Reader threw an IO exception emitted via Iterator's next()
298            throw e.getCause();
299        }
300    }
301
302    final class ElementIterator implements Iterator<Element>, NodeVisitor {
303        // listeners add to a next emit queue, as a single token read step may yield multiple elements
304        final private Queue<Element> emitQueue = new LinkedList<>();
305        private @Nullable Element current;  // most recently emitted
306        private @Nullable Element next;     // element waiting to be picked up
307        private @Nullable Element tail;     // The last tailed element (</html>), on hold for final pop
308
309        void reset() {
310            emitQueue.clear();
311            current = next = tail = null;
312            stopped = false;
313        }
314
315        // Iterator Interface:
316        /**
317         {@inheritDoc}
318         @throws UncheckedIOException if the underlying Reader errors during a read
319         */
320        @Override public boolean hasNext() {
321            maybeFindNext();
322            return next != null;
323        }
324
325        /**
326         {@inheritDoc}
327         @throws UncheckedIOException if the underlying Reader errors during a read
328         */
329        @Override public Element next() {
330            maybeFindNext();
331            if (next == null) throw new NoSuchElementException();
332            current = next;
333            next = null;
334            return current;
335        }
336
337        private void maybeFindNext() {
338            if (stopped || next != null) return;
339
340            // drain the current queue before stepping to get more
341            if (!emitQueue.isEmpty()) {
342                next = emitQueue.remove();
343                return;
344            }
345
346            // step the parser, which will hit the node listeners to add to the queue:
347            while (treeBuilder.stepParser()) {
348                if (!emitQueue.isEmpty()) {
349                    next = emitQueue.remove();
350                    return;
351                }
352            }
353            stop();
354            close();
355
356            // send the final element out:
357            if (tail != null) {
358                next = tail;
359                tail = null;
360            }
361        }
362
363        @Override public void remove() {
364            if (current == null) throw new NoSuchElementException();
365            current.remove();
366        }
367
368        // NodeVisitor Interface:
369        @Override public void head(Node node, int depth) {
370            if (node instanceof Element) {
371                Element prev = node.previousElementSibling();
372                // We prefer to wait until an element has a next sibling before emitting it; otherwise, get it in tail
373                if (prev != null) emitQueue.add(prev);
374            }
375        }
376
377        @Override public void tail(Node node, int depth) {
378            if (node instanceof Element) {
379                tail = (Element) node; // kept for final hit
380                Element lastChild = tail.lastElementChild(); // won't get a nextsib, so emit that:
381                if (lastChild != null) emitQueue.add(lastChild);
382            }
383        }
384    }
385}