001package org.jsoup.parser;
002
003import org.jsoup.Connection;
004import org.jsoup.helper.Validate;
005import org.jsoup.nodes.Document;
006import org.jsoup.nodes.Element;
007import org.jsoup.nodes.Node;
008import org.jsoup.select.Evaluator;
009import org.jsoup.select.NodeVisitor;
010import org.jsoup.select.QueryParser;
011import org.jspecify.annotations.Nullable;
012
013import java.io.Closeable;
014import java.io.IOException;
015import java.io.Reader;
016import java.io.StringReader;
017import java.io.UncheckedIOException;
018import java.util.Iterator;
019import java.util.LinkedList;
020import java.util.List;
021import java.util.NoSuchElementException;
022import java.util.Queue;
023import java.util.Spliterator;
024import java.util.Spliterators;
025import java.util.stream.Stream;
026import java.util.stream.StreamSupport;
027
028/**
029 A StreamParser provides a progressive parse of its input. As each Element is completed, it is emitted via a Stream or
030 Iterator interface. Elements returned will be complete with all their children, and an (empty) next sibling, if
031 applicable.
032 <p>Elements (or their children) may be removed from the DOM during the parse, for e.g. to conserve memory, providing a
033 mechanism to parse an input document that would otherwise be too large to fit into memory, yet still providing a DOM
034 interface to the document and its elements.</p>
035 <p>
036 Additionally, the parser provides a {@link #selectFirst(String query)} / {@link #selectNext(String query)}, which will
037 run the parser until a hit is found, at which point the parse is suspended. It can be resumed via another
038 {@code select()} call, or via the {@link #stream()} or {@link #iterator()} methods.
039 </p>
040 <p>Once the input has been fully read, the input Reader will be closed. Or, if the whole document does not need to be
041 read, call {@link #stop()} and {@link #close()}.</p>
042 <p>The {@link #document()} method will return the Document being parsed into, which will be only partially complete
043 until the input is fully consumed.</p>
044 <p>A StreamParser can be reused via a new {@link #parse(Reader, String)}, but is not thread-safe for concurrent inputs.
045 New parsers should be used in each thread.</p>
046 <p>If created via {@link Connection.Response#streamParser()}, or another Reader that is I/O backed, the iterator and
047 stream consumers will throw an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.</p>
048 <p>The StreamParser interface is currently in <b>beta</b> and may change in subsequent releases. Feedback on the
049 feature and how you're using it is very welcome via the <a href="https://jsoup.org/discussion">jsoup
050 discussions</a>.</p>
051 @since 1.18.1
052 */
053public class StreamParser implements Closeable {
054    final private Parser parser;
055    final private TreeBuilder treeBuilder;
056    final private ElementIterator it = new ElementIterator();
057    @Nullable private Document document;
058    private boolean stopped = false;
059
060    /**
061     Construct a new StreamParser, using the supplied base Parser.
062     @param parser the configured base parser
063     */
064    public StreamParser(Parser parser) {
065        this.parser = parser;
066        treeBuilder = parser.getTreeBuilder();
067        treeBuilder.nodeListener(it);
068    }
069
070    /**
071     Provide the input for a Document parse. The input is not read until a consuming operation is called.
072     @param input the input to be read.
073     @param baseUri the URL of this input, for absolute link resolution
074     @return this parser, for chaining
075     */
076    public StreamParser parse(Reader input, String baseUri) {
077        close(); // probably a no-op, but ensures any previous reader is closed
078        it.reset();
079        treeBuilder.initialiseParse(input, baseUri, parser); // reader is not read, so no chance of IO error
080        document = treeBuilder.doc;
081        return this;
082    }
083
084    /**
085     Provide the input for a Document parse. The input is not read until a consuming operation is called.
086     @param input the input to be read
087     @param baseUri the URL of this input, for absolute link resolution
088     @return this parser
089     */
090    public StreamParser parse(String input, String baseUri) {
091        return parse(new StringReader(input), baseUri);
092    }
093
094    /**
095     Provide the input for a fragment parse. The input is not read until a consuming operation is called.
096     @param input the input to be read
097     @param context the optional fragment context element
098     @param baseUri the URL of this input, for absolute link resolution
099     @return this parser
100     @see #completeFragment()
101     */
102    public StreamParser parseFragment(Reader input, @Nullable Element context, String baseUri) {
103        parse(input, baseUri);
104        treeBuilder.initialiseParseFragment(context);
105        return this;
106    }
107
108    /**
109     Provide the input for a fragment parse. The input is not read until a consuming operation is called.
110     @param input the input to be read
111     @param context the optional fragment context element
112     @param baseUri the URL of this input, for absolute link resolution
113     @return this parser
114     @see #completeFragment()
115     */
116    public StreamParser parseFragment(String input, @Nullable Element context, String baseUri) {
117        return parseFragment(new StringReader(input), context, baseUri);
118    }
119
120    /**
121     Creates a {@link Stream} of {@link Element}s, with the input being parsed as each element is consumed. Each
122     Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that
123     (empty) sibling will exist at {@link Element#nextElementSibling()}). The stream will be emitted in document order as
124     each element is closed. That means that child elements will be returned prior to their parents.
125     <p>The stream will start from the current position of the backing iterator and the parse.</p>
126     <p>When consuming the stream, if the Reader that the Parser is reading throws an I/O exception (for example a
127     SocketTimeoutException), that will be emitted as an {@link UncheckedIOException}</p>
128     @return a stream of Element objects
129     @throws UncheckedIOException if the underlying Reader excepts during a read (in stream consuming methods)
130     */
131    public Stream<Element> stream() {
132        return StreamSupport.stream(
133            Spliterators.spliteratorUnknownSize(
134                it, Spliterator.DISTINCT | Spliterator.NONNULL | Spliterator.ORDERED),
135            false);
136    }
137
138    /**
139     Returns an {@link Iterator} of {@link Element}s, with the input being parsed as each element is consumed. Each
140     Element returned will be complete (that is, all of its children will be included, and if it has a next sibling, that
141     (empty) sibling will exist at {@link Element#nextElementSibling()}). The elements will be emitted in document order as
142     each element is closed. That means that child elements will be returned prior to their parents.
143     <p>The iterator will start from the current position of the parse.</p>
144     <p>The iterator is backed by this StreamParser, and the resources it holds.</p>
145     @return a stream of Element objects
146     */
147    public Iterator<Element> iterator() {
148        //noinspection ReturnOfInnerClass
149        return it;
150    }
151
152    /**
153     Flags that the parse should be stopped; the backing iterator will not return any more Elements.
154     @return this parser
155     */
156    public StreamParser stop() {
157        stopped = true;
158        return this;
159    }
160
161    /**
162     Closes the input and releases resources including the underlying parser and reader.
163     <p>The parser will also be closed when the input is fully read.</p>
164     <p>The parser can be reused with another call to {@link #parse(Reader, String)}.</p>
165     */
166    @Override public void close() {
167        treeBuilder.completeParse(); // closes the reader, frees resources
168    }
169
170    /**
171     Get the current {@link Document} as it is being parsed. It will be only partially complete until the input is fully
172     read. Structural changes (e.g. insert, remove) may be made to the Document contents.
173     @return the (partial) Document
174     */
175    public Document document() {
176        document = treeBuilder.doc;
177        Validate.notNull(document, "Must run parse() before calling.");
178        return document;
179    }
180
181    /**
182     Runs the parser until the input is fully read, and returns the completed Document.
183     @return the completed Document
184     @throws IOException if an I/O error occurs
185     */
186    public Document complete() throws IOException {
187        Document doc = document();
188        treeBuilder.runParser();
189        return doc;
190    }
191
192    /**
193     When initialized as a fragment parse, runs the parser until the input is fully read, and returns the completed
194     fragment child nodes.
195     @return the completed child nodes
196     @throws IOException if an I/O error occurs
197     @see #parseFragment(Reader, Element, String)
198     */
199    public List<Node> completeFragment() throws IOException {
200        treeBuilder.runParser();
201        return treeBuilder.completeParseFragment();
202    }
203
204    /**
205     Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the
206     input will be parsed until the first match is found, or the input is completely read.
207     @param query the {@link org.jsoup.select.Selector} query.
208     @return the first matching {@link Element}, or {@code null} if there's no match
209     @throws IOException if an I/O error occurs
210     */
211    public @Nullable Element selectFirst(String query) throws IOException {
212        return selectFirst(QueryParser.parse(query));
213    }
214
215    /**
216     Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This
217     is useful if you want to simply abort processing on a failed match.
218     @param query the {@link org.jsoup.select.Selector} query.
219     @return the first matching element
220     @throws IllegalArgumentException if no match is found
221     @throws IOException if an I/O error occurs
222     */
223    public Element expectFirst(String query) throws IOException {
224        return (Element) Validate.ensureNotNull(
225            selectFirst(query),
226            "No elements matched the query '%s' in the document."
227            , query
228        );
229    }
230
231    /**
232     Finds the first Element that matches the provided query. If the parsed Document does not already have a match, the
233     input will be parsed until the first match is found, or the input is completely read.
234     @param eval the {@link org.jsoup.select.Selector} evaluator.
235     @return the first matching {@link Element}, or {@code null} if there's no match
236     @throws IOException if an I/O error occurs
237     */
238    public @Nullable Element selectFirst(Evaluator eval) throws IOException {
239        final Document doc = document();
240
241        // run the query on the existing (partial) doc first, as there may be a hit already parsed
242        Element first = doc.selectFirst(eval);
243        if (first != null) return first;
244
245        return selectNext(eval);
246    }
247
248    /**
249     Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or
250     the input is completely read.
251     @param query the {@link org.jsoup.select.Selector} query.
252     @return the next matching {@link Element}, or {@code null} if there's no match
253     @throws IOException if an I/O error occurs
254     */
255    public @Nullable Element selectNext(String query) throws IOException {
256        return selectNext(QueryParser.parse(query));
257    }
258
259    /**
260     Just like {@link #selectFirst(String)}, but if there is no match, throws an {@link IllegalArgumentException}. This
261     is useful if you want to simply abort processing on a failed match.
262     @param query the {@link org.jsoup.select.Selector} query.
263     @return the first matching element
264     @throws IllegalArgumentException if no match is found
265     @throws IOException if an I/O error occurs
266     */
267    public Element expectNext(String query) throws IOException {
268        return (Element) Validate.ensureNotNull(
269            selectNext(query),
270            "No elements matched the query '%s' in the document."
271            , query
272        );
273    }
274
275    /**
276     Finds the next Element that matches the provided query. The input will be parsed until the next match is found, or
277     the input is completely read.
278     @param eval the {@link org.jsoup.select.Selector} evaluator.
279     @return the next matching {@link Element}, or {@code null} if there's no match
280     @throws IOException if an I/O error occurs
281     */
282    public @Nullable Element selectNext(Evaluator eval) throws IOException {
283        try {
284            final Document doc = document(); // validates the parse was initialized, keeps stack trace out of stream
285            return stream()
286                .filter(eval.asPredicate(doc))
287                .findFirst()
288                .orElse(null);
289        } catch (UncheckedIOException e) {
290            // Reader threw an IO exception emitted via Iterator's next()
291            throw e.getCause();
292        }
293    }
294
295    final class ElementIterator implements Iterator<Element>, NodeVisitor {
296        // listeners add to a next emit queue, as a single token read step may yield multiple elements
297        final private Queue<Element> emitQueue = new LinkedList<>();
298        private @Nullable Element current;  // most recently emitted
299        private @Nullable Element next;     // element waiting to be picked up
300        private @Nullable Element tail;     // The last tailed element (</html>), on hold for final pop
301
302        void reset() {
303            emitQueue.clear();
304            current = next = tail = null;
305            stopped = false;
306        }
307
308        // Iterator Interface:
309        /**
310         {@inheritDoc}
311         @throws UncheckedIOException if the underlying Reader errors during a read
312         */
313        @Override public boolean hasNext() {
314            maybeFindNext();
315            return next != null;
316        }
317
318        /**
319         {@inheritDoc}
320         @throws UncheckedIOException if the underlying Reader errors during a read
321         */
322        @Override public Element next() {
323            maybeFindNext();
324            if (next == null) throw new NoSuchElementException();
325            current = next;
326            next = null;
327            return current;
328        }
329
330        private void maybeFindNext() {
331            if (stopped || next != null) return;
332
333            // drain the current queue before stepping to get more
334            if (!emitQueue.isEmpty()) {
335                next = emitQueue.remove();
336                return;
337            }
338
339            // step the parser, which will hit the node listeners to add to the queue:
340            while (treeBuilder.stepParser()) {
341                if (!emitQueue.isEmpty()) {
342                    next = emitQueue.remove();
343                    return;
344                }
345            }
346            stop();
347            close();
348
349            // send the final element out:
350            if (tail != null) {
351                next = tail;
352                tail = null;
353            }
354        }
355
356        @Override public void remove() {
357            if (current == null) throw new NoSuchElementException();
358            current.remove();
359        }
360
361        // NodeVisitor Interface:
362        @Override public void head(Node node, int depth) {
363            if (node instanceof Element) {
364                Element prev = ((Element) node).previousElementSibling();
365                // We prefer to wait until an element has a next sibling before emitting it; otherwise, get it in tail
366                if (prev != null) emitQueue.add(prev);
367            }
368        }
369
370        @Override public void tail(Node node, int depth) {
371            if (node instanceof Element) {
372                tail = (Element) node; // kept for final hit
373                Element lastChild = tail.lastElementChild(); // won't get a nextsib, so emit that:
374                if (lastChild != null) emitQueue.add(lastChild);
375            }
376        }
377    }
378}
379
380
381