001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.nodes.Document;
005import org.jsoup.nodes.Element;
006import org.jsoup.nodes.Node;
007import org.jspecify.annotations.Nullable;
008
009import java.io.Reader;
010import java.io.StringReader;
011import java.util.List;
012import java.util.concurrent.locks.ReentrantLock;
013
014/**
015 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
016 {@link org.jsoup.Jsoup}.
017 <p>Note that a given Parser instance object is threadsafe, but not concurrent. (Concurrent parse calls will
018 synchronize.) To reuse a Parser configuration in a multithreaded environment, use {@link #newInstance()} to make
019 copies.</p>
020 */
021public class Parser implements Cloneable {
022    public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml";
023    public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace";
024    public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML";
025    public static final String NamespaceSvg = "http://www.w3.org/2000/svg";
026
027    private final TreeBuilder treeBuilder;
028    private ParseErrorList errors;
029    private ParseSettings settings;
030    private boolean trackPosition = false;
031    private @Nullable TagSet tagSet;
032    private final ReentrantLock lock = new ReentrantLock();
033
034    /**
035     * Create a new Parser, using the specified TreeBuilder
036     * @param treeBuilder TreeBuilder to use to parse input into Documents.
037     */
038    public Parser(TreeBuilder treeBuilder) {
039        this.treeBuilder = treeBuilder;
040        settings = treeBuilder.defaultSettings();
041        errors = ParseErrorList.noTracking();
042    }
043
044    /**
045     Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use.
046     @return a copied parser
047     */
048    public Parser newInstance() {
049        return new Parser(this);
050    }
051
052    @SuppressWarnings("MethodDoesntCallSuperMethod") // because we use the copy constructor instead
053    @Override
054    public Parser clone() {
055        return new Parser(this);
056    }
057
058    private Parser(Parser copy) {
059        treeBuilder = copy.treeBuilder.newInstance(); // because extended
060        errors = new ParseErrorList(copy.errors); // only copies size, not contents
061        settings = new ParseSettings(copy.settings);
062        trackPosition = copy.trackPosition;
063    }
064
065    /**
066     Parse the contents of a String.
067
068     @param html HTML to parse
069     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
070     @return parsed Document
071     */
072    public Document parseInput(String html, String baseUri) {
073        return parseInput(new StringReader(html), baseUri);
074    }
075
076    /**
077     Parse the contents of Reader.
078
079     @param inputHtml HTML to parse
080     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
081     @return parsed Document
082     @throws java.io.UncheckedIOException if an I/O error occurs in the Reader
083     */
084    public Document parseInput(Reader inputHtml, String baseUri) {
085        try {
086            lock.lock(); // using a lock vs synchronized to support loom threads
087            return treeBuilder.parse(inputHtml, baseUri, this);
088        } finally {
089            lock.unlock();
090        }
091    }
092
093    /**
094     Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
095
096     @param fragment the fragment of HTML to parse
097     @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML).
098     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
099     @return list of nodes parsed from the input HTML.
100     */
101    public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) {
102        return parseFragmentInput(new StringReader(fragment), context, baseUri);
103    }
104
105    /**
106     Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
107
108     @param fragment the fragment of HTML to parse
109     @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML).
110     @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
111     @return list of nodes parsed from the input HTML.
112     @throws java.io.UncheckedIOException if an I/O error occurs in the Reader
113     */
114    public List<Node> parseFragmentInput(Reader fragment, @Nullable Element context, String baseUri) {
115        try {
116            lock.lock();
117            return treeBuilder.parseFragment(fragment, context, baseUri, this);
118        } finally {
119            lock.unlock();
120        }
121    }
122
123    // gets & sets
124    /**
125     * Get the TreeBuilder currently in use.
126     * @return current TreeBuilder.
127     */
128    public TreeBuilder getTreeBuilder() {
129        return treeBuilder;
130    }
131
132    /**
133     * Check if parse error tracking is enabled.
134     * @return current track error state.
135     */
136    public boolean isTrackErrors() {
137        return errors.getMaxSize() > 0;
138    }
139
140    /**
141     * Enable or disable parse error tracking for the next parse.
142     * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
143     * @return this, for chaining
144     */
145    public Parser setTrackErrors(int maxErrors) {
146        errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
147        return this;
148    }
149
150    /**
151     * Retrieve the parse errors, if any, from the last parse.
152     * @return list of parse errors, up to the size of the maximum errors tracked.
153     * @see #setTrackErrors(int)
154     */
155    public ParseErrorList getErrors() {
156        return errors;
157    }
158
159    /**
160     Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input
161     source they were created from. By default, tracking is not enabled.
162     * @return current track position setting
163     */
164    public boolean isTrackPosition() {
165        return trackPosition;
166    }
167
168    /**
169     Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original
170     input source they were created from.
171     @param trackPosition position tracking setting; {@code true} to enable
172     @return this Parser, for chaining
173     */
174    public Parser setTrackPosition(boolean trackPosition) {
175        this.trackPosition = trackPosition;
176        return this;
177    }
178
179    /**
180     Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes.
181     * @param settings the new settings
182     * @return this Parser
183     */
184    public Parser settings(ParseSettings settings) {
185        this.settings = settings;
186        return this;
187    }
188
189    /**
190     Gets the current ParseSettings for this Parser
191     * @return current ParseSettings
192     */
193    public ParseSettings settings() {
194        return settings;
195    }
196
197    /**
198     Set a custom TagSet to use for this Parser. This allows you to define your own tags, and control how they are
199     parsed. For example, you can set a tag to preserve whitespace, or to be treated as a block tag.
200     <p>You can start with the {@link TagSet#Html()} defaults and customize, or a new empty TagSet.</p>
201
202     @param tagSet the TagSet to use. This gets copied, so that changes that the parse makes (tags found in the document will be added) do not clobber the original TagSet.
203     @return this Parser
204     @since 1.20.1
205     */
206    public Parser tagSet(TagSet tagSet) {
207        Validate.notNull(tagSet);
208        this.tagSet = new TagSet(tagSet); // copy it as we are going to mutate it
209        return this;
210    }
211
212    /**
213     Get the current TagSet for this Parser, which will be either this parser's default, or one that you have set.
214     @return the current TagSet. After the parse, this will contain any new tags that were found in the document.
215     @since 1.20.1
216     */
217    public TagSet tagSet() {
218        if (tagSet == null)
219            tagSet = treeBuilder.defaultTagSet();
220        return tagSet;
221    }
222
223    public String defaultNamespace() {
224        return getTreeBuilder().defaultNamespace();
225    }
226
227    // static parse functions below
228    /**
229     * Parse HTML into a Document.
230     *
231     * @param html HTML to parse
232     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
233     *
234     * @return parsed Document
235     */
236    public static Document parse(String html, String baseUri) {
237        TreeBuilder treeBuilder = new HtmlTreeBuilder();
238        return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder));
239    }
240
241    /**
242     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
243     *
244     * @param fragmentHtml the fragment of HTML to parse
245     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
246     * provides stack context (for implicit element creation).
247     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
248     *
249     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
250     */
251    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
252        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
253        return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, new Parser(treeBuilder));
254    }
255
256    /**
257     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
258     *
259     * @param fragmentHtml the fragment of HTML to parse
260     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
261     * provides stack context (for implicit element creation).
262     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
263     * @param errorList list to add errors to
264     *
265     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
266     */
267    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) {
268        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
269        Parser parser = new Parser(treeBuilder);
270        parser.errors = errorList;
271        return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, parser);
272    }
273
274    /**
275     * Parse a fragment of XML into a list of nodes.
276     *
277     * @param fragmentXml the fragment of XML to parse
278     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
279     * @return list of nodes parsed from the input XML.
280     */
281    public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
282        XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
283        return treeBuilder.parseFragment(new StringReader(fragmentXml), null, baseUri, new Parser(treeBuilder));
284    }
285
286    /**
287     * Parse a fragment of HTML into the {@code body} of a Document.
288     *
289     * @param bodyHtml fragment of HTML
290     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
291     *
292     * @return Document, with empty head, and HTML parsed into body
293     */
294    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
295        Document doc = Document.createShell(baseUri);
296        Element body = doc.body();
297        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
298        Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented
299        for (int i = nodes.length - 1; i > 0; i--) {
300            nodes[i].remove();
301        }
302        for (Node node : nodes) {
303            body.appendChild(node);
304        }
305        return doc;
306    }
307
308    /**
309     * Utility method to unescape HTML entities from a string
310     * @param string HTML escaped string
311     * @param inAttribute if the string is to be escaped in strict mode (as attributes are)
312     * @return an unescaped string
313     */
314    public static String unescapeEntities(String string, boolean inAttribute) {
315        Parser parser = Parser.htmlParser();
316        parser.treeBuilder.initialiseParse(new StringReader(string), "", parser);
317        Tokeniser tokeniser = new Tokeniser(parser.treeBuilder);
318        return tokeniser.unescapeEntities(inAttribute);
319    }
320
321    // builders
322
323    /**
324     * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
325     * based on a knowledge of the semantics of the incoming tags.
326     * @return a new HTML parser.
327     */
328    public static Parser htmlParser() {
329        return new Parser(new HtmlTreeBuilder());
330    }
331
332    /**
333     * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
334     * rather creates a simple tree directly from the input.
335     * @return a new simple XML parser.
336     */
337    public static Parser xmlParser() {
338        return new Parser(new XmlTreeBuilder());
339    }
340}