001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.nodes.Document;
005import org.jsoup.nodes.Element;
006import org.jsoup.nodes.Node;
007import org.jspecify.annotations.Nullable;
008
009import java.io.Reader;
010import java.io.StringReader;
011import java.util.List;
012import java.util.concurrent.locks.ReentrantLock;
013
014/**
015 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
016 {@link org.jsoup.Jsoup}.
017 <p>Note that a given Parser instance object is threadsafe, but not concurrent. (Concurrent parse calls will
018 synchronize.) To reuse a Parser configuration in a multithreaded environment, use {@link #newInstance()} to make
019 copies.</p>
020 */
021public class Parser implements Cloneable {
022    public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml";
023    public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace";
024    public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML";
025    public static final String NamespaceSvg = "http://www.w3.org/2000/svg";
026
027    private TreeBuilder treeBuilder;
028    private ParseErrorList errors;
029    private ParseSettings settings;
030    private boolean trackPosition = false;
031    private @Nullable TagSet tagSet;
032    private final ReentrantLock lock = new ReentrantLock();
033
034    /**
035     * Create a new Parser, using the specified TreeBuilder
036     * @param treeBuilder TreeBuilder to use to parse input into Documents.
037     */
038    public Parser(TreeBuilder treeBuilder) {
039        this.treeBuilder = treeBuilder;
040        settings = treeBuilder.defaultSettings();
041        errors = ParseErrorList.noTracking();
042    }
043
044    /**
045     Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use.
046     @return a copied parser
047     */
048    public Parser newInstance() {
049        return new Parser(this);
050    }
051
052    @SuppressWarnings("MethodDoesntCallSuperMethod") // because we use the copy constructor instead
053    @Override
054    public Parser clone() {
055        return new Parser(this);
056    }
057
058    private Parser(Parser copy) {
059        treeBuilder = copy.treeBuilder.newInstance(); // because extended
060        errors = new ParseErrorList(copy.errors); // only copies size, not contents
061        settings = new ParseSettings(copy.settings);
062        trackPosition = copy.trackPosition;
063    }
064    
065    public Document parseInput(String html, String baseUri) {
066        return parseInput(new StringReader(html), baseUri);
067    }
068
069    public Document parseInput(Reader inputHtml, String baseUri) {
070        try {
071            lock.lock(); // using a lock vs synchronized to support loom threads
072            return treeBuilder.parse(inputHtml, baseUri, this);
073        } finally {
074            lock.unlock();
075        }
076    }
077
078    public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) {
079        return parseFragmentInput(new StringReader(fragment), context, baseUri);
080    }
081
082    public List<Node> parseFragmentInput(Reader fragment, @Nullable Element context, String baseUri) {
083        try {
084            lock.lock();
085            return treeBuilder.parseFragment(fragment, context, baseUri, this);
086        } finally {
087            lock.unlock();
088        }
089    }
090
091    // gets & sets
092    /**
093     * Get the TreeBuilder currently in use.
094     * @return current TreeBuilder.
095     */
096    public TreeBuilder getTreeBuilder() {
097        return treeBuilder;
098    }
099
100    /**
101     * Update the TreeBuilder used when parsing content.
102     * @param treeBuilder new TreeBuilder
103     * @return this, for chaining
104     * @deprecated unused method, will be removed in 1.21.1
105     */
106    @Deprecated public Parser setTreeBuilder(TreeBuilder treeBuilder) {
107        this.treeBuilder = treeBuilder;
108        treeBuilder.parser = this;
109        return this;
110    }
111
112    /**
113     * Check if parse error tracking is enabled.
114     * @return current track error state.
115     */
116    public boolean isTrackErrors() {
117        return errors.getMaxSize() > 0;
118    }
119
120    /**
121     * Enable or disable parse error tracking for the next parse.
122     * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
123     * @return this, for chaining
124     */
125    public Parser setTrackErrors(int maxErrors) {
126        errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
127        return this;
128    }
129
130    /**
131     * Retrieve the parse errors, if any, from the last parse.
132     * @return list of parse errors, up to the size of the maximum errors tracked.
133     * @see #setTrackErrors(int)
134     */
135    public ParseErrorList getErrors() {
136        return errors;
137    }
138
139    /**
140     Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input
141     source they were created from. By default, tracking is not enabled.
142     * @return current track position setting
143     */
144    public boolean isTrackPosition() {
145        return trackPosition;
146    }
147
148    /**
149     Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original
150     input source they were created from.
151     @param trackPosition position tracking setting; {@code true} to enable
152     @return this Parser, for chaining
153     */
154    public Parser setTrackPosition(boolean trackPosition) {
155        this.trackPosition = trackPosition;
156        return this;
157    }
158
159    /**
160     Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes.
161     * @param settings the new settings
162     * @return this Parser
163     */
164    public Parser settings(ParseSettings settings) {
165        this.settings = settings;
166        return this;
167    }
168
169    /**
170     Gets the current ParseSettings for this Parser
171     * @return current ParseSettings
172     */
173    public ParseSettings settings() {
174        return settings;
175    }
176
177    /**
178     Set a custom TagSet to use for this Parser. This allows you to define your own tags, and control how they are
179     parsed. For example, you can set a tag to preserve whitespace, or to be treated as a block tag.
180     <p>You can start with the {@link TagSet#Html()} defaults and customize, or a new empty TagSet.</p>
181
182     @param tagSet the TagSet to use. This gets copied, so that changes that the parse makes (tags found in the document will be added) do not clobber the original TagSet.
183     @return this Parser
184     @since 1.20.1
185     */
186    public Parser tagSet(TagSet tagSet) {
187        Validate.notNull(tagSet);
188        this.tagSet = new TagSet(tagSet); // copy it as we are going to mutate it
189        return this;
190    }
191
192    /**
193     Get the current TagSet for this Parser, which will be either this parser's default, or one that you have set.
194     @return the current TagSet. After the parse, this will contain any new tags that were found in the document.
195     @since 1.20.1
196     */
197    public TagSet tagSet() {
198        if (tagSet == null)
199            tagSet = treeBuilder.defaultTagSet();
200        return tagSet;
201    }
202
203    /**
204     (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as
205     Data Nodes).
206     @deprecated internal method, no longer used, and will be removed in 1.12.1.
207     */
208    @Deprecated public boolean isContentForTagData(String normalName) {
209        return tagSet().valueOf(normalName, defaultNamespace()).is(Tag.Data);
210    }
211
212    public String defaultNamespace() {
213        return getTreeBuilder().defaultNamespace();
214    }
215
216    // static parse functions below
217    /**
218     * Parse HTML into a Document.
219     *
220     * @param html HTML to parse
221     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
222     *
223     * @return parsed Document
224     */
225    public static Document parse(String html, String baseUri) {
226        TreeBuilder treeBuilder = new HtmlTreeBuilder();
227        return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder));
228    }
229
230    /**
231     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
232     *
233     * @param fragmentHtml the fragment of HTML to parse
234     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
235     * provides stack context (for implicit element creation).
236     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
237     *
238     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
239     */
240    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
241        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
242        return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, new Parser(treeBuilder));
243    }
244
245    /**
246     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
247     *
248     * @param fragmentHtml the fragment of HTML to parse
249     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
250     * provides stack context (for implicit element creation).
251     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
252     * @param errorList list to add errors to
253     *
254     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
255     */
256    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) {
257        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
258        Parser parser = new Parser(treeBuilder);
259        parser.errors = errorList;
260        return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, parser);
261    }
262
263    /**
264     * Parse a fragment of XML into a list of nodes.
265     *
266     * @param fragmentXml the fragment of XML to parse
267     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
268     * @return list of nodes parsed from the input XML.
269     */
270    public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
271        XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
272        return treeBuilder.parseFragment(new StringReader(fragmentXml), null, baseUri, new Parser(treeBuilder));
273    }
274
275    /**
276     * Parse a fragment of HTML into the {@code body} of a Document.
277     *
278     * @param bodyHtml fragment of HTML
279     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
280     *
281     * @return Document, with empty head, and HTML parsed into body
282     */
283    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
284        Document doc = Document.createShell(baseUri);
285        Element body = doc.body();
286        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
287        Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented
288        for (int i = nodes.length - 1; i > 0; i--) {
289            nodes[i].remove();
290        }
291        for (Node node : nodes) {
292            body.appendChild(node);
293        }
294        return doc;
295    }
296
297    /**
298     * Utility method to unescape HTML entities from a string
299     * @param string HTML escaped string
300     * @param inAttribute if the string is to be escaped in strict mode (as attributes are)
301     * @return an unescaped string
302     */
303    public static String unescapeEntities(String string, boolean inAttribute) {
304        Parser parser = Parser.htmlParser();
305        parser.treeBuilder.initialiseParse(new StringReader(string), "", parser);
306        Tokeniser tokeniser = new Tokeniser(parser.treeBuilder);
307        return tokeniser.unescapeEntities(inAttribute);
308    }
309
310    // builders
311
312    /**
313     * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
314     * based on a knowledge of the semantics of the incoming tags.
315     * @return a new HTML parser.
316     */
317    public static Parser htmlParser() {
318        return new Parser(new HtmlTreeBuilder());
319    }
320
321    /**
322     * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
323     * rather creates a simple tree directly from the input.
324     * @return a new simple XML parser.
325     */
326    public static Parser xmlParser() {
327        return new Parser(new XmlTreeBuilder());
328    }
329}