001package org.jsoup.parser;
002
003import org.jsoup.nodes.Document;
004import org.jsoup.nodes.Element;
005import org.jsoup.nodes.Node;
006import org.jspecify.annotations.Nullable;
007
008import java.io.Reader;
009import java.io.StringReader;
010import java.util.List;
011
012/**
013 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
014 {@link org.jsoup.Jsoup}.
015 <p>Note that a Parser instance object is not threadsafe. To reuse a Parser configuration in a multi-threaded
016 environment, use {@link #newInstance()} to make copies. */
017public class Parser implements Cloneable {
018    public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml";
019    public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace";
020    public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML";
021    public static final String NamespaceSvg = "http://www.w3.org/2000/svg";
022
023    private TreeBuilder treeBuilder;
024    private ParseErrorList errors;
025    private ParseSettings settings;
026    private boolean trackPosition = false;
027
028    /**
029     * Create a new Parser, using the specified TreeBuilder
030     * @param treeBuilder TreeBuilder to use to parse input into Documents.
031     */
032    public Parser(TreeBuilder treeBuilder) {
033        this.treeBuilder = treeBuilder;
034        settings = treeBuilder.defaultSettings();
035        errors = ParseErrorList.noTracking();
036    }
037
038    /**
039     Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use.
040     @return a copied parser
041     */
042    public Parser newInstance() {
043        return new Parser(this);
044    }
045
046    @SuppressWarnings("MethodDoesntCallSuperMethod") // because we use the copy constructor instead
047    @Override
048    public Parser clone() {
049        return new Parser(this);
050    }
051
052    private Parser(Parser copy) {
053        treeBuilder = copy.treeBuilder.newInstance(); // because extended
054        errors = new ParseErrorList(copy.errors); // only copies size, not contents
055        settings = new ParseSettings(copy.settings);
056        trackPosition = copy.trackPosition;
057    }
058    
059    public Document parseInput(String html, String baseUri) {
060        return parseInput(new StringReader(html), baseUri);
061    }
062
063    public Document parseInput(Reader inputHtml, String baseUri) {
064        return treeBuilder.parse(inputHtml, baseUri, this);
065    }
066
067    public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) {
068        return parseFragmentInput(new StringReader(fragment), context, baseUri);
069    }
070
071    public List<Node> parseFragmentInput(Reader fragment, @Nullable Element context, String baseUri) {
072        return treeBuilder.parseFragment(fragment, context, baseUri, this);
073    }
074
075    // gets & sets
076    /**
077     * Get the TreeBuilder currently in use.
078     * @return current TreeBuilder.
079     */
080    public TreeBuilder getTreeBuilder() {
081        return treeBuilder;
082    }
083
084    /**
085     * Update the TreeBuilder used when parsing content.
086     * @param treeBuilder new TreeBuilder
087     * @return this, for chaining
088     */
089    public Parser setTreeBuilder(TreeBuilder treeBuilder) {
090        this.treeBuilder = treeBuilder;
091        treeBuilder.parser = this;
092        return this;
093    }
094
095    /**
096     * Check if parse error tracking is enabled.
097     * @return current track error state.
098     */
099    public boolean isTrackErrors() {
100        return errors.getMaxSize() > 0;
101    }
102
103    /**
104     * Enable or disable parse error tracking for the next parse.
105     * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
106     * @return this, for chaining
107     */
108    public Parser setTrackErrors(int maxErrors) {
109        errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
110        return this;
111    }
112
113    /**
114     * Retrieve the parse errors, if any, from the last parse.
115     * @return list of parse errors, up to the size of the maximum errors tracked.
116     * @see #setTrackErrors(int)
117     */
118    public ParseErrorList getErrors() {
119        return errors;
120    }
121
122    /**
123     Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input
124     source they were created from. By default, tracking is not enabled.
125     * @return current track position setting
126     */
127    public boolean isTrackPosition() {
128        return trackPosition;
129    }
130
131    /**
132     Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original
133     input source they were created from.
134     @param trackPosition position tracking setting; {@code true} to enable
135     @return this Parser, for chaining
136     */
137    public Parser setTrackPosition(boolean trackPosition) {
138        this.trackPosition = trackPosition;
139        return this;
140    }
141
142    /**
143     Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes.
144     * @param settings the new settings
145     * @return this Parser
146     */
147    public Parser settings(ParseSettings settings) {
148        this.settings = settings;
149        return this;
150    }
151
152    /**
153     Gets the current ParseSettings for this Parser
154     * @return current ParseSettings
155     */
156    public ParseSettings settings() {
157        return settings;
158    }
159
160    /**
161     (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as
162     Data Nodes).
163     */
164    public boolean isContentForTagData(String normalName) {
165        return getTreeBuilder().isContentForTagData(normalName);
166    }
167
168    public String defaultNamespace() {
169        return getTreeBuilder().defaultNamespace();
170    }
171
172    // static parse functions below
173    /**
174     * Parse HTML into a Document.
175     *
176     * @param html HTML to parse
177     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
178     *
179     * @return parsed Document
180     */
181    public static Document parse(String html, String baseUri) {
182        TreeBuilder treeBuilder = new HtmlTreeBuilder();
183        return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder));
184    }
185
186    /**
187     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
188     *
189     * @param fragmentHtml the fragment of HTML to parse
190     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
191     * provides stack context (for implicit element creation).
192     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
193     *
194     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
195     */
196    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
197        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
198        return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, new Parser(treeBuilder));
199    }
200
201    /**
202     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
203     *
204     * @param fragmentHtml the fragment of HTML to parse
205     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
206     * provides stack context (for implicit element creation).
207     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
208     * @param errorList list to add errors to
209     *
210     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
211     */
212    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) {
213        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
214        Parser parser = new Parser(treeBuilder);
215        parser.errors = errorList;
216        return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, parser);
217    }
218
219    /**
220     * Parse a fragment of XML into a list of nodes.
221     *
222     * @param fragmentXml the fragment of XML to parse
223     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
224     * @return list of nodes parsed from the input XML.
225     */
226    public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
227        XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
228        return treeBuilder.parseFragment(new StringReader(fragmentXml), null, baseUri, new Parser(treeBuilder));
229    }
230
231    /**
232     * Parse a fragment of HTML into the {@code body} of a Document.
233     *
234     * @param bodyHtml fragment of HTML
235     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
236     *
237     * @return Document, with empty head, and HTML parsed into body
238     */
239    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
240        Document doc = Document.createShell(baseUri);
241        Element body = doc.body();
242        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
243        Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented
244        for (int i = nodes.length - 1; i > 0; i--) {
245            nodes[i].remove();
246        }
247        for (Node node : nodes) {
248            body.appendChild(node);
249        }
250        return doc;
251    }
252
253    /**
254     * Utility method to unescape HTML entities from a string
255     * @param string HTML escaped string
256     * @param inAttribute if the string is to be escaped in strict mode (as attributes are)
257     * @return an unescaped string
258     */
259    public static String unescapeEntities(String string, boolean inAttribute) {
260        Parser parser = Parser.htmlParser();
261        parser.treeBuilder.initialiseParse(new StringReader(string), "", parser);
262        Tokeniser tokeniser = new Tokeniser(parser.treeBuilder);
263        return tokeniser.unescapeEntities(inAttribute);
264    }
265
266    // builders
267
268    /**
269     * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
270     * based on a knowledge of the semantics of the incoming tags.
271     * @return a new HTML parser.
272     */
273    public static Parser htmlParser() {
274        return new Parser(new HtmlTreeBuilder());
275    }
276
277    /**
278     * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
279     * rather creates a simple tree directly from the input.
280     * @return a new simple XML parser.
281     */
282    public static Parser xmlParser() {
283        return new Parser(new XmlTreeBuilder());
284    }
285}