001package org.jsoup.parser;
002
003import org.jsoup.nodes.Document;
004import org.jsoup.nodes.Element;
005import org.jsoup.nodes.Node;
006import org.jspecify.annotations.Nullable;
007
008import java.io.Reader;
009import java.io.StringReader;
010import java.util.List;
011
012/**
013 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
014 {@link org.jsoup.Jsoup}.
015 <p>Note that a Parser instance object is not threadsafe. To reuse a Parser configuration in a multi-threaded
016 environment, use {@link #newInstance()} to make copies. */
017public class Parser {
018    public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml";
019    public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace";
020    public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML";
021    public static final String NamespaceSvg = "http://www.w3.org/2000/svg";
022
023    private TreeBuilder treeBuilder;
024    private ParseErrorList errors;
025    private ParseSettings settings;
026    private boolean trackPosition = false;
027
028    /**
029     * Create a new Parser, using the specified TreeBuilder
030     * @param treeBuilder TreeBuilder to use to parse input into Documents.
031     */
032    public Parser(TreeBuilder treeBuilder) {
033        this.treeBuilder = treeBuilder;
034        settings = treeBuilder.defaultSettings();
035        errors = ParseErrorList.noTracking();
036    }
037
038    /**
039     Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use.
040     @return a copied parser
041     */
042    public Parser newInstance() {
043        return new Parser(this);
044    }
045
046    private Parser(Parser copy) {
047        treeBuilder = copy.treeBuilder.newInstance(); // because extended
048        errors = new ParseErrorList(copy.errors); // only copies size, not contents
049        settings = new ParseSettings(copy.settings);
050        trackPosition = copy.trackPosition;
051    }
052    
053    public Document parseInput(String html, String baseUri) {
054        return treeBuilder.parse(new StringReader(html), baseUri, this);
055    }
056
057    public Document parseInput(Reader inputHtml, String baseUri) {
058        return treeBuilder.parse(inputHtml, baseUri, this);
059    }
060
061    public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) {
062        return treeBuilder.parseFragment(fragment, context, baseUri, this);
063    }
064    // gets & sets
065    /**
066     * Get the TreeBuilder currently in use.
067     * @return current TreeBuilder.
068     */
069    public TreeBuilder getTreeBuilder() {
070        return treeBuilder;
071    }
072
073    /**
074     * Update the TreeBuilder used when parsing content.
075     * @param treeBuilder new TreeBuilder
076     * @return this, for chaining
077     */
078    public Parser setTreeBuilder(TreeBuilder treeBuilder) {
079        this.treeBuilder = treeBuilder;
080        treeBuilder.parser = this;
081        return this;
082    }
083
084    /**
085     * Check if parse error tracking is enabled.
086     * @return current track error state.
087     */
088    public boolean isTrackErrors() {
089        return errors.getMaxSize() > 0;
090    }
091
092    /**
093     * Enable or disable parse error tracking for the next parse.
094     * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
095     * @return this, for chaining
096     */
097    public Parser setTrackErrors(int maxErrors) {
098        errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
099        return this;
100    }
101
102    /**
103     * Retrieve the parse errors, if any, from the last parse.
104     * @return list of parse errors, up to the size of the maximum errors tracked.
105     * @see #setTrackErrors(int)
106     */
107    public ParseErrorList getErrors() {
108        return errors;
109    }
110
111    /**
112     Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input
113     source they were created from. By default, tracking is not enabled.
114     * @return current track position setting
115     */
116    public boolean isTrackPosition() {
117        return trackPosition;
118    }
119
120    /**
121     Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original
122     input source they were created from.
123     @param trackPosition position tracking setting; {@code true} to enable
124     @return this Parser, for chaining
125     */
126    public Parser setTrackPosition(boolean trackPosition) {
127        this.trackPosition = trackPosition;
128        return this;
129    }
130
131    /**
132     Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes.
133     * @param settings the new settings
134     * @return this Parser
135     */
136    public Parser settings(ParseSettings settings) {
137        this.settings = settings;
138        return this;
139    }
140
141    /**
142     Gets the current ParseSettings for this Parser
143     * @return current ParseSettings
144     */
145    public ParseSettings settings() {
146        return settings;
147    }
148
149    /**
150     (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as
151     Data Nodes).
152     */
153    public boolean isContentForTagData(String normalName) {
154        return getTreeBuilder().isContentForTagData(normalName);
155    }
156
157    public String defaultNamespace() {
158        return getTreeBuilder().defaultNamespace();
159    }
160
161    // static parse functions below
162    /**
163     * Parse HTML into a Document.
164     *
165     * @param html HTML to parse
166     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
167     *
168     * @return parsed Document
169     */
170    public static Document parse(String html, String baseUri) {
171        TreeBuilder treeBuilder = new HtmlTreeBuilder();
172        return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder));
173    }
174
175    /**
176     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
177     *
178     * @param fragmentHtml the fragment of HTML to parse
179     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
180     * provides stack context (for implicit element creation).
181     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
182     *
183     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
184     */
185    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
186        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
187        return treeBuilder.parseFragment(fragmentHtml, context, baseUri, new Parser(treeBuilder));
188    }
189
190    /**
191     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
192     *
193     * @param fragmentHtml the fragment of HTML to parse
194     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
195     * provides stack context (for implicit element creation).
196     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
197     * @param errorList list to add errors to
198     *
199     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
200     */
201    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) {
202        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
203        Parser parser = new Parser(treeBuilder);
204        parser.errors = errorList;
205        return treeBuilder.parseFragment(fragmentHtml, context, baseUri, parser);
206    }
207
208    /**
209     * Parse a fragment of XML into a list of nodes.
210     *
211     * @param fragmentXml the fragment of XML to parse
212     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
213     * @return list of nodes parsed from the input XML.
214     */
215    public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
216        XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
217        return treeBuilder.parseFragment(fragmentXml, null, baseUri, new Parser(treeBuilder));
218    }
219
220    /**
221     * Parse a fragment of HTML into the {@code body} of a Document.
222     *
223     * @param bodyHtml fragment of HTML
224     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
225     *
226     * @return Document, with empty head, and HTML parsed into body
227     */
228    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
229        Document doc = Document.createShell(baseUri);
230        Element body = doc.body();
231        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
232        Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented
233        for (int i = nodes.length - 1; i > 0; i--) {
234            nodes[i].remove();
235        }
236        for (Node node : nodes) {
237            body.appendChild(node);
238        }
239        return doc;
240    }
241
242    /**
243     * Utility method to unescape HTML entities from a string
244     * @param string HTML escaped string
245     * @param inAttribute if the string is to be escaped in strict mode (as attributes are)
246     * @return an unescaped string
247     */
248    public static String unescapeEntities(String string, boolean inAttribute) {
249        Parser parser = Parser.htmlParser();
250        parser.treeBuilder.initialiseParse(new StringReader(string), "", parser);
251        Tokeniser tokeniser = new Tokeniser(parser.treeBuilder);
252        return tokeniser.unescapeEntities(inAttribute);
253    }
254
255    // builders
256
257    /**
258     * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
259     * based on a knowledge of the semantics of the incoming tags.
260     * @return a new HTML parser.
261     */
262    public static Parser htmlParser() {
263        return new Parser(new HtmlTreeBuilder());
264    }
265
266    /**
267     * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
268     * rather creates a simple tree directly from the input.
269     * @return a new simple XML parser.
270     */
271    public static Parser xmlParser() {
272        return new Parser(new XmlTreeBuilder());
273    }
274}