package org.jsoup.parser;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jspecify.annotations.Nullable;

import java.io.Reader;
import java.io.StringReader;
import java.util.List;

/**
 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
 {@link org.jsoup.Jsoup}.
 <p>Note that a Parser instance object is not threadsafe. To reuse a Parser configuration in a multi-threaded
 environment, use {@link #newInstance()} to make copies. */
public class Parser {
    /** Namespace URI for HTML (XHTML). */
    public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml";
    /** Namespace URI for XML. */
    public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace";
    /** Namespace URI for MathML. */
    public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML";
    /** Namespace URI for SVG. */
    public static final String NamespaceSvg = "http://www.w3.org/2000/svg";

    private TreeBuilder treeBuilder;
    private ParseErrorList errors;
    private ParseSettings settings;
    private boolean trackPosition = false;

    /**
     * Create a new Parser, using the specified TreeBuilder. The parser starts with the TreeBuilder's default
     * settings, and with error tracking disabled.
     * @param treeBuilder TreeBuilder to use to parse input into Documents.
     */
    public Parser(TreeBuilder treeBuilder) {
        this.treeBuilder = treeBuilder;
        settings = treeBuilder.defaultSettings();
        errors = ParseErrorList.noTracking();
    }

    /**
     Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use.
     @return a copied parser
     */
    public Parser newInstance() {
        return new Parser(this);
    }

    /**
     * Copy constructor backing {@link #newInstance()}.
     * @param copy the parser to copy the configuration from
     */
    private Parser(Parser copy) {
        treeBuilder = copy.treeBuilder.newInstance(); // because extended
        errors = new ParseErrorList(copy.errors); // only copies size, not contents
        settings = new ParseSettings(copy.settings);
        trackPosition = copy.trackPosition;
    }

    /**
     * Parse the supplied input into a Document, using this Parser's current TreeBuilder and configuration.
     * @param html the input to parse
     * @param baseUri base URI passed through to the TreeBuilder
     * @return the Document produced by the TreeBuilder
     */
    public Document parseInput(String html, String baseUri) {
        return treeBuilder.parse(new StringReader(html), baseUri, this);
    }

    /**
     * Parse the content of the supplied Reader into a Document, using this Parser's current TreeBuilder and
     * configuration. The Reader is handed to the TreeBuilder as is.
     * @param inputHtml the Reader supplying the input to parse
     * @param baseUri base URI passed through to the TreeBuilder
     * @return the Document produced by the TreeBuilder
     */
    public Document parseInput(Reader inputHtml, String baseUri) {
        return treeBuilder.parse(inputHtml, baseUri, this);
    }

    /**
     * Parse a fragment into a list of nodes, using this Parser's current TreeBuilder and configuration.
     * @param fragment the fragment to parse
     * @param context (optional) the element that provides parsing context for the fragment; may be {@code null}
     * @param baseUri base URI passed through to the TreeBuilder
     * @return the list of nodes produced by the TreeBuilder
     */
    public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) {
        return treeBuilder.parseFragment(fragment, context, baseUri, this);
    }

    // gets & sets
    /**
     * Get the TreeBuilder currently in use.
     * @return current TreeBuilder.
     */
    public TreeBuilder getTreeBuilder() {
        return treeBuilder;
    }

    /**
     * Update the TreeBuilder used when parsing content. The supplied TreeBuilder is also pointed back at this Parser.
     * @param treeBuilder new TreeBuilder
     * @return this, for chaining
     */
    public Parser setTreeBuilder(TreeBuilder treeBuilder) {
        // Dereference the argument first, so that a null argument fails before this Parser's state is changed.
        treeBuilder.parser = this;
        this.treeBuilder = treeBuilder;
        return this;
    }

    /**
     * Check if parse error tracking is enabled.
     * @return current track error state.
     */
    public boolean isTrackErrors() {
        return errors.getMaxSize() > 0;
    }

    /**
     * Enable or disable parse error tracking for the next parse. Calling this replaces the current error list.
     * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
     * @return this, for chaining
     */
    public Parser setTrackErrors(int maxErrors) {
        errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
        return this;
    }

    /**
     * Retrieve the parse errors, if any, from the last parse.
     * @return list of parse errors, up to the size of the maximum errors tracked.
     * @see #setTrackErrors(int)
     */
    public ParseErrorList getErrors() {
        return errors;
    }

    /**
     Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input
     source they were created from. By default, tracking is not enabled.
     @return current track position setting
     */
    public boolean isTrackPosition() {
        return trackPosition;
    }

    /**
     Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original
     input source they were created from.
     @param trackPosition position tracking setting; {@code true} to enable
     @return this Parser, for chaining
     */
    public Parser setTrackPosition(boolean trackPosition) {
        this.trackPosition = trackPosition;
        return this;
    }

    /**
     Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes.
     @param settings the new settings
     @return this Parser
     */
    public Parser settings(ParseSettings settings) {
        this.settings = settings;
        return this;
    }

    /**
     Gets the current ParseSettings for this Parser
     @return current ParseSettings
     */
    public ParseSettings settings() {
        return settings;
    }

    /**
     (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as
     Data Nodes). Delegates to the current TreeBuilder.
     @param normalName the tag name to test; presumably already normalized, as the parameter name suggests
     @return the answer given by the current TreeBuilder for that tag name
     */
    public boolean isContentForTagData(String normalName) {
        return getTreeBuilder().isContentForTagData(normalName);
    }

    /**
     Gets the default namespace, as reported by the current TreeBuilder.
     @return the current TreeBuilder's default namespace
     */
    public String defaultNamespace() {
        return getTreeBuilder().defaultNamespace();
    }

    // static parse functions below
    /**
     * Parse HTML into a Document.
     *
     * @param html HTML to parse
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     *
     * @return parsed Document
     */
    public static Document parse(String html, String baseUri) {
        TreeBuilder treeBuilder = new HtmlTreeBuilder();
        return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder));
    }

    /**
     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
     *
     * @param fragmentHtml the fragment of HTML to parse
     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
     * provides stack context (for implicit element creation).
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     *
     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
     */
    public static List<Node> parseFragment(String fragmentHtml, @Nullable Element context, String baseUri) {
        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
        return treeBuilder.parseFragment(fragmentHtml, context, baseUri, new Parser(treeBuilder));
    }

    /**
     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
     *
     * @param fragmentHtml the fragment of HTML to parse
     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
     * provides stack context (for implicit element creation).
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     * @param errorList list to add errors to
     *
     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
     */
    public static List<Node> parseFragment(String fragmentHtml, @Nullable Element context, String baseUri, ParseErrorList errorList) {
        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
        Parser parser = new Parser(treeBuilder);
        parser.errors = errorList; // use the caller's list in place of the parser's default (non-tracking) one
        return treeBuilder.parseFragment(fragmentHtml, context, baseUri, parser);
    }

    /**
     * Parse a fragment of XML into a list of nodes.
     *
     * @param fragmentXml the fragment of XML to parse
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     * @return list of nodes parsed from the input XML.
     */
    public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
        XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
        return treeBuilder.parseFragment(fragmentXml, null, baseUri, new Parser(treeBuilder));
    }

    /**
     * Parse a fragment of HTML into the {@code body} of a Document.
     *
     * @param bodyHtml fragment of HTML
     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
     *
     * @return Document, with empty head, and HTML parsed into body
     */
    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
        Document doc = Document.createShell(baseUri);
        Element body = doc.body();
        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
        Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented
        // NOTE(review): this loop stops before index 0, so the first node is not explicitly removed here;
        // presumably appendChild below re-parents it - confirm before changing the bound.
        for (int i = nodes.length - 1; i > 0; i--) {
            nodes[i].remove();
        }
        for (Node node : nodes) {
            body.appendChild(node);
        }
        return doc;
    }

    /**
     * Utility method to unescape HTML entities from a string
     * @param string HTML escaped string
     * @param inAttribute if the string is to be unescaped in strict mode (as attributes are)
     * @return an unescaped string
     */
    public static String unescapeEntities(String string, boolean inAttribute) {
        Parser parser = Parser.htmlParser();
        // the Tokeniser is built from the TreeBuilder, so the TreeBuilder is initialised on the input first
        parser.treeBuilder.initialiseParse(new StringReader(string), "", parser);
        Tokeniser tokeniser = new Tokeniser(parser.treeBuilder);
        return tokeniser.unescapeEntities(inAttribute);
    }

    // builders

    /**
     * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
     * based on a knowledge of the semantics of the incoming tags.
     * @return a new HTML parser.
     */
    public static Parser htmlParser() {
        return new Parser(new HtmlTreeBuilder());
    }

    /**
     * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
     * rather creates a simple tree directly from the input.
     * @return a new simple XML parser.
     */
    public static Parser xmlParser() {
        return new Parser(new XmlTreeBuilder());
    }
}