001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.nodes.Document; 005import org.jsoup.nodes.Element; 006import org.jsoup.nodes.Node; 007import org.jspecify.annotations.Nullable; 008 009import java.io.Reader; 010import java.io.StringReader; 011import java.util.List; 012import java.util.concurrent.locks.ReentrantLock; 013 014/** 015 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in 016 {@link org.jsoup.Jsoup}. 017 <p>Note that a given Parser instance object is threadsafe, but not concurrent. (Concurrent parse calls will 018 synchronize.) To reuse a Parser configuration in a multithreaded environment, use {@link #newInstance()} to make 019 copies.</p> 020 */ 021public class Parser implements Cloneable { 022 public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml"; 023 public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace"; 024 public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML"; 025 public static final String NamespaceSvg = "http://www.w3.org/2000/svg"; 026 027 private final TreeBuilder treeBuilder; 028 private ParseErrorList errors; 029 private ParseSettings settings; 030 private boolean trackPosition = false; 031 private @Nullable TagSet tagSet; 032 private final ReentrantLock lock = new ReentrantLock(); 033 034 /** 035 * Create a new Parser, using the specified TreeBuilder 036 * @param treeBuilder TreeBuilder to use to parse input into Documents. 037 */ 038 public Parser(TreeBuilder treeBuilder) { 039 this.treeBuilder = treeBuilder; 040 settings = treeBuilder.defaultSettings(); 041 errors = ParseErrorList.noTracking(); 042 } 043 044 /** 045 Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use. 046 @return a copied parser 047 */ 048 public Parser newInstance() { 049 return new Parser(this); 050 } 051 052 @SuppressWarnings("MethodDoesntCallSuperMethod") // because we use the copy constructor instead 053 @Override 054 public Parser clone() { 055 return new Parser(this); 056 } 057 058 private Parser(Parser copy) { 059 treeBuilder = copy.treeBuilder.newInstance(); // because extended 060 errors = new ParseErrorList(copy.errors); // only copies size, not contents 061 settings = new ParseSettings(copy.settings); 062 trackPosition = copy.trackPosition; 063 } 064 065 /** 066 Parse the contents of a String. 067 068 @param html HTML to parse 069 @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 070 @return parsed Document 071 */ 072 public Document parseInput(String html, String baseUri) { 073 return parseInput(new StringReader(html), baseUri); 074 } 075 076 /** 077 Parse the contents of Reader. 078 079 @param inputHtml HTML to parse 080 @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 081 @return parsed Document 082 @throws java.io.UncheckedIOException if an I/O error occurs in the Reader 083 */ 084 public Document parseInput(Reader inputHtml, String baseUri) { 085 try { 086 lock.lock(); // using a lock vs synchronized to support loom threads 087 return treeBuilder.parse(inputHtml, baseUri, this); 088 } finally { 089 lock.unlock(); 090 } 091 } 092 093 /** 094 Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 095 096 @param fragment the fragment of HTML to parse 097 @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). 098 @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 099 @return list of nodes parsed from the input HTML. 100 */ 101 public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) { 102 return parseFragmentInput(new StringReader(fragment), context, baseUri); 103 } 104 105 /** 106 Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 107 108 @param fragment the fragment of HTML to parse 109 @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). 110 @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 111 @return list of nodes parsed from the input HTML. 112 @throws java.io.UncheckedIOException if an I/O error occurs in the Reader 113 */ 114 public List<Node> parseFragmentInput(Reader fragment, @Nullable Element context, String baseUri) { 115 try { 116 lock.lock(); 117 return treeBuilder.parseFragment(fragment, context, baseUri, this); 118 } finally { 119 lock.unlock(); 120 } 121 } 122 123 // gets & sets 124 /** 125 * Get the TreeBuilder currently in use. 126 * @return current TreeBuilder. 127 */ 128 public TreeBuilder getTreeBuilder() { 129 return treeBuilder; 130 } 131 132 /** 133 * Check if parse error tracking is enabled. 134 * @return current track error state. 135 */ 136 public boolean isTrackErrors() { 137 return errors.getMaxSize() > 0; 138 } 139 140 /** 141 * Enable or disable parse error tracking for the next parse. 142 * @param maxErrors the maximum number of errors to track. Set to 0 to disable. 143 * @return this, for chaining 144 */ 145 public Parser setTrackErrors(int maxErrors) { 146 errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking(); 147 return this; 148 } 149 150 /** 151 * Retrieve the parse errors, if any, from the last parse. 152 * @return list of parse errors, up to the size of the maximum errors tracked. 153 * @see #setTrackErrors(int) 154 */ 155 public ParseErrorList getErrors() { 156 return errors; 157 } 158 159 /** 160 Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input 161 source they were created from. By default, tracking is not enabled. 162 * @return current track position setting 163 */ 164 public boolean isTrackPosition() { 165 return trackPosition; 166 } 167 168 /** 169 Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original 170 input source they were created from. 171 @param trackPosition position tracking setting; {@code true} to enable 172 @return this Parser, for chaining 173 */ 174 public Parser setTrackPosition(boolean trackPosition) { 175 this.trackPosition = trackPosition; 176 return this; 177 } 178 179 /** 180 Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes. 181 * @param settings the new settings 182 * @return this Parser 183 */ 184 public Parser settings(ParseSettings settings) { 185 this.settings = settings; 186 return this; 187 } 188 189 /** 190 Gets the current ParseSettings for this Parser 191 * @return current ParseSettings 192 */ 193 public ParseSettings settings() { 194 return settings; 195 } 196 197 /** 198 Set a custom TagSet to use for this Parser. This allows you to define your own tags, and control how they are 199 parsed. For example, you can set a tag to preserve whitespace, or to be treated as a block tag. 200 <p>You can start with the {@link TagSet#Html()} defaults and customize, or a new empty TagSet.</p> 201 202 @param tagSet the TagSet to use. This gets copied, so that changes that the parse makes (tags found in the document will be added) do not clobber the original TagSet. 203 @return this Parser 204 @since 1.20.1 205 */ 206 public Parser tagSet(TagSet tagSet) { 207 Validate.notNull(tagSet); 208 this.tagSet = new TagSet(tagSet); // copy it as we are going to mutate it 209 return this; 210 } 211 212 /** 213 Get the current TagSet for this Parser, which will be either this parser's default, or one that you have set. 214 @return the current TagSet. After the parse, this will contain any new tags that were found in the document. 215 @since 1.20.1 216 */ 217 public TagSet tagSet() { 218 if (tagSet == null) 219 tagSet = treeBuilder.defaultTagSet(); 220 return tagSet; 221 } 222 223 public String defaultNamespace() { 224 return getTreeBuilder().defaultNamespace(); 225 } 226 227 // static parse functions below 228 /** 229 * Parse HTML into a Document. 230 * 231 * @param html HTML to parse 232 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 233 * 234 * @return parsed Document 235 */ 236 public static Document parse(String html, String baseUri) { 237 TreeBuilder treeBuilder = new HtmlTreeBuilder(); 238 return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder)); 239 } 240 241 /** 242 * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 243 * 244 * @param fragmentHtml the fragment of HTML to parse 245 * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This 246 * provides stack context (for implicit element creation). 247 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 248 * 249 * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. 250 */ 251 public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) { 252 HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); 253 return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, new Parser(treeBuilder)); 254 } 255 256 /** 257 * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 258 * 259 * @param fragmentHtml the fragment of HTML to parse 260 * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This 261 * provides stack context (for implicit element creation). 262 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 263 * @param errorList list to add errors to 264 * 265 * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. 266 */ 267 public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) { 268 HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); 269 Parser parser = new Parser(treeBuilder); 270 parser.errors = errorList; 271 return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, parser); 272 } 273 274 /** 275 * Parse a fragment of XML into a list of nodes. 276 * 277 * @param fragmentXml the fragment of XML to parse 278 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 279 * @return list of nodes parsed from the input XML. 280 */ 281 public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) { 282 XmlTreeBuilder treeBuilder = new XmlTreeBuilder(); 283 return treeBuilder.parseFragment(new StringReader(fragmentXml), null, baseUri, new Parser(treeBuilder)); 284 } 285 286 /** 287 * Parse a fragment of HTML into the {@code body} of a Document. 288 * 289 * @param bodyHtml fragment of HTML 290 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 291 * 292 * @return Document, with empty head, and HTML parsed into body 293 */ 294 public static Document parseBodyFragment(String bodyHtml, String baseUri) { 295 Document doc = Document.createShell(baseUri); 296 Element body = doc.body(); 297 List<Node> nodeList = parseFragment(bodyHtml, body, baseUri); 298 Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented 299 for (int i = nodes.length - 1; i > 0; i--) { 300 nodes[i].remove(); 301 } 302 for (Node node : nodes) { 303 body.appendChild(node); 304 } 305 return doc; 306 } 307 308 /** 309 * Utility method to unescape HTML entities from a string 310 * @param string HTML escaped string 311 * @param inAttribute if the string is to be escaped in strict mode (as attributes are) 312 * @return an unescaped string 313 */ 314 public static String unescapeEntities(String string, boolean inAttribute) { 315 Parser parser = Parser.htmlParser(); 316 parser.treeBuilder.initialiseParse(new StringReader(string), "", parser); 317 Tokeniser tokeniser = new Tokeniser(parser.treeBuilder); 318 return tokeniser.unescapeEntities(inAttribute); 319 } 320 321 // builders 322 323 /** 324 * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document, 325 * based on a knowledge of the semantics of the incoming tags. 326 * @return a new HTML parser. 327 */ 328 public static Parser htmlParser() { 329 return new Parser(new HtmlTreeBuilder()); 330 } 331 332 /** 333 * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML, 334 * rather creates a simple tree directly from the input. 335 * @return a new simple XML parser. 336 */ 337 public static Parser xmlParser() { 338 return new Parser(new XmlTreeBuilder()); 339 } 340}