001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.nodes.Document; 005import org.jsoup.nodes.Element; 006import org.jsoup.nodes.Node; 007import org.jspecify.annotations.Nullable; 008 009import java.io.Reader; 010import java.io.StringReader; 011import java.util.List; 012import java.util.concurrent.locks.ReentrantLock; 013 014/** 015 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in 016 {@link org.jsoup.Jsoup}. 017 <p>Note that a given Parser instance object is threadsafe, but not concurrent. (Concurrent parse calls will 018 synchronize.) To reuse a Parser configuration in a multithreaded environment, use {@link #newInstance()} to make 019 copies.</p> 020 */ 021public class Parser implements Cloneable { 022 public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml"; 023 public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace"; 024 public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML"; 025 public static final String NamespaceSvg = "http://www.w3.org/2000/svg"; 026 027 private TreeBuilder treeBuilder; 028 private ParseErrorList errors; 029 private ParseSettings settings; 030 private boolean trackPosition = false; 031 private @Nullable TagSet tagSet; 032 private final ReentrantLock lock = new ReentrantLock(); 033 034 /** 035 * Create a new Parser, using the specified TreeBuilder 036 * @param treeBuilder TreeBuilder to use to parse input into Documents. 037 */ 038 public Parser(TreeBuilder treeBuilder) { 039 this.treeBuilder = treeBuilder; 040 settings = treeBuilder.defaultSettings(); 041 errors = ParseErrorList.noTracking(); 042 } 043 044 /** 045 Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use. 046 @return a copied parser 047 */ 048 public Parser newInstance() { 049 return new Parser(this); 050 } 051 052 @SuppressWarnings("MethodDoesntCallSuperMethod") // because we use the copy constructor instead 053 @Override 054 public Parser clone() { 055 return new Parser(this); 056 } 057 058 private Parser(Parser copy) { 059 treeBuilder = copy.treeBuilder.newInstance(); // because extended 060 errors = new ParseErrorList(copy.errors); // only copies size, not contents 061 settings = new ParseSettings(copy.settings); 062 trackPosition = copy.trackPosition; 063 } 064 065 public Document parseInput(String html, String baseUri) { 066 return parseInput(new StringReader(html), baseUri); 067 } 068 069 public Document parseInput(Reader inputHtml, String baseUri) { 070 try { 071 lock.lock(); // using a lock vs synchronized to support loom threads 072 return treeBuilder.parse(inputHtml, baseUri, this); 073 } finally { 074 lock.unlock(); 075 } 076 } 077 078 public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) { 079 return parseFragmentInput(new StringReader(fragment), context, baseUri); 080 } 081 082 public List<Node> parseFragmentInput(Reader fragment, @Nullable Element context, String baseUri) { 083 try { 084 lock.lock(); 085 return treeBuilder.parseFragment(fragment, context, baseUri, this); 086 } finally { 087 lock.unlock(); 088 } 089 } 090 091 // gets & sets 092 /** 093 * Get the TreeBuilder currently in use. 094 * @return current TreeBuilder. 095 */ 096 public TreeBuilder getTreeBuilder() { 097 return treeBuilder; 098 } 099 100 /** 101 * Update the TreeBuilder used when parsing content. 102 * @param treeBuilder new TreeBuilder 103 * @return this, for chaining 104 * @deprecated unused method, will be removed in 1.21.1 105 */ 106 @Deprecated public Parser setTreeBuilder(TreeBuilder treeBuilder) { 107 this.treeBuilder = treeBuilder; 108 treeBuilder.parser = this; 109 return this; 110 } 111 112 /** 113 * Check if parse error tracking is enabled. 114 * @return current track error state. 115 */ 116 public boolean isTrackErrors() { 117 return errors.getMaxSize() > 0; 118 } 119 120 /** 121 * Enable or disable parse error tracking for the next parse. 122 * @param maxErrors the maximum number of errors to track. Set to 0 to disable. 123 * @return this, for chaining 124 */ 125 public Parser setTrackErrors(int maxErrors) { 126 errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking(); 127 return this; 128 } 129 130 /** 131 * Retrieve the parse errors, if any, from the last parse. 132 * @return list of parse errors, up to the size of the maximum errors tracked. 133 * @see #setTrackErrors(int) 134 */ 135 public ParseErrorList getErrors() { 136 return errors; 137 } 138 139 /** 140 Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input 141 source they were created from. By default, tracking is not enabled. 142 * @return current track position setting 143 */ 144 public boolean isTrackPosition() { 145 return trackPosition; 146 } 147 148 /** 149 Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original 150 input source they were created from. 151 @param trackPosition position tracking setting; {@code true} to enable 152 @return this Parser, for chaining 153 */ 154 public Parser setTrackPosition(boolean trackPosition) { 155 this.trackPosition = trackPosition; 156 return this; 157 } 158 159 /** 160 Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes. 161 * @param settings the new settings 162 * @return this Parser 163 */ 164 public Parser settings(ParseSettings settings) { 165 this.settings = settings; 166 return this; 167 } 168 169 /** 170 Gets the current ParseSettings for this Parser 171 * @return current ParseSettings 172 */ 173 public ParseSettings settings() { 174 return settings; 175 } 176 177 /** 178 Set a custom TagSet to use for this Parser. This allows you to define your own tags, and control how they are 179 parsed. For example, you can set a tag to preserve whitespace, or to be treated as a block tag. 180 <p>You can start with the {@link TagSet#Html()} defaults and customize, or a new empty TagSet.</p> 181 182 @param tagSet the TagSet to use. This gets copied, so that changes that the parse makes (tags found in the document will be added) do not clobber the original TagSet. 183 @return this Parser 184 @since 1.20.1 185 */ 186 public Parser tagSet(TagSet tagSet) { 187 Validate.notNull(tagSet); 188 this.tagSet = new TagSet(tagSet); // copy it as we are going to mutate it 189 return this; 190 } 191 192 /** 193 Get the current TagSet for this Parser, which will be either this parser's default, or one that you have set. 194 @return the current TagSet. After the parse, this will contain any new tags that were found in the document. 195 @since 1.20.1 196 */ 197 public TagSet tagSet() { 198 if (tagSet == null) 199 tagSet = treeBuilder.defaultTagSet(); 200 return tagSet; 201 } 202 203 /** 204 (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as 205 Data Nodes). 206 @deprecated internal method, no longer used, and will be removed in 1.12.1. 207 */ 208 @Deprecated public boolean isContentForTagData(String normalName) { 209 return tagSet().valueOf(normalName, defaultNamespace()).is(Tag.Data); 210 } 211 212 public String defaultNamespace() { 213 return getTreeBuilder().defaultNamespace(); 214 } 215 216 // static parse functions below 217 /** 218 * Parse HTML into a Document. 219 * 220 * @param html HTML to parse 221 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 222 * 223 * @return parsed Document 224 */ 225 public static Document parse(String html, String baseUri) { 226 TreeBuilder treeBuilder = new HtmlTreeBuilder(); 227 return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder)); 228 } 229 230 /** 231 * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 232 * 233 * @param fragmentHtml the fragment of HTML to parse 234 * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This 235 * provides stack context (for implicit element creation). 236 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 237 * 238 * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. 239 */ 240 public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) { 241 HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); 242 return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, new Parser(treeBuilder)); 243 } 244 245 /** 246 * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 247 * 248 * @param fragmentHtml the fragment of HTML to parse 249 * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This 250 * provides stack context (for implicit element creation). 251 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 252 * @param errorList list to add errors to 253 * 254 * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. 255 */ 256 public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) { 257 HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); 258 Parser parser = new Parser(treeBuilder); 259 parser.errors = errorList; 260 return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, parser); 261 } 262 263 /** 264 * Parse a fragment of XML into a list of nodes. 265 * 266 * @param fragmentXml the fragment of XML to parse 267 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 268 * @return list of nodes parsed from the input XML. 269 */ 270 public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) { 271 XmlTreeBuilder treeBuilder = new XmlTreeBuilder(); 272 return treeBuilder.parseFragment(new StringReader(fragmentXml), null, baseUri, new Parser(treeBuilder)); 273 } 274 275 /** 276 * Parse a fragment of HTML into the {@code body} of a Document. 277 * 278 * @param bodyHtml fragment of HTML 279 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 280 * 281 * @return Document, with empty head, and HTML parsed into body 282 */ 283 public static Document parseBodyFragment(String bodyHtml, String baseUri) { 284 Document doc = Document.createShell(baseUri); 285 Element body = doc.body(); 286 List<Node> nodeList = parseFragment(bodyHtml, body, baseUri); 287 Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented 288 for (int i = nodes.length - 1; i > 0; i--) { 289 nodes[i].remove(); 290 } 291 for (Node node : nodes) { 292 body.appendChild(node); 293 } 294 return doc; 295 } 296 297 /** 298 * Utility method to unescape HTML entities from a string 299 * @param string HTML escaped string 300 * @param inAttribute if the string is to be escaped in strict mode (as attributes are) 301 * @return an unescaped string 302 */ 303 public static String unescapeEntities(String string, boolean inAttribute) { 304 Parser parser = Parser.htmlParser(); 305 parser.treeBuilder.initialiseParse(new StringReader(string), "", parser); 306 Tokeniser tokeniser = new Tokeniser(parser.treeBuilder); 307 return tokeniser.unescapeEntities(inAttribute); 308 } 309 310 // builders 311 312 /** 313 * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document, 314 * based on a knowledge of the semantics of the incoming tags. 315 * @return a new HTML parser. 316 */ 317 public static Parser htmlParser() { 318 return new Parser(new HtmlTreeBuilder()); 319 } 320 321 /** 322 * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML, 323 * rather creates a simple tree directly from the input. 324 * @return a new simple XML parser. 325 */ 326 public static Parser xmlParser() { 327 return new Parser(new XmlTreeBuilder()); 328 } 329}