001package org.jsoup.parser; 002 003import org.jsoup.nodes.Document; 004import org.jsoup.nodes.Element; 005import org.jsoup.nodes.Node; 006import org.jspecify.annotations.Nullable; 007 008import java.io.Reader; 009import java.io.StringReader; 010import java.util.List; 011 012/** 013 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in 014 {@link org.jsoup.Jsoup}. 015 <p>Note that a Parser instance object is not threadsafe. To reuse a Parser configuration in a multi-threaded 016 environment, use {@link #newInstance()} to make copies. */ 017public class Parser implements Cloneable { 018 public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml"; 019 public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace"; 020 public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML"; 021 public static final String NamespaceSvg = "http://www.w3.org/2000/svg"; 022 023 private TreeBuilder treeBuilder; 024 private ParseErrorList errors; 025 private ParseSettings settings; 026 private boolean trackPosition = false; 027 028 /** 029 * Create a new Parser, using the specified TreeBuilder 030 * @param treeBuilder TreeBuilder to use to parse input into Documents. 031 */ 032 public Parser(TreeBuilder treeBuilder) { 033 this.treeBuilder = treeBuilder; 034 settings = treeBuilder.defaultSettings(); 035 errors = ParseErrorList.noTracking(); 036 } 037 038 /** 039 Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use. 040 @return a copied parser 041 */ 042 public Parser newInstance() { 043 return new Parser(this); 044 } 045 046 @SuppressWarnings("MethodDoesntCallSuperMethod") // because we use the copy constructor instead 047 @Override 048 public Parser clone() { 049 return new Parser(this); 050 } 051 052 private Parser(Parser copy) { 053 treeBuilder = copy.treeBuilder.newInstance(); // because extended 054 errors = new ParseErrorList(copy.errors); // only copies size, not contents 055 settings = new ParseSettings(copy.settings); 056 trackPosition = copy.trackPosition; 057 } 058 059 public Document parseInput(String html, String baseUri) { 060 return parseInput(new StringReader(html), baseUri); 061 } 062 063 public Document parseInput(Reader inputHtml, String baseUri) { 064 return treeBuilder.parse(inputHtml, baseUri, this); 065 } 066 067 public List<Node> parseFragmentInput(String fragment, @Nullable Element context, String baseUri) { 068 return parseFragmentInput(new StringReader(fragment), context, baseUri); 069 } 070 071 public List<Node> parseFragmentInput(Reader fragment, @Nullable Element context, String baseUri) { 072 return treeBuilder.parseFragment(fragment, context, baseUri, this); 073 } 074 075 // gets & sets 076 /** 077 * Get the TreeBuilder currently in use. 078 * @return current TreeBuilder. 079 */ 080 public TreeBuilder getTreeBuilder() { 081 return treeBuilder; 082 } 083 084 /** 085 * Update the TreeBuilder used when parsing content. 086 * @param treeBuilder new TreeBuilder 087 * @return this, for chaining 088 */ 089 public Parser setTreeBuilder(TreeBuilder treeBuilder) { 090 this.treeBuilder = treeBuilder; 091 treeBuilder.parser = this; 092 return this; 093 } 094 095 /** 096 * Check if parse error tracking is enabled. 097 * @return current track error state. 098 */ 099 public boolean isTrackErrors() { 100 return errors.getMaxSize() > 0; 101 } 102 103 /** 104 * Enable or disable parse error tracking for the next parse. 105 * @param maxErrors the maximum number of errors to track. Set to 0 to disable. 106 * @return this, for chaining 107 */ 108 public Parser setTrackErrors(int maxErrors) { 109 errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking(); 110 return this; 111 } 112 113 /** 114 * Retrieve the parse errors, if any, from the last parse. 115 * @return list of parse errors, up to the size of the maximum errors tracked. 116 * @see #setTrackErrors(int) 117 */ 118 public ParseErrorList getErrors() { 119 return errors; 120 } 121 122 /** 123 Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input 124 source they were created from. By default, tracking is not enabled. 125 * @return current track position setting 126 */ 127 public boolean isTrackPosition() { 128 return trackPosition; 129 } 130 131 /** 132 Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original 133 input source they were created from. 134 @param trackPosition position tracking setting; {@code true} to enable 135 @return this Parser, for chaining 136 */ 137 public Parser setTrackPosition(boolean trackPosition) { 138 this.trackPosition = trackPosition; 139 return this; 140 } 141 142 /** 143 Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes. 144 * @param settings the new settings 145 * @return this Parser 146 */ 147 public Parser settings(ParseSettings settings) { 148 this.settings = settings; 149 return this; 150 } 151 152 /** 153 Gets the current ParseSettings for this Parser 154 * @return current ParseSettings 155 */ 156 public ParseSettings settings() { 157 return settings; 158 } 159 160 /** 161 (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as 162 Data Nodes). 163 */ 164 public boolean isContentForTagData(String normalName) { 165 return getTreeBuilder().isContentForTagData(normalName); 166 } 167 168 public String defaultNamespace() { 169 return getTreeBuilder().defaultNamespace(); 170 } 171 172 // static parse functions below 173 /** 174 * Parse HTML into a Document. 175 * 176 * @param html HTML to parse 177 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 178 * 179 * @return parsed Document 180 */ 181 public static Document parse(String html, String baseUri) { 182 TreeBuilder treeBuilder = new HtmlTreeBuilder(); 183 return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder)); 184 } 185 186 /** 187 * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 188 * 189 * @param fragmentHtml the fragment of HTML to parse 190 * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This 191 * provides stack context (for implicit element creation). 192 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 193 * 194 * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. 195 */ 196 public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) { 197 HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); 198 return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, new Parser(treeBuilder)); 199 } 200 201 /** 202 * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context. 203 * 204 * @param fragmentHtml the fragment of HTML to parse 205 * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This 206 * provides stack context (for implicit element creation). 207 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 208 * @param errorList list to add errors to 209 * 210 * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified. 211 */ 212 public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) { 213 HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder(); 214 Parser parser = new Parser(treeBuilder); 215 parser.errors = errorList; 216 return treeBuilder.parseFragment(new StringReader(fragmentHtml), context, baseUri, parser); 217 } 218 219 /** 220 * Parse a fragment of XML into a list of nodes. 221 * 222 * @param fragmentXml the fragment of XML to parse 223 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 224 * @return list of nodes parsed from the input XML. 225 */ 226 public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) { 227 XmlTreeBuilder treeBuilder = new XmlTreeBuilder(); 228 return treeBuilder.parseFragment(new StringReader(fragmentXml), null, baseUri, new Parser(treeBuilder)); 229 } 230 231 /** 232 * Parse a fragment of HTML into the {@code body} of a Document. 233 * 234 * @param bodyHtml fragment of HTML 235 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. 236 * 237 * @return Document, with empty head, and HTML parsed into body 238 */ 239 public static Document parseBodyFragment(String bodyHtml, String baseUri) { 240 Document doc = Document.createShell(baseUri); 241 Element body = doc.body(); 242 List<Node> nodeList = parseFragment(bodyHtml, body, baseUri); 243 Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented 244 for (int i = nodes.length - 1; i > 0; i--) { 245 nodes[i].remove(); 246 } 247 for (Node node : nodes) { 248 body.appendChild(node); 249 } 250 return doc; 251 } 252 253 /** 254 * Utility method to unescape HTML entities from a string 255 * @param string HTML escaped string 256 * @param inAttribute if the string is to be escaped in strict mode (as attributes are) 257 * @return an unescaped string 258 */ 259 public static String unescapeEntities(String string, boolean inAttribute) { 260 Parser parser = Parser.htmlParser(); 261 parser.treeBuilder.initialiseParse(new StringReader(string), "", parser); 262 Tokeniser tokeniser = new Tokeniser(parser.treeBuilder); 263 return tokeniser.unescapeEntities(inAttribute); 264 } 265 266 // builders 267 268 /** 269 * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document, 270 * based on a knowledge of the semantics of the incoming tags. 271 * @return a new HTML parser. 272 */ 273 public static Parser htmlParser() { 274 return new Parser(new HtmlTreeBuilder()); 275 } 276 277 /** 278 * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML, 279 * rather creates a simple tree directly from the input. 280 * @return a new simple XML parser. 281 */ 282 public static Parser xmlParser() { 283 return new Parser(new XmlTreeBuilder()); 284 } 285}