001package org.jsoup.helper; 002 003import org.jsoup.Connection; 004import org.jsoup.internal.ControllableInputStream; 005import org.jsoup.internal.Normalizer; 006import org.jsoup.internal.SimpleStreamReader; 007import org.jsoup.internal.StringUtil; 008import org.jsoup.nodes.Comment; 009import org.jsoup.nodes.Document; 010import org.jsoup.nodes.Element; 011import org.jsoup.nodes.Node; 012import org.jsoup.nodes.XmlDeclaration; 013import org.jsoup.parser.Parser; 014import org.jsoup.parser.StreamParser; 015import org.jsoup.select.Elements; 016import org.jsoup.select.Evaluator; 017import org.jsoup.select.Selector; 018import org.jspecify.annotations.Nullable; 019 020import java.io.File; 021import java.io.IOException; 022import java.io.InputStream; 023import java.io.OutputStream; 024import java.io.Reader; 025import java.io.UncheckedIOException; 026import java.nio.ByteBuffer; 027import java.nio.channels.Channels; 028import java.nio.channels.SeekableByteChannel; 029import java.nio.charset.Charset; 030import java.nio.charset.IllegalCharsetNameException; 031import java.nio.file.Files; 032import java.nio.file.Path; 033import java.util.Locale; 034import java.util.Random; 035import java.util.regex.Matcher; 036import java.util.regex.Pattern; 037import java.util.zip.GZIPInputStream; 038 039import static org.jsoup.internal.SharedConstants.DefaultBufferSize; 040 041/** 042 * Internal static utilities for handling data. 043 * 044 */ 045@SuppressWarnings("CharsetObjectCanBeUsed") 046public final class DataUtil { 047 private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)"); 048 public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10. 049 static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset 050 private static final int firstReadBufferSize = 1024 * 5; 051 private static final char[] mimeBoundaryChars = 052 "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray(); 053 static final int boundaryLength = 32; 054 055 private DataUtil() {} 056 057 /** 058 * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 059 * are supported in addition to uncompressed files. 060 * 061 * @param file file to load 062 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 063 * the file will always override this setting. 064 * @param baseUri base URI of document, to resolve relative links against 065 * @return Document 066 * @throws IOException on IO error 067 */ 068 public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException { 069 return load(file.toPath(), charsetName, baseUri); 070 } 071 072 /** 073 * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 074 * are supported in addition to uncompressed files. 075 * 076 * @param file file to load 077 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 078 * the file will always override this setting. 079 * @param baseUri base URI of document, to resolve relative links against 080 * @param parser alternate {@link Parser#xmlParser() parser} to use. 081 082 * @return Document 083 * @throws IOException on IO error 084 * @since 1.14.2 085 */ 086 public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 087 return load(file.toPath(), charsetName, baseUri, parser); 088 } 089 090 /** 091 * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 092 * are supported in addition to uncompressed files. 093 * 094 * @param path file to load 095 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 096 * the file will always override this setting. 097 * @param baseUri base URI of document, to resolve relative links against 098 * @return Document 099 * @throws IOException on IO error 100 */ 101 public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException { 102 return load(path, charsetName, baseUri, Parser.htmlParser()); 103 } 104 105 /** 106 * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 107 * are supported in addition to uncompressed files. 108 * 109 * @param path file to load 110 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 111 * the file will always override this setting. 112 * @param baseUri base URI of document, to resolve relative links against 113 * @param parser alternate {@link Parser#xmlParser() parser} to use. 114 115 * @return Document 116 * @throws IOException on IO error 117 * @since 1.17.2 118 */ 119 public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 120 return parseInputStream(openStream(path), charsetName, baseUri, parser); 121 } 122 123 /** 124 * Returns a {@link StreamParser} that will parse the supplied file progressively. 125 * Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 126 * are supported in addition to uncompressed files. 127 * 128 * @param path file to load 129 * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata. 130 * A BOM in the file will always override this setting. 131 * @param baseUri base URI of document, to resolve relative links against 132 * @param parser underlying HTML or XML parser to use. 133 134 * @return Document 135 * @throws IOException on IO error 136 * @since 1.18.2 137 * @see Connection.Response#streamParser() 138 */ 139 public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException { 140 StreamParser streamer = new StreamParser(parser); 141 String charsetName = charset != null? charset.name() : null; 142 try { 143 DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser); 144 Reader reader = new SimpleStreamReader(charsetDoc.input, charsetDoc.charset); 145 streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it 146 } catch (IOException e) { 147 streamer.close(); 148 throw e; 149 } 150 return streamer; 151 } 152 153 /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */ 154 private static ControllableInputStream openStream(Path path) throws IOException { 155 final SeekableByteChannel byteChannel = Files.newByteChannel(path); 156 InputStream stream = Channels.newInputStream(byteChannel); 157 String name = Normalizer.lowerCase(path.getFileName().toString()); 158 if (name.endsWith(".gz") || name.endsWith(".z")) { 159 try { 160 final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes 161 byteChannel.position(0); // reset to start of file 162 if (zipped) stream = new GZIPInputStream(stream); 163 } catch (IOException e) { 164 stream.close(); // error during our first read; close the stream and cascade close byteChannel 165 throw e; 166 } 167 } 168 return ControllableInputStream.wrap(stream, 0); 169 } 170 171 /** 172 * Parses a Document from an input steam. 173 * @param in input stream to parse. The stream will be closed after reading. 174 * @param charsetName character set of input (optional) 175 * @param baseUri base URI of document, to resolve relative links against 176 * @return Document 177 * @throws IOException on IO error 178 */ 179 public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException { 180 return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser()); 181 } 182 183 /** 184 * Parses a Document from an input steam, using the provided Parser. 185 * @param in input stream to parse. The stream will be closed after reading. 186 * @param charsetName character set of input (optional) 187 * @param baseUri base URI of document, to resolve relative links against 188 * @param parser alternate {@link Parser#xmlParser() parser} to use. 189 * @return Document 190 * @throws IOException on IO error 191 */ 192 public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 193 return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser); 194 } 195 196 /** 197 * Writes the input stream to the output stream. Doesn't close them. 198 * @param in input stream to read from 199 * @param out output stream to write to 200 * @throws IOException on IO error 201 */ 202 static void crossStreams(final InputStream in, final OutputStream out) throws IOException { 203 final byte[] buffer = new byte[DefaultBufferSize]; 204 int len; 205 while ((len = in.read(buffer)) != -1) { 206 out.write(buffer, 0, len); 207 } 208 } 209 210 /** A struct to return a detected charset, and a document (if fully read). */ 211 static class CharsetDoc { 212 Charset charset; 213 InputStream input; 214 @Nullable Document doc; 215 216 CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) { 217 this.charset = charset; 218 this.input = input; 219 this.doc = doc; 220 } 221 } 222 223 static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 224 if (input == null) return new Document(baseUri); // empty body 225 226 final Document doc; 227 CharsetDoc charsetDoc = null; 228 try { 229 charsetDoc = detectCharset(input, charsetName, baseUri, parser); 230 doc = parseInputStream(charsetDoc, baseUri, parser); 231 } finally { 232 if (charsetDoc != null) 233 charsetDoc.input.close(); 234 } 235 return doc; 236 } 237 238 private static final Evaluator metaCharset = Selector.evaluatorOf("meta[http-equiv=content-type], meta[charset]"); 239 240 static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 241 Document doc = null; 242 // read the start of the stream and look for a BOM or meta charset: 243 // look for BOM - overrides any other header or input 244 String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately 245 if (bomCharset != null) 246 charsetName = bomCharset; 247 248 if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8 249 int origMax = input.max(); 250 input.max(firstReadBufferSize); 251 input.mark(firstReadBufferSize); 252 input.allowClose(false); // ignores closes during parse, in case we need to rewind 253 try (Reader reader = new SimpleStreamReader(input, UTF_8)) { // input is currently capped to firstReadBufferSize 254 doc = parser.parseInput(reader, baseUri); 255 input.reset(); 256 input.max(origMax); // reset for a full read if required 257 } catch (UncheckedIOException e) { 258 throw e.getCause(); 259 } finally { 260 input.allowClose(true); 261 } 262 263 // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312"> 264 Elements metaElements = doc.select(metaCharset); 265 String foundCharset = null; // if not found, will keep utf-8 as best attempt 266 for (Element meta : metaElements) { 267 if (meta.hasAttr("http-equiv")) 268 foundCharset = getCharsetFromContentType(meta.attr("content")); 269 if (foundCharset == null && meta.hasAttr("charset")) 270 foundCharset = meta.attr("charset"); 271 if (foundCharset != null) 272 break; 273 } 274 275 // look for <?xml encoding='ISO-8859-1'?> 276 if (foundCharset == null && doc.childNodeSize() > 0) { 277 Node first = doc.childNode(0); 278 XmlDeclaration decl = null; 279 if (first instanceof XmlDeclaration) 280 decl = (XmlDeclaration) first; 281 else if (first instanceof Comment) { 282 Comment comment = (Comment) first; 283 if (comment.isXmlDeclaration()) 284 decl = comment.asXmlDeclaration(); 285 } 286 if (decl != null && decl.name().equalsIgnoreCase("xml")) { 287 foundCharset = decl.attr("encoding"); 288 } 289 } 290 foundCharset = validateCharset(foundCharset); 291 if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works) 292 foundCharset = foundCharset.trim().replaceAll("[\"']", ""); 293 charsetName = foundCharset; 294 doc = null; 295 } else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse 296 input.close(); // the parser tried to close it 297 } else { 298 doc = null; 299 } 300 } else { // specified by content type header (or by user on file load) 301 Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); 302 } 303 304 // finally: prepare the return struct 305 if (charsetName == null) 306 charsetName = defaultCharsetName; 307 Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName); 308 return new CharsetDoc(charset, doc, input); 309 } 310 311 static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException { 312 // if doc != null it was fully parsed during charset detection; so just return that 313 if (charsetDoc.doc != null) 314 return charsetDoc.doc; 315 316 final InputStream input = charsetDoc.input; 317 Validate.notNull(input); 318 final Document doc; 319 final Charset charset = charsetDoc.charset; 320 try (Reader reader = new SimpleStreamReader(input, charset)) { 321 try { 322 doc = parser.parseInput(reader, baseUri); 323 } catch (UncheckedIOException e) { 324 // io exception when parsing (not seen before because reading the stream as we go) 325 throw e.getCause(); 326 } 327 doc.outputSettings().charset(charset); 328 if (!charset.canEncode()) { 329 // some charsets can read but not encode; switch to an encodable charset and update the meta el 330 doc.charset(UTF_8); 331 } 332 } 333 return doc; 334 } 335 336 /** 337 * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this 338 * method is executing on. The data read until being interrupted will be available. 339 * @param inStream the input stream to read from 340 * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited. 341 * @return the filled byte buffer 342 * @throws IOException if an exception occurs whilst reading from the input stream. 343 */ 344 public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException { 345 return ControllableInputStream.readToByteBuffer(inStream, maxSize); 346 } 347 348 static ByteBuffer emptyByteBuffer() { 349 return ByteBuffer.allocate(0); 350 } 351 352 /** 353 * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default 354 * will kick in.) 355 * @param contentType e.g. "text/html; charset=EUC-JP" 356 * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased. 357 */ 358 static @Nullable String getCharsetFromContentType(@Nullable String contentType) { 359 if (contentType == null) return null; 360 Matcher m = charsetPattern.matcher(contentType); 361 if (m.find()) { 362 String charset = m.group(1).trim(); 363 charset = charset.replace("charset=", ""); 364 return validateCharset(charset); 365 } 366 return null; 367 } 368 369 private @Nullable static String validateCharset(@Nullable String cs) { 370 if (cs == null || cs.length() == 0) return null; 371 cs = cs.trim().replaceAll("[\"']", ""); 372 try { 373 if (Charset.isSupported(cs)) return cs; 374 cs = cs.toUpperCase(Locale.ENGLISH); 375 if (Charset.isSupported(cs)) return cs; 376 } catch (IllegalCharsetNameException e) { 377 // if all this charset matching fails.... we just take the default 378 } 379 return null; 380 } 381 382 /** 383 * Creates a random string, suitable for use as a mime boundary 384 */ 385 static String mimeBoundary() { 386 final StringBuilder mime = StringUtil.borrowBuilder(); 387 final Random rand = new Random(); 388 for (int i = 0; i < boundaryLength; i++) { 389 mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]); 390 } 391 return StringUtil.releaseBuilder(mime); 392 } 393 394 private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException { 395 byte[] bom = new byte[4]; 396 input.mark(bom.length); 397 //noinspection ResultOfMethodCallIgnored 398 input.read(bom, 0, 4); 399 input.reset(); 400 401 // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here 402 if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE 403 bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE 404 return "UTF-32"; // and I hope it's on your system 405 } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE 406 bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) { 407 return "UTF-16"; // in all Javas 408 } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) { 409 input.read(bom, 0, 3); // consume the UTF-8 BOM 410 return "UTF-8"; // in all Javas 411 } 412 return null; 413 } 414}