001package org.jsoup.helper; 002 003import org.jsoup.Connection; 004import org.jsoup.internal.ControllableInputStream; 005import org.jsoup.internal.Normalizer; 006import org.jsoup.internal.StringUtil; 007import org.jsoup.nodes.Comment; 008import org.jsoup.nodes.Document; 009import org.jsoup.nodes.Element; 010import org.jsoup.nodes.Node; 011import org.jsoup.nodes.XmlDeclaration; 012import org.jsoup.parser.Parser; 013import org.jsoup.parser.StreamParser; 014import org.jsoup.select.Elements; 015import org.jspecify.annotations.Nullable; 016 017import java.io.BufferedReader; 018import java.io.File; 019import java.io.IOException; 020import java.io.InputStream; 021import java.io.InputStreamReader; 022import java.io.OutputStream; 023import java.io.Reader; 024import java.io.UncheckedIOException; 025import java.nio.ByteBuffer; 026import java.nio.channels.Channels; 027import java.nio.channels.SeekableByteChannel; 028import java.nio.charset.Charset; 029import java.nio.charset.IllegalCharsetNameException; 030import java.nio.file.Files; 031import java.nio.file.Path; 032import java.util.Locale; 033import java.util.Random; 034import java.util.regex.Matcher; 035import java.util.regex.Pattern; 036import java.util.zip.GZIPInputStream; 037 038import static org.jsoup.internal.SharedConstants.DefaultBufferSize; 039 040/** 041 * Internal static utilities for handling data. 042 * 043 */ 044@SuppressWarnings("CharsetObjectCanBeUsed") 045public final class DataUtil { 046 private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)"); 047 public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10. 048 static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset 049 private static final int firstReadBufferSize = 1024 * 5; 050 private static final char[] mimeBoundaryChars = 051 "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray(); 052 static final int boundaryLength = 32; 053 054 private DataUtil() {} 055 056 /** 057 * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 058 * are supported in addition to uncompressed files. 059 * 060 * @param file file to load 061 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 062 * the file will always override this setting. 063 * @param baseUri base URI of document, to resolve relative links against 064 * @return Document 065 * @throws IOException on IO error 066 */ 067 public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException { 068 return load(file.toPath(), charsetName, baseUri); 069 } 070 071 /** 072 * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 073 * are supported in addition to uncompressed files. 074 * 075 * @param file file to load 076 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 077 * the file will always override this setting. 078 * @param baseUri base URI of document, to resolve relative links against 079 * @param parser alternate {@link Parser#xmlParser() parser} to use. 080 081 * @return Document 082 * @throws IOException on IO error 083 * @since 1.14.2 084 */ 085 public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 086 return load(file.toPath(), charsetName, baseUri, parser); 087 } 088 089 /** 090 * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 091 * are supported in addition to uncompressed files. 092 * 093 * @param path file to load 094 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 095 * the file will always override this setting. 096 * @param baseUri base URI of document, to resolve relative links against 097 * @return Document 098 * @throws IOException on IO error 099 */ 100 public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException { 101 return load(path, charsetName, baseUri, Parser.htmlParser()); 102 } 103 104 /** 105 * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 106 * are supported in addition to uncompressed files. 107 * 108 * @param path file to load 109 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 110 * the file will always override this setting. 111 * @param baseUri base URI of document, to resolve relative links against 112 * @param parser alternate {@link Parser#xmlParser() parser} to use. 113 114 * @return Document 115 * @throws IOException on IO error 116 * @since 1.17.2 117 */ 118 public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 119 return parseInputStream(openStream(path), charsetName, baseUri, parser); 120 } 121 122 /** 123 * Returns a {@link StreamParser} that will parse the supplied file progressively. 124 * Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 125 * are supported in addition to uncompressed files. 126 * 127 * @param path file to load 128 * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata. 129 * A BOM in the file will always override this setting. 130 * @param baseUri base URI of document, to resolve relative links against 131 * @param parser underlying HTML or XML parser to use. 132 133 * @return Document 134 * @throws IOException on IO error 135 * @since 1.18.2 136 * @see Connection.Response#streamParser() 137 */ 138 public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException { 139 StreamParser streamer = new StreamParser(parser); 140 String charsetName = charset != null? charset.name() : null; 141 try { 142 DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser); 143 BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize); 144 streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it 145 } catch (IOException e) { 146 streamer.close(); 147 throw e; 148 } 149 return streamer; 150 } 151 152 /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */ 153 private static ControllableInputStream openStream(Path path) throws IOException { 154 final SeekableByteChannel byteChannel = Files.newByteChannel(path); 155 InputStream stream = Channels.newInputStream(byteChannel); 156 String name = Normalizer.lowerCase(path.getFileName().toString()); 157 if (name.endsWith(".gz") || name.endsWith(".z")) { 158 try { 159 final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes 160 byteChannel.position(0); // reset to start of file 161 if (zipped) stream = new GZIPInputStream(stream); 162 } catch (IOException e) { 163 stream.close(); // error during our first read; close the stream and cascade close byteChannel 164 throw e; 165 } 166 } 167 return ControllableInputStream.wrap(stream, 0); 168 } 169 170 /** 171 * Parses a Document from an input steam. 172 * @param in input stream to parse. The stream will be closed after reading. 173 * @param charsetName character set of input (optional) 174 * @param baseUri base URI of document, to resolve relative links against 175 * @return Document 176 * @throws IOException on IO error 177 */ 178 public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException { 179 return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser()); 180 } 181 182 /** 183 * Parses a Document from an input steam, using the provided Parser. 184 * @param in input stream to parse. The stream will be closed after reading. 185 * @param charsetName character set of input (optional) 186 * @param baseUri base URI of document, to resolve relative links against 187 * @param parser alternate {@link Parser#xmlParser() parser} to use. 188 * @return Document 189 * @throws IOException on IO error 190 */ 191 public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 192 return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser); 193 } 194 195 /** 196 * Writes the input stream to the output stream. Doesn't close them. 197 * @param in input stream to read from 198 * @param out output stream to write to 199 * @throws IOException on IO error 200 */ 201 static void crossStreams(final InputStream in, final OutputStream out) throws IOException { 202 final byte[] buffer = new byte[DefaultBufferSize]; 203 int len; 204 while ((len = in.read(buffer)) != -1) { 205 out.write(buffer, 0, len); 206 } 207 } 208 209 /** A struct to return a detected charset, and a document (if fully read). */ 210 static class CharsetDoc { 211 Charset charset; 212 InputStream input; 213 @Nullable Document doc; 214 215 CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) { 216 this.charset = charset; 217 this.input = input; 218 this.doc = doc; 219 } 220 } 221 222 static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 223 if (input == null) // empty body // todo reconsider? 224 return new Document(baseUri); 225 226 final Document doc; 227 CharsetDoc charsetDoc = null; 228 try { 229 charsetDoc = detectCharset(input, charsetName, baseUri, parser); 230 doc = parseInputStream(charsetDoc, baseUri, parser); 231 } finally { 232 if (charsetDoc != null) 233 charsetDoc.input.close(); 234 } 235 return doc; 236 } 237 238 static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 239 Document doc = null; 240 // read the start of the stream and look for a BOM or meta charset: 241 // look for BOM - overrides any other header or input 242 String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately 243 if (bomCharset != null) 244 charsetName = bomCharset; 245 246 if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8 247 int origMax = input.max(); 248 input.max(firstReadBufferSize); 249 input.mark(firstReadBufferSize); 250 input.allowClose(false); // ignores closes during parse, in case we need to rewind 251 try { 252 Reader reader = new InputStreamReader(input, UTF_8); // input is currently capped to firstReadBufferSize 253 doc = parser.parseInput(reader, baseUri); 254 input.reset(); 255 input.max(origMax); // reset for a full read if required 256 } catch (UncheckedIOException e) { 257 throw e.getCause(); 258 } finally { 259 input.allowClose(true); 260 } 261 262 // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312"> 263 Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]"); 264 String foundCharset = null; // if not found, will keep utf-8 as best attempt 265 for (Element meta : metaElements) { 266 if (meta.hasAttr("http-equiv")) 267 foundCharset = getCharsetFromContentType(meta.attr("content")); 268 if (foundCharset == null && meta.hasAttr("charset")) 269 foundCharset = meta.attr("charset"); 270 if (foundCharset != null) 271 break; 272 } 273 274 // look for <?xml encoding='ISO-8859-1'?> 275 if (foundCharset == null && doc.childNodeSize() > 0) { 276 Node first = doc.childNode(0); 277 XmlDeclaration decl = null; 278 if (first instanceof XmlDeclaration) 279 decl = (XmlDeclaration) first; 280 else if (first instanceof Comment) { 281 Comment comment = (Comment) first; 282 if (comment.isXmlDeclaration()) 283 decl = comment.asXmlDeclaration(); 284 } 285 if (decl != null && decl.name().equalsIgnoreCase("xml")) { 286 foundCharset = decl.attr("encoding"); 287 } 288 } 289 foundCharset = validateCharset(foundCharset); 290 if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works) 291 foundCharset = foundCharset.trim().replaceAll("[\"']", ""); 292 charsetName = foundCharset; 293 doc = null; 294 } else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse 295 input.close(); // the parser tried to close it 296 } else { 297 doc = null; 298 } 299 } else { // specified by content type header (or by user on file load) 300 Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); 301 } 302 303 // finally: prepare the return struct 304 if (charsetName == null) 305 charsetName = defaultCharsetName; 306 Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName); 307 return new CharsetDoc(charset, doc, input); 308 } 309 310 static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException { 311 // if doc != null it was fully parsed during charset detection; so just return that 312 if (charsetDoc.doc != null) 313 return charsetDoc.doc; 314 315 final InputStream input = charsetDoc.input; 316 Validate.notNull(input); 317 final Document doc; 318 final Charset charset = charsetDoc.charset; 319 try (Reader reader = new InputStreamReader(input, charset)) { 320 try { 321 doc = parser.parseInput(reader, baseUri); 322 } catch (UncheckedIOException e) { 323 // io exception when parsing (not seen before because reading the stream as we go) 324 throw e.getCause(); 325 } 326 doc.outputSettings().charset(charset); 327 if (!charset.canEncode()) { 328 // some charsets can read but not encode; switch to an encodable charset and update the meta el 329 doc.charset(UTF_8); 330 } 331 } 332 return doc; 333 } 334 335 /** 336 * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this 337 * method is executing on. The data read until being interrupted will be available. 338 * @param inStream the input stream to read from 339 * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited. 340 * @return the filled byte buffer 341 * @throws IOException if an exception occurs whilst reading from the input stream. 342 */ 343 public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException { 344 return ControllableInputStream.readToByteBuffer(inStream, maxSize); 345 } 346 347 static ByteBuffer emptyByteBuffer() { 348 return ByteBuffer.allocate(0); 349 } 350 351 /** 352 * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default 353 * will kick in.) 354 * @param contentType e.g. "text/html; charset=EUC-JP" 355 * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased. 356 */ 357 static @Nullable String getCharsetFromContentType(@Nullable String contentType) { 358 if (contentType == null) return null; 359 Matcher m = charsetPattern.matcher(contentType); 360 if (m.find()) { 361 String charset = m.group(1).trim(); 362 charset = charset.replace("charset=", ""); 363 return validateCharset(charset); 364 } 365 return null; 366 } 367 368 private @Nullable static String validateCharset(@Nullable String cs) { 369 if (cs == null || cs.length() == 0) return null; 370 cs = cs.trim().replaceAll("[\"']", ""); 371 try { 372 if (Charset.isSupported(cs)) return cs; 373 cs = cs.toUpperCase(Locale.ENGLISH); 374 if (Charset.isSupported(cs)) return cs; 375 } catch (IllegalCharsetNameException e) { 376 // if all this charset matching fails.... we just take the default 377 } 378 return null; 379 } 380 381 /** 382 * Creates a random string, suitable for use as a mime boundary 383 */ 384 static String mimeBoundary() { 385 final StringBuilder mime = StringUtil.borrowBuilder(); 386 final Random rand = new Random(); 387 for (int i = 0; i < boundaryLength; i++) { 388 mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]); 389 } 390 return StringUtil.releaseBuilder(mime); 391 } 392 393 private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException { 394 byte[] bom = new byte[4]; 395 input.mark(bom.length); 396 //noinspection ResultOfMethodCallIgnored 397 input.read(bom, 0, 4); 398 input.reset(); 399 400 // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here 401 if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE 402 bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE 403 return "UTF-32"; // and I hope it's on your system 404 } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE 405 bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) { 406 return "UTF-16"; // in all Javas 407 } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) { 408 input.read(bom, 0, 3); // consume the UTF-8 BOM 409 return "UTF-8"; // in all Javas 410 } 411 return null; 412 } 413}