package org.jsoup.helper;

import org.jsoup.Connection;
import org.jsoup.internal.ControllableInputStream;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.XmlDeclaration;
import org.jsoup.parser.Parser;
import org.jsoup.parser.StreamParser;
import org.jsoup.select.Elements;
import org.jspecify.annotations.Nullable;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import static org.jsoup.internal.SharedConstants.DefaultBufferSize;

/**
 * Internal static utilities for handling data.
 */
@SuppressWarnings("CharsetObjectCanBeUsed")
public final class DataUtil {
    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");
    public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10.
    static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset
    private static final int firstReadBufferSize = 1024 * 5;
    private static final char[] mimeBoundaryChars =
            "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
    static final int boundaryLength = 32;

    private DataUtil() {}

    /**
     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
     * are supported in addition to uncompressed files.
     *
     * @param file file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
        return load(file.toPath(), charsetName, baseUri);
    }

    /**
     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
     * are supported in addition to uncompressed files.
     *
     * @param file file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return Document
     * @throws IOException on IO error
     * @since 1.14.2
     */
    public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return load(file.toPath(), charsetName, baseUri, parser);
    }

    /**
     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
     * are supported in addition to uncompressed files.
     *
     * @param path file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
        return load(path, charsetName, baseUri, Parser.htmlParser());
    }

    /**
     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
     * are supported in addition to uncompressed files.
     *
     * @param path file to load
     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
     *     the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return Document
     * @throws IOException on IO error
     * @since 1.17.2
     */
    public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return parseInputStream(openStream(path), charsetName, baseUri, parser);
    }

    /**
     * Returns a {@link StreamParser} that will parse the supplied file progressively.
     * Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
     * are supported in addition to uncompressed files.
     *
     * @param path file to load
     * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata.
     *     A BOM in the file will always override this setting.
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return a StreamParser that has been initialized with the file's content, but not yet stepped
     * @throws IOException on IO error
     * @since 1.18.2
     * @see Connection.Response#streamParser()
     */
    public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException {
        StreamParser streamer = new StreamParser(parser);
        String charsetName = charset != null ? charset.name() : null;
        final ControllableInputStream stream = openStream(path);
        try {
            // NOTE(review): detectCharset closes the input when its sniffing parse consumed the whole (small) file;
            // confirm that the reader created below is still usable in that case.
            CharsetDoc charsetDoc = detectCharset(stream, charsetName, baseUri, parser);
            BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize);
            streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
        } catch (IOException | RuntimeException e) {
            // the caller never receives the parser on failure, so nothing else would close the file handle
            closeAfterFailure(stream, e);
            throw e;
        }

        return streamer;
    }

    /**
     * Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. The gzip check
     * is only made for files named {@code *.gz} or {@code *.z}, and requires the gzip magic bytes to be present.
     * If opening fails part way, the underlying channel is closed before the exception propagates.
     */
    private static ControllableInputStream openStream(Path path) throws IOException {
        final SeekableByteChannel byteChannel = Files.newByteChannel(path);
        try {
            InputStream stream = Channels.newInputStream(byteChannel);
            String name = Normalizer.lowerCase(path.getFileName().toString());
            if (name.endsWith(".gz") || name.endsWith(".z")) {
                final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
                byteChannel.position(0); // reset to start of file
                if (zipped) {
                    stream = new GZIPInputStream(stream);
                }
            }
            return ControllableInputStream.wrap(stream, 0);
        } catch (IOException | RuntimeException e) {
            closeAfterFailure(byteChannel, e); // don't leak the file handle if sniffing or the gzip header read fails
            throw e;
        }
    }

    /**
     * Closes a resource whilst another failure is propagating; any exception from the close is attached to that
     * failure as a suppressed exception, rather than replacing it.
     */
    private static void closeAfterFailure(AutoCloseable resource, Throwable failure) {
        try {
            resource.close();
        } catch (Exception closeError) {
            failure.addSuppressed(closeError);
        }
    }

    /**
     * Parses a Document from an input stream.
     * @param in input stream to parse. The stream will be closed after reading.
     * @param charsetName character set of input (optional)
     * @param baseUri base URI of document, to resolve relative links against
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser());
    }

    /**
     * Parses a Document from an input stream, using the provided Parser.
     * @param in input stream to parse. The stream will be closed after reading.
     * @param charsetName character set of input (optional)
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser alternate {@link Parser#xmlParser() parser} to use.
     * @return Document
     * @throws IOException on IO error
     */
    public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser);
    }

    /**
     * Writes the input stream to the output stream. Doesn't close them.
     * @param in input stream to read from
     * @param out output stream to write to
     * @throws IOException on IO error
     */
    static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
        final byte[] buffer = new byte[DefaultBufferSize];
        int len;
        while ((len = in.read(buffer)) != -1) {
            out.write(buffer, 0, len);
        }
    }

    /** A struct to return a detected charset, and a document (if fully read). */
    static class CharsetDoc {
        Charset charset;
        InputStream input;
        @Nullable Document doc;

        CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) {
            this.charset = charset;
            this.input = input;
            this.doc = doc;
        }
    }

    /**
     * Detects the charset of the input and parses it to a Document. The input is always closed before this method
     * returns, whether it completes normally or throws.
     *
     * @param input the stream to parse; if {@code null}, an empty Document is returned
     * @param charsetName (optional) character set of input; {@code null} to attempt to autodetect
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser the parser to use
     * @return the parsed Document
     * @throws IOException on IO error
     */
    static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        if (input == null) // empty body // todo reconsider?
            return new Document(baseUri);

        try {
            final CharsetDoc charsetDoc = detectCharset(input, charsetName, baseUri, parser);
            return parseInputStream(charsetDoc, baseUri, parser);
        } finally {
            // detectCharset hands back this same stream in the struct; closing it here also covers the case where
            // detection itself failed and no struct was returned
            input.close();
        }
    }

    /**
     * Determines the charset to decode the input with. A BOM takes precedence; otherwise the supplied charset name is
     * used; and if that is {@code null}, the first {@code firstReadBufferSize} bytes are parsed as UTF-8 and searched
     * for a meta charset or an XML declaration encoding. Falls back to UTF-8 if nothing usable is found.
     * <p>If that first parse read the whole input and the charset did not need to change, the parsed Document is
     * returned in the struct so that the input does not need to be parsed again.</p>
     *
     * @param input the stream to inspect; returned in the struct, positioned for the full read
     * @param charsetName (optional) character set of input; must not be empty if provided
     * @param baseUri base URI of document, used for the detection parse
     * @param parser the parser to use for the detection parse
     * @return the detected charset, the input, and the document if it was fully parsed during detection
     * @throws IOException on IO error
     * @throws java.nio.charset.IllegalCharsetNameException if the resolved charset name is not a legal charset name
     * @throws java.nio.charset.UnsupportedCharsetException if the resolved charset name is not supported
     */
    static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        Document doc = null;
        // read the start of the stream and look for a BOM or meta charset:
        // look for BOM - overrides any other header or input
        String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately
        if (bomCharset != null)
            charsetName = bomCharset;

        if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
            int origMax = input.max();
            input.max(firstReadBufferSize);
            input.mark(firstReadBufferSize);
            input.allowClose(false); // ignores closes during parse, in case we need to rewind
            try {
                Reader reader = new InputStreamReader(input, UTF_8); // input is currently capped to firstReadBufferSize
                doc = parser.parseInput(reader, baseUri);
                input.reset();
                input.max(origMax); // reset for a full read if required
            } catch (UncheckedIOException e) {
                throw e.getCause();
            } finally {
                input.allowClose(true);
            }

            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
            Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
            String foundCharset = null; // if not found, will keep utf-8 as best attempt
            for (Element meta : metaElements) {
                if (meta.hasAttr("http-equiv"))
                    foundCharset = getCharsetFromContentType(meta.attr("content"));
                if (foundCharset == null && meta.hasAttr("charset"))
                    foundCharset = meta.attr("charset");
                if (foundCharset != null)
                    break;
            }

            // look for <?xml encoding='ISO-8859-1'?>
            if (foundCharset == null && doc.childNodeSize() > 0) {
                Node first = doc.childNode(0);
                XmlDeclaration decl = null;
                if (first instanceof XmlDeclaration)
                    decl = (XmlDeclaration) first;
                else if (first instanceof Comment) {
                    Comment comment = (Comment) first;
                    if (comment.isXmlDeclaration())
                        decl = comment.asXmlDeclaration();
                }
                if (decl != null && decl.name().equalsIgnoreCase("xml")) {
                    foundCharset = decl.attr("encoding");
                }
            }
            foundCharset = validateCharset(foundCharset);
            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works)
                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
                charsetName = foundCharset;
                doc = null;
            } else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse
                input.close(); // the parser tried to close it
            } else {
                doc = null;
            }
        } else { // specified by content type header (or by user on file load)
            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        }

        // finally: prepare the return struct
        if (charsetName == null)
            charsetName = defaultCharsetName;
        Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
        return new CharsetDoc(charset, doc, input);
    }

    /**
     * Parses the input held in the struct using its detected charset; or, if the document was already fully parsed
     * during charset detection, returns that document. When a parse is run here, the input is closed via the reader.
     *
     * @param charsetDoc the result of {@link #detectCharset}
     * @param baseUri base URI of document, to resolve relative links against
     * @param parser the parser to use
     * @return the parsed Document
     * @throws IOException on IO error
     */
    static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
        // if doc != null it was fully parsed during charset detection; so just return that
        if (charsetDoc.doc != null)
            return charsetDoc.doc;

        final InputStream input = charsetDoc.input;
        Validate.notNull(input);
        final Document doc;
        final Charset charset = charsetDoc.charset;
        try (Reader reader = new InputStreamReader(input, charset)) {
            try {
                doc = parser.parseInput(reader, baseUri);
            } catch (UncheckedIOException e) {
                // io exception when parsing (not seen before because reading the stream as we go)
                throw e.getCause();
            }
            doc.outputSettings().charset(charset);
            if (!charset.canEncode()) {
                // some charsets can read but not encode; switch to an encodable charset and update the meta el
                doc.charset(UTF_8);
            }
        }
        return doc;
    }

    /**
     * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
     * method is executing on. The data read until being interrupted will be available.
     * @param inStream the input stream to read from
     * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
     * @return the filled byte buffer
     * @throws IOException if an exception occurs whilst reading from the input stream.
     */
    public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
        return ControllableInputStream.readToByteBuffer(inStream, maxSize);
    }

    /** Returns a newly allocated, zero-capacity byte buffer. */
    static ByteBuffer emptyByteBuffer() {
        return ByteBuffer.allocate(0);
    }

    /**
     * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
     * will kick in.)
     * @param contentType e.g. "text/html; charset=EUC-JP"
     * @return "EUC-JP", or null if not found. Charset is trimmed and stripped of quotes; it is returned in its
     *     original case if supported as such, otherwise uppercased.
     */
    static @Nullable String getCharsetFromContentType(@Nullable String contentType) {
        if (contentType == null) return null;
        Matcher m = charsetPattern.matcher(contentType);
        if (m.find()) {
            String charset = m.group(1).trim();
            charset = charset.replace("charset=", "");
            return validateCharset(charset);
        }
        return null;
    }

    /**
     * Normalizes a candidate charset name (trims it and removes quote characters) and checks that it is supported,
     * trying the name as given first and then uppercased.
     *
     * @param cs the candidate charset name
     * @return the supported charset name, or {@code null} if the input is null, empty, illegal, or unsupported
     */
    private static @Nullable String validateCharset(@Nullable String cs) {
        if (cs == null || cs.length() == 0) return null;
        cs = cs.trim().replaceAll("[\"']", "");
        try {
            if (Charset.isSupported(cs)) return cs;
            cs = cs.toUpperCase(Locale.ENGLISH);
            if (Charset.isSupported(cs)) return cs;
        } catch (IllegalCharsetNameException e) {
            // if all this charset matching fails.... we just take the default
        }
        return null;
    }

    /**
     * Creates a random string, suitable for use as a mime boundary
     */
    static String mimeBoundary() {
        final StringBuilder mime = StringUtil.borrowBuilder();
        final Random rand = new Random();
        for (int i = 0; i < boundaryLength; i++) {
            mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
        }
        return StringUtil.releaseBuilder(mime);
    }

    /**
     * Looks for a byte order mark at the start of the input. The input is reset to its start afterwards, other than
     * for a UTF-8 BOM, which is consumed.
     *
     * @param input the stream to inspect
     * @return "UTF-32", "UTF-16", or "UTF-8" if the matching BOM was found; otherwise {@code null}
     * @throws IOException on IO error
     */
    private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException {
        final byte[] bom = new byte[4];
        input.mark(bom.length);
        // a single read may return fewer bytes than requested, so fill until we have all four, or the input ends
        int filled = 0;
        while (filled < bom.length) {
            final int read = input.read(bom, filled, bom.length - filled);
            if (read <= 0) break;
            filled += read;
        }
        input.reset();

        // Only match a BOM against bytes that were actually read: the unread tail of the buffer is zero-filled, which
        // would otherwise let a short input starting FF FE be mistaken for the UTF-32 LE BOM (FF FE 00 00).
        // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
        if (filled >= 4 &&
            ((bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) || // BE
             (bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00))) { // LE
            return "UTF-32"; // and I hope it's on your system
        } else if (filled >= 2 &&
            ((bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) || // BE
             (bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE))) { // LE
            return "UTF-16"; // in all Javas
        } else if (filled >= 3 && bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
            //noinspection ResultOfMethodCallIgnored
            input.read(bom, 0, 3); // consume the UTF-8 BOM
            return "UTF-8"; // in all Javas
        }
        return null;
    }
}