001package org.jsoup.helper;
002
003import org.jsoup.Connection;
004import org.jsoup.internal.ControllableInputStream;
005import org.jsoup.internal.Normalizer;
006import org.jsoup.internal.StringUtil;
007import org.jsoup.nodes.Comment;
008import org.jsoup.nodes.Document;
009import org.jsoup.nodes.Element;
010import org.jsoup.nodes.Node;
011import org.jsoup.nodes.XmlDeclaration;
012import org.jsoup.parser.Parser;
013import org.jsoup.parser.StreamParser;
014import org.jsoup.select.Elements;
015import org.jspecify.annotations.Nullable;
016
017import java.io.BufferedReader;
018import java.io.File;
019import java.io.IOException;
020import java.io.InputStream;
021import java.io.InputStreamReader;
022import java.io.OutputStream;
023import java.io.Reader;
024import java.io.UncheckedIOException;
025import java.nio.ByteBuffer;
026import java.nio.channels.Channels;
027import java.nio.channels.SeekableByteChannel;
028import java.nio.charset.Charset;
029import java.nio.charset.IllegalCharsetNameException;
030import java.nio.file.Files;
031import java.nio.file.Path;
032import java.util.Locale;
033import java.util.Random;
034import java.util.regex.Matcher;
035import java.util.regex.Pattern;
036import java.util.zip.GZIPInputStream;
037
038import static org.jsoup.internal.SharedConstants.DefaultBufferSize;
039
040/**
041 * Internal static utilities for handling data.
042 *
043 */
044@SuppressWarnings("CharsetObjectCanBeUsed")
045public final class DataUtil {
046    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");
047    public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10.
048    static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset
049    private static final int firstReadBufferSize = 1024 * 5;
050    private static final char[] mimeBoundaryChars =
051            "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
052    static final int boundaryLength = 32;
053
054    private DataUtil() {}
055
056    /**
057     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
058     * are supported in addition to uncompressed files.
059     *
060     * @param file file to load
061     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
062     *     the file will always override this setting.
063     * @param baseUri base URI of document, to resolve relative links against
064     * @return Document
065     * @throws IOException on IO error
066     */
067    public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
068        return load(file.toPath(), charsetName, baseUri);
069    }
070
071    /**
072     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
073     * are supported in addition to uncompressed files.
074     *
075     * @param file file to load
076     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
077     *     the file will always override this setting.
078     * @param baseUri base URI of document, to resolve relative links against
079     * @param parser alternate {@link Parser#xmlParser() parser} to use.
080
081     * @return Document
082     * @throws IOException on IO error
083     * @since 1.14.2
084     */
085    public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
086        return load(file.toPath(), charsetName, baseUri, parser);
087    }
088
089    /**
090     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
091     * are supported in addition to uncompressed files.
092     *
093     * @param path file to load
094     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
095     *     the file will always override this setting.
096     * @param baseUri base URI of document, to resolve relative links against
097     * @return Document
098     * @throws IOException on IO error
099     */
100    public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
101        return load(path, charsetName, baseUri, Parser.htmlParser());
102    }
103
104    /**
105     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
106     * are supported in addition to uncompressed files.
107     *
108     * @param path file to load
109     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
110     * the file will always override this setting.
111     * @param baseUri base URI of document, to resolve relative links against
112     * @param parser alternate {@link Parser#xmlParser() parser} to use.
113
114     * @return Document
115     * @throws IOException on IO error
116     * @since 1.17.2
117     */
118    public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
119        return parseInputStream(openStream(path), charsetName, baseUri, parser);
120    }
121
122    /**
123     * Returns a {@link StreamParser} that will parse the supplied file progressively.
124     * Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
125     * are supported in addition to uncompressed files.
126     *
127     * @param path file to load
128     * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata.
129     * A BOM in the file will always override this setting.
130     * @param baseUri base URI of document, to resolve relative links against
131     * @param parser underlying HTML or XML parser to use.
132
133     * @return Document
134     * @throws IOException on IO error
135     * @since 1.18.2
136     * @see Connection.Response#streamParser()
137     */
138    public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException {
139        StreamParser streamer = new StreamParser(parser);
140        String charsetName = charset != null? charset.name() : null;
141        try {
142            DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser);
143            BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize);
144            streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
145        } catch (IOException e) {
146            streamer.close();
147            throw e;
148        }
149        return streamer;
150    }
151
152    /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
153    private static ControllableInputStream openStream(Path path) throws IOException {
154        final SeekableByteChannel byteChannel = Files.newByteChannel(path);
155        InputStream stream = Channels.newInputStream(byteChannel);
156        String name = Normalizer.lowerCase(path.getFileName().toString());
157        if (name.endsWith(".gz") || name.endsWith(".z")) {
158            try {
159                final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
160                byteChannel.position(0); // reset to start of file
161                if (zipped) stream = new GZIPInputStream(stream);
162            } catch (IOException e) {
163                stream.close(); // error during our first read; close the stream and cascade close byteChannel
164                throw e;
165            }
166        }
167        return ControllableInputStream.wrap(stream, 0);
168    }
169
170    /**
171     * Parses a Document from an input steam.
172     * @param in input stream to parse. The stream will be closed after reading.
173     * @param charsetName character set of input (optional)
174     * @param baseUri base URI of document, to resolve relative links against
175     * @return Document
176     * @throws IOException on IO error
177     */
178    public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
179        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser());
180    }
181
182    /**
183     * Parses a Document from an input steam, using the provided Parser.
184     * @param in input stream to parse. The stream will be closed after reading.
185     * @param charsetName character set of input (optional)
186     * @param baseUri base URI of document, to resolve relative links against
187     * @param parser alternate {@link Parser#xmlParser() parser} to use.
188     * @return Document
189     * @throws IOException on IO error
190     */
191    public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
192        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser);
193    }
194
195    /**
196     * Writes the input stream to the output stream. Doesn't close them.
197     * @param in input stream to read from
198     * @param out output stream to write to
199     * @throws IOException on IO error
200     */
201    static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
202        final byte[] buffer = new byte[DefaultBufferSize];
203        int len;
204        while ((len = in.read(buffer)) != -1) {
205            out.write(buffer, 0, len);
206        }
207    }
208
209    /** A struct to return a detected charset, and a document (if fully read). */
210    static class CharsetDoc {
211        Charset charset;
212        InputStream input;
213        @Nullable Document doc;
214
215        CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) {
216            this.charset = charset;
217            this.input = input;
218            this.doc = doc;
219        }
220    }
221
222    static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
223        if (input == null) // empty body // todo reconsider?
224            return new Document(baseUri);
225
226        final Document doc;
227        CharsetDoc charsetDoc = null;
228        try {
229            charsetDoc = detectCharset(input, charsetName, baseUri, parser);
230            doc = parseInputStream(charsetDoc, baseUri, parser);
231        } finally {
232            if (charsetDoc != null)
233                charsetDoc.input.close();
234        }
235        return doc;
236    }
237
238    static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
239        Document doc = null;
240        // read the start of the stream and look for a BOM or meta charset:
241        // look for BOM - overrides any other header or input
242        String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately
243        if (bomCharset != null)
244            charsetName = bomCharset;
245
246        if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
247            int origMax = input.max();
248            input.max(firstReadBufferSize);
249            input.mark(firstReadBufferSize);
250            input.allowClose(false); // ignores closes during parse, in case we need to rewind
251            try {
252                Reader reader = new InputStreamReader(input, UTF_8); // input is currently capped to firstReadBufferSize
253                doc = parser.parseInput(reader, baseUri);
254                input.reset();
255                input.max(origMax); // reset for a full read if required
256            } catch (UncheckedIOException e) {
257                throw e.getCause();
258            } finally {
259                input.allowClose(true);
260            }
261
262            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
263            Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
264            String foundCharset = null; // if not found, will keep utf-8 as best attempt
265            for (Element meta : metaElements) {
266                if (meta.hasAttr("http-equiv"))
267                    foundCharset = getCharsetFromContentType(meta.attr("content"));
268                if (foundCharset == null && meta.hasAttr("charset"))
269                    foundCharset = meta.attr("charset");
270                if (foundCharset != null)
271                    break;
272            }
273
274            // look for <?xml encoding='ISO-8859-1'?>
275            if (foundCharset == null && doc.childNodeSize() > 0) {
276                Node first = doc.childNode(0);
277                XmlDeclaration decl = null;
278                if (first instanceof XmlDeclaration)
279                    decl = (XmlDeclaration) first;
280                else if (first instanceof Comment) {
281                    Comment comment = (Comment) first;
282                    if (comment.isXmlDeclaration())
283                        decl = comment.asXmlDeclaration();
284                }
285                if (decl != null && decl.name().equalsIgnoreCase("xml")) {
286                    foundCharset = decl.attr("encoding");
287                }
288            }
289            foundCharset = validateCharset(foundCharset);
290            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works)
291                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
292                charsetName = foundCharset;
293                doc = null;
294            } else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse
295                input.close(); // the parser tried to close it
296            } else {
297                doc = null;
298            }
299        } else { // specified by content type header (or by user on file load)
300            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
301        }
302
303        // finally: prepare the return struct
304        if (charsetName == null)
305            charsetName = defaultCharsetName;
306        Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
307        return new CharsetDoc(charset, doc, input);
308    }
309
310    static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
311        // if doc != null it was fully parsed during charset detection; so just return that
312        if (charsetDoc.doc != null)
313            return charsetDoc.doc;
314
315        final InputStream input = charsetDoc.input;
316        Validate.notNull(input);
317        final Document doc;
318        final Charset charset = charsetDoc.charset;
319        try (Reader reader = new InputStreamReader(input, charset)) {
320            try {
321                doc = parser.parseInput(reader, baseUri);
322            } catch (UncheckedIOException e) {
323                // io exception when parsing (not seen before because reading the stream as we go)
324                throw e.getCause();
325            }
326            doc.outputSettings().charset(charset);
327            if (!charset.canEncode()) {
328                // some charsets can read but not encode; switch to an encodable charset and update the meta el
329                doc.charset(UTF_8);
330            }
331        }
332        return doc;
333    }
334
335    /**
336     * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
337     * method is executing on. The data read until being interrupted will be available.
338     * @param inStream the input stream to read from
339     * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
340     * @return the filled byte buffer
341     * @throws IOException if an exception occurs whilst reading from the input stream.
342     */
343    public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
344        return ControllableInputStream.readToByteBuffer(inStream, maxSize);
345    }
346
347    static ByteBuffer emptyByteBuffer() {
348        return ByteBuffer.allocate(0);
349    }
350
351    /**
352     * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
353     * will kick in.)
354     * @param contentType e.g. "text/html; charset=EUC-JP"
355     * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
356     */
357    static @Nullable String getCharsetFromContentType(@Nullable String contentType) {
358        if (contentType == null) return null;
359        Matcher m = charsetPattern.matcher(contentType);
360        if (m.find()) {
361            String charset = m.group(1).trim();
362            charset = charset.replace("charset=", "");
363            return validateCharset(charset);
364        }
365        return null;
366    }
367
368    private @Nullable static String validateCharset(@Nullable String cs) {
369        if (cs == null || cs.length() == 0) return null;
370        cs = cs.trim().replaceAll("[\"']", "");
371        try {
372            if (Charset.isSupported(cs)) return cs;
373            cs = cs.toUpperCase(Locale.ENGLISH);
374            if (Charset.isSupported(cs)) return cs;
375        } catch (IllegalCharsetNameException e) {
376            // if all this charset matching fails.... we just take the default
377        }
378        return null;
379    }
380
381    /**
382     * Creates a random string, suitable for use as a mime boundary
383     */
384    static String mimeBoundary() {
385        final StringBuilder mime = StringUtil.borrowBuilder();
386        final Random rand = new Random();
387        for (int i = 0; i < boundaryLength; i++) {
388            mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
389        }
390        return StringUtil.releaseBuilder(mime);
391    }
392
393    private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException {
394        byte[] bom = new byte[4];
395        input.mark(bom.length);
396        //noinspection ResultOfMethodCallIgnored
397        input.read(bom, 0, 4);
398        input.reset();
399
400        // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
401        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
402            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
403            return "UTF-32"; // and I hope it's on your system
404        } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
405            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
406            return "UTF-16"; // in all Javas
407        } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
408            input.read(bom, 0, 3); // consume the UTF-8 BOM
409            return "UTF-8"; // in all Javas
410        }
411        return null;
412    }
413}