001package org.jsoup.helper;
002
003import org.jsoup.Connection;
004import org.jsoup.internal.ControllableInputStream;
005import org.jsoup.internal.Normalizer;
006import org.jsoup.internal.SimpleStreamReader;
007import org.jsoup.internal.StringUtil;
008import org.jsoup.nodes.Comment;
009import org.jsoup.nodes.Document;
010import org.jsoup.nodes.Element;
011import org.jsoup.nodes.Node;
012import org.jsoup.nodes.XmlDeclaration;
013import org.jsoup.parser.Parser;
014import org.jsoup.parser.StreamParser;
015import org.jsoup.select.Elements;
016import org.jsoup.select.Evaluator;
017import org.jsoup.select.Selector;
018import org.jspecify.annotations.Nullable;
019
020import java.io.File;
021import java.io.IOException;
022import java.io.InputStream;
023import java.io.OutputStream;
024import java.io.Reader;
025import java.io.UncheckedIOException;
026import java.nio.ByteBuffer;
027import java.nio.channels.Channels;
028import java.nio.channels.SeekableByteChannel;
029import java.nio.charset.Charset;
030import java.nio.charset.IllegalCharsetNameException;
031import java.nio.file.Files;
032import java.nio.file.Path;
033import java.util.Locale;
034import java.util.Random;
035import java.util.regex.Matcher;
036import java.util.regex.Pattern;
037import java.util.zip.GZIPInputStream;
038
039import static org.jsoup.internal.SharedConstants.DefaultBufferSize;
040
041/**
042 * Internal static utilities for handling data.
043 *
044 */
045@SuppressWarnings("CharsetObjectCanBeUsed")
046public final class DataUtil {
    // Case-insensitive matcher for a "charset=" parameter; group 1 captures the value, which may be preceded by
    // a single or double quote and ends at whitespace, a comma, a semicolon, or a quote.
    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");
    public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10.
    static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset
    // Byte cap (and mark limit) applied to the input for the first, speculative parse in detectCharset.
    private static final int firstReadBufferSize = 1024 * 5;
    // Alphabet that mimeBoundary() picks its random characters from.
    private static final char[] mimeBoundaryChars =
            "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
    // Number of characters in a boundary produced by mimeBoundary().
    static final int boundaryLength = 32;

    // Holder of static helpers only; the private constructor prevents instantiation.
    private DataUtil() {}
056
057    /**
058     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
059     * are supported in addition to uncompressed files.
060     *
061     * @param file file to load
062     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
063     *     the file will always override this setting.
064     * @param baseUri base URI of document, to resolve relative links against
065     * @return Document
066     * @throws IOException on IO error
067     */
068    public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
069        return load(file.toPath(), charsetName, baseUri);
070    }
071
072    /**
073     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
074     * are supported in addition to uncompressed files.
075     *
076     * @param file file to load
077     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
078     *     the file will always override this setting.
079     * @param baseUri base URI of document, to resolve relative links against
080     * @param parser alternate {@link Parser#xmlParser() parser} to use.
081
082     * @return Document
083     * @throws IOException on IO error
084     * @since 1.14.2
085     */
086    public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
087        return load(file.toPath(), charsetName, baseUri, parser);
088    }
089
090    /**
091     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
092     * are supported in addition to uncompressed files.
093     *
094     * @param path file to load
095     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
096     *     the file will always override this setting.
097     * @param baseUri base URI of document, to resolve relative links against
098     * @return Document
099     * @throws IOException on IO error
100     */
101    public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
102        return load(path, charsetName, baseUri, Parser.htmlParser());
103    }
104
105    /**
106     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
107     * are supported in addition to uncompressed files.
108     *
109     * @param path file to load
110     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
111     * the file will always override this setting.
112     * @param baseUri base URI of document, to resolve relative links against
113     * @param parser alternate {@link Parser#xmlParser() parser} to use.
114
115     * @return Document
116     * @throws IOException on IO error
117     * @since 1.17.2
118     */
119    public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
120        return parseInputStream(openStream(path), charsetName, baseUri, parser);
121    }
122
123    /**
124     * Returns a {@link StreamParser} that will parse the supplied file progressively.
125     * Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
126     * are supported in addition to uncompressed files.
127     *
128     * @param path file to load
129     * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata.
130     * A BOM in the file will always override this setting.
131     * @param baseUri base URI of document, to resolve relative links against
132     * @param parser underlying HTML or XML parser to use.
133
134     * @return Document
135     * @throws IOException on IO error
136     * @since 1.18.2
137     * @see Connection.Response#streamParser()
138     */
139    public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException {
140        StreamParser streamer = new StreamParser(parser);
141        String charsetName = charset != null? charset.name() : null;
142        try {
143            DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser);
144            Reader reader = new SimpleStreamReader(charsetDoc.input, charsetDoc.charset);
145            streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
146        } catch (IOException e) {
147            streamer.close();
148            throw e;
149        }
150        return streamer;
151    }
152
153    /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
154    private static ControllableInputStream openStream(Path path) throws IOException {
155        final SeekableByteChannel byteChannel = Files.newByteChannel(path);
156        InputStream stream = Channels.newInputStream(byteChannel);
157        String name = Normalizer.lowerCase(path.getFileName().toString());
158        if (name.endsWith(".gz") || name.endsWith(".z")) {
159            try {
160                final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
161                byteChannel.position(0); // reset to start of file
162                if (zipped) stream = new GZIPInputStream(stream);
163            } catch (IOException e) {
164                stream.close(); // error during our first read; close the stream and cascade close byteChannel
165                throw e;
166            }
167        }
168        return ControllableInputStream.wrap(stream, 0);
169    }
170
171    /**
172     * Parses a Document from an input steam.
173     * @param in input stream to parse. The stream will be closed after reading.
174     * @param charsetName character set of input (optional)
175     * @param baseUri base URI of document, to resolve relative links against
176     * @return Document
177     * @throws IOException on IO error
178     */
179    public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
180        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser());
181    }
182
183    /**
184     * Parses a Document from an input steam, using the provided Parser.
185     * @param in input stream to parse. The stream will be closed after reading.
186     * @param charsetName character set of input (optional)
187     * @param baseUri base URI of document, to resolve relative links against
188     * @param parser alternate {@link Parser#xmlParser() parser} to use.
189     * @return Document
190     * @throws IOException on IO error
191     */
192    public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
193        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser);
194    }
195
196    /**
197     * Writes the input stream to the output stream. Doesn't close them.
198     * @param in input stream to read from
199     * @param out output stream to write to
200     * @throws IOException on IO error
201     */
202    static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
203        final byte[] buffer = new byte[DefaultBufferSize];
204        int len;
205        while ((len = in.read(buffer)) != -1) {
206            out.write(buffer, 0, len);
207        }
208    }
209
210    /** A struct to return a detected charset, and a document (if fully read). */
211    static class CharsetDoc {
212        Charset charset;
213        InputStream input;
214        @Nullable Document doc;
215
216        CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) {
217            this.charset = charset;
218            this.input = input;
219            this.doc = doc;
220        }
221    }
222
223    static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
224        if (input == null) return new Document(baseUri); // empty body
225
226        final Document doc;
227        CharsetDoc charsetDoc = null;
228        try {
229            charsetDoc = detectCharset(input, charsetName, baseUri, parser);
230            doc = parseInputStream(charsetDoc, baseUri, parser);
231        } finally {
232            if (charsetDoc != null)
233                charsetDoc.input.close();
234        }
235        return doc;
236    }
237
    // Selects the meta elements that may declare a charset: the http-equiv content-type form, or the charset attribute form.
    private static final Evaluator metaCharset = Selector.evaluatorOf("meta[http-equiv=content-type], meta[charset]");

    /**
     * Determines the charset to decode the input with. A BOM takes precedence over everything else; otherwise a
     * supplied {@code charsetName} is used; otherwise the start of the input is parsed as UTF-8 and searched for a
     * meta charset declaration or an XML declaration encoding. When no other charset is determined, UTF-8 is used.
     *
     * @param input the stream to inspect; it is returned in the result, positioned for the decoding read
     * @param charsetName (optional) charset name to use; {@code null} to detect from the content
     * @param baseUri base URI passed to the parser for the detection parse
     * @param parser the parser used for the detection parse
     * @return the charset, the input, and the already parsed Document when the detection parse could be kept
     *     (otherwise the Document is {@code null} and the input needs to be parsed with the returned charset)
     * @throws IOException on IO error
     */
    static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
        Document doc = null;
        // read the start of the stream and look for a BOM or meta charset:
        // look for BOM - overrides any other header or input
        String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately
        if (bomCharset != null)
            charsetName = bomCharset;

        if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
            int origMax = input.max(); // remember the caller's cap so it can be restored after the capped first read
            input.max(firstReadBufferSize);
            input.mark(firstReadBufferSize);
            input.allowClose(false); // ignores closes during parse, in case we need to rewind
            try (Reader reader = new SimpleStreamReader(input, UTF_8)) { // input is currently capped to firstReadBufferSize
                doc = parser.parseInput(reader, baseUri);
                input.reset();
                input.max(origMax); // reset for a full read if required
            } catch (UncheckedIOException e) {
                throw e.getCause(); // surface the wrapped IOException to the caller
            } finally {
                input.allowClose(true); // closing is re-enabled whether or not the first parse succeeded
            }

            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
            Elements metaElements = doc.select(metaCharset);
            String foundCharset = null; // if not found, will keep utf-8 as best attempt
            for (Element meta : metaElements) {
                // an http-equiv meta is read from its content attribute; failing that, the charset attribute is used.
                // the first meta that yields a charset ends the search
                if (meta.hasAttr("http-equiv"))
                    foundCharset = getCharsetFromContentType(meta.attr("content"));
                if (foundCharset == null && meta.hasAttr("charset"))
                    foundCharset = meta.attr("charset");
                if (foundCharset != null)
                    break;
            }

            // look for <?xml encoding='ISO-8859-1'?>
            if (foundCharset == null && doc.childNodeSize() > 0) {
                Node first = doc.childNode(0); // only the document's first child node is considered
                XmlDeclaration decl = null;
                if (first instanceof XmlDeclaration)
                    decl = (XmlDeclaration) first;
                else if (first instanceof Comment) {
                    // the declaration may be held in a Comment node; convert it when the comment reports itself as one
                    Comment comment = (Comment) first;
                    if (comment.isXmlDeclaration())
                        decl = comment.asXmlDeclaration();
                }
                if (decl != null && decl.name().equalsIgnoreCase("xml")) {
                    foundCharset = decl.attr("encoding");
                }
            }
            foundCharset = validateCharset(foundCharset); // null when missing, empty, or not supported
            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works)
                // NOTE(review): validateCharset has already trimmed and stripped quotes, so this looks redundant - confirm
                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
                charsetName = foundCharset;
                doc = null; // the UTF-8 parse is discarded; the caller parses again with the found charset
            } else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse
                input.close(); // the parser tried to close it
            } else {
                doc = null; // the first parse is not kept; the caller parses the input in full
            }
        } else { // specified by content type header (or by user on file load)
            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        }

        // finally: prepare the return struct
        if (charsetName == null)
            charsetName = defaultCharsetName;
        // the shared UTF_8 constant is used for the default name; any other name is looked up via Charset.forName
        Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
        return new CharsetDoc(charset, doc, input);
    }
310
311    static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
312        // if doc != null it was fully parsed during charset detection; so just return that
313        if (charsetDoc.doc != null)
314            return charsetDoc.doc;
315
316        final InputStream input = charsetDoc.input;
317        Validate.notNull(input);
318        final Document doc;
319        final Charset charset = charsetDoc.charset;
320        try (Reader reader = new SimpleStreamReader(input, charset)) {
321            try {
322                doc = parser.parseInput(reader, baseUri);
323            } catch (UncheckedIOException e) {
324                // io exception when parsing (not seen before because reading the stream as we go)
325                throw e.getCause();
326            }
327            doc.outputSettings().charset(charset);
328            if (!charset.canEncode()) {
329                // some charsets can read but not encode; switch to an encodable charset and update the meta el
330                doc.charset(UTF_8);
331            }
332        }
333        return doc;
334    }
335
336    /**
337     * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
338     * method is executing on. The data read until being interrupted will be available.
339     * @param inStream the input stream to read from
340     * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
341     * @return the filled byte buffer
342     * @throws IOException if an exception occurs whilst reading from the input stream.
343     */
344    public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
345        return ControllableInputStream.readToByteBuffer(inStream, maxSize);
346    }
347
348    static ByteBuffer emptyByteBuffer() {
349        return ByteBuffer.allocate(0);
350    }
351
352    /**
353     * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
354     * will kick in.)
355     * @param contentType e.g. "text/html; charset=EUC-JP"
356     * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
357     */
358    static @Nullable String getCharsetFromContentType(@Nullable String contentType) {
359        if (contentType == null) return null;
360        Matcher m = charsetPattern.matcher(contentType);
361        if (m.find()) {
362            String charset = m.group(1).trim();
363            charset = charset.replace("charset=", "");
364            return validateCharset(charset);
365        }
366        return null;
367    }
368
369    private @Nullable static String validateCharset(@Nullable String cs) {
370        if (cs == null || cs.length() == 0) return null;
371        cs = cs.trim().replaceAll("[\"']", "");
372        try {
373            if (Charset.isSupported(cs)) return cs;
374            cs = cs.toUpperCase(Locale.ENGLISH);
375            if (Charset.isSupported(cs)) return cs;
376        } catch (IllegalCharsetNameException e) {
377            // if all this charset matching fails.... we just take the default
378        }
379        return null;
380    }
381
382    /**
383     * Creates a random string, suitable for use as a mime boundary
384     */
385    static String mimeBoundary() {
386        final StringBuilder mime = StringUtil.borrowBuilder();
387        final Random rand = new Random();
388        for (int i = 0; i < boundaryLength; i++) {
389            mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
390        }
391        return StringUtil.releaseBuilder(mime);
392    }
393
394    private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException {
395        byte[] bom = new byte[4];
396        input.mark(bom.length);
397        //noinspection ResultOfMethodCallIgnored
398        input.read(bom, 0, 4);
399        input.reset();
400
401        // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
402        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
403            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
404            return "UTF-32"; // and I hope it's on your system
405        } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
406            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
407            return "UTF-16"; // in all Javas
408        } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
409            input.read(bom, 0, 3); // consume the UTF-8 BOM
410            return "UTF-8"; // in all Javas
411        }
412        return null;
413    }
414}