001package org.jsoup.helper;
002
003import org.jsoup.Connection;
004import org.jsoup.internal.ControllableInputStream;
005import org.jsoup.internal.Normalizer;
006import org.jsoup.internal.StringUtil;
007import org.jsoup.nodes.Comment;
008import org.jsoup.nodes.Document;
009import org.jsoup.nodes.Element;
010import org.jsoup.nodes.Node;
011import org.jsoup.nodes.XmlDeclaration;
012import org.jsoup.parser.Parser;
013import org.jsoup.parser.StreamParser;
014import org.jsoup.select.Elements;
015import org.jspecify.annotations.Nullable;
016
017import java.io.BufferedReader;
018import java.io.File;
019import java.io.IOException;
020import java.io.InputStream;
021import java.io.InputStreamReader;
022import java.io.OutputStream;
023import java.io.Reader;
024import java.io.UncheckedIOException;
025import java.nio.ByteBuffer;
026import java.nio.channels.Channels;
027import java.nio.channels.SeekableByteChannel;
028import java.nio.charset.Charset;
029import java.nio.charset.IllegalCharsetNameException;
030import java.nio.file.Files;
031import java.nio.file.Path;
032import java.util.Locale;
033import java.util.Random;
034import java.util.regex.Matcher;
035import java.util.regex.Pattern;
036import java.util.zip.GZIPInputStream;
037
038import static org.jsoup.internal.SharedConstants.DefaultBufferSize;
039
040/**
041 * Internal static utilities for handling data.
042 *
043 */
044@SuppressWarnings("CharsetObjectCanBeUsed")
045public final class DataUtil {
    // Case-insensitive match for a "charset=" parameter: optional whitespace and an optional opening quote follow the
    // '=', and group 1 captures the value up to the next whitespace, comma, semicolon or quote.
    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");
    public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10.
    static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset
    // How many bytes of the input detectCharset reads ahead (5 KB) when looking for a declared charset.
    private static final int firstReadBufferSize = 1024 * 5;
    // The characters that mimeBoundary() picks from when building a boundary string.
    private static final char[] mimeBoundaryChars =
            "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
    // The number of characters in a boundary produced by mimeBoundary().
    static final int boundaryLength = 32;

    // Static utilities only; not instantiable.
    private DataUtil() {}
055
056    /**
057     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
058     * are supported in addition to uncompressed files.
059     *
060     * @param file file to load
061     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
062     *     the file will always override this setting.
063     * @param baseUri base URI of document, to resolve relative links against
064     * @return Document
065     * @throws IOException on IO error
066     */
067    public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
068        return load(file.toPath(), charsetName, baseUri);
069    }
070
071    /**
072     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
073     * are supported in addition to uncompressed files.
074     *
075     * @param file file to load
076     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
077     *     the file will always override this setting.
078     * @param baseUri base URI of document, to resolve relative links against
079     * @param parser alternate {@link Parser#xmlParser() parser} to use.
080
081     * @return Document
082     * @throws IOException on IO error
083     * @since 1.14.2
084     */
085    public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
086        return load(file.toPath(), charsetName, baseUri, parser);
087    }
088
089    /**
090     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
091     * are supported in addition to uncompressed files.
092     *
093     * @param path file to load
094     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
095     *     the file will always override this setting.
096     * @param baseUri base URI of document, to resolve relative links against
097     * @return Document
098     * @throws IOException on IO error
099     */
100    public static Document load(Path path, @Nullable String charsetName, String baseUri) throws IOException {
101        return load(path, charsetName, baseUri, Parser.htmlParser());
102    }
103
104    /**
105     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
106     * are supported in addition to uncompressed files.
107     *
108     * @param path file to load
109     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
110     * the file will always override this setting.
111     * @param baseUri base URI of document, to resolve relative links against
112     * @param parser alternate {@link Parser#xmlParser() parser} to use.
113
114     * @return Document
115     * @throws IOException on IO error
116     * @since 1.17.2
117     */
118    public static Document load(Path path, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
119        return parseInputStream(openStream(path), charsetName, baseUri, parser);
120    }
121
122    /**
123     * Returns a {@link StreamParser} that will parse the supplied file progressively.
124     * Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
125     * are supported in addition to uncompressed files.
126     *
127     * @param path file to load
128     * @param charset (optional) character set of input; specify {@code null} to attempt to autodetect from metadata.
129     * A BOM in the file will always override this setting.
130     * @param baseUri base URI of document, to resolve relative links against
131     * @param parser underlying HTML or XML parser to use.
132
133     * @return Document
134     * @throws IOException on IO error
135     * @since 1.18.2
136     * @see Connection.Response#streamParser()
137     */
138    public static StreamParser streamParser(Path path, @Nullable Charset charset, String baseUri, Parser parser) throws IOException {
139        StreamParser streamer = new StreamParser(parser);
140        String charsetName = charset != null? charset.name() : null;
141        DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(openStream(path), charsetName, baseUri, parser);
142        BufferedReader reader = new BufferedReader(new InputStreamReader(charsetDoc.input, charsetDoc.charset), DefaultBufferSize);
143        streamer.parse(reader, baseUri); // initializes the parse and the document, but does not step() it
144
145        return streamer;
146    }
147
148    /** Open an input stream from a file; if it's a gzip file, returns a GZIPInputStream to unzip it. */
149    private static ControllableInputStream openStream(Path path) throws IOException {
150        final SeekableByteChannel byteChannel = Files.newByteChannel(path);
151        InputStream stream = Channels.newInputStream(byteChannel);
152        String name = Normalizer.lowerCase(path.getFileName().toString());
153        if (name.endsWith(".gz") || name.endsWith(".z")) {
154            final boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
155            byteChannel.position(0); // reset to start of file
156            if (zipped) {
157                stream = new GZIPInputStream(stream);
158            }
159        }
160        return ControllableInputStream.wrap(stream, 0);
161    }
162
163    /**
164     * Parses a Document from an input steam.
165     * @param in input stream to parse. The stream will be closed after reading.
166     * @param charsetName character set of input (optional)
167     * @param baseUri base URI of document, to resolve relative links against
168     * @return Document
169     * @throws IOException on IO error
170     */
171    public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
172        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, Parser.htmlParser());
173    }
174
175    /**
176     * Parses a Document from an input steam, using the provided Parser.
177     * @param in input stream to parse. The stream will be closed after reading.
178     * @param charsetName character set of input (optional)
179     * @param baseUri base URI of document, to resolve relative links against
180     * @param parser alternate {@link Parser#xmlParser() parser} to use.
181     * @return Document
182     * @throws IOException on IO error
183     */
184    public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
185        return parseInputStream(ControllableInputStream.wrap(in, 0), charsetName, baseUri, parser);
186    }
187
188    /**
189     * Writes the input stream to the output stream. Doesn't close them.
190     * @param in input stream to read from
191     * @param out output stream to write to
192     * @throws IOException on IO error
193     */
194    static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
195        final byte[] buffer = new byte[DefaultBufferSize];
196        int len;
197        while ((len = in.read(buffer)) != -1) {
198            out.write(buffer, 0, len);
199        }
200    }
201
202    /** A struct to return a detected charset, and a document (if fully read). */
203    static class CharsetDoc {
204        Charset charset;
205        InputStream input;
206        @Nullable Document doc;
207
208        CharsetDoc(Charset charset, @Nullable Document doc, InputStream input) {
209            this.charset = charset;
210            this.input = input;
211            this.doc = doc;
212        }
213    }
214
215    static Document parseInputStream(@Nullable ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
216        if (input == null) // empty body // todo reconsider?
217            return new Document(baseUri);
218
219        final Document doc;
220        CharsetDoc charsetDoc = null;
221        try {
222            charsetDoc = detectCharset(input, charsetName, baseUri, parser);
223            doc = parseInputStream(charsetDoc, baseUri, parser);
224        } finally {
225            if (charsetDoc != null)
226                charsetDoc.input.close();
227        }
228        return doc;
229    }
230
231    static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
232        Document doc = null;
233        // read the start of the stream and look for a BOM or meta charset:
234        // look for BOM - overrides any other header or input
235        String bomCharset = detectCharsetFromBom(input); // resets / consumes appropriately
236        if (bomCharset != null)
237            charsetName = bomCharset;
238
239        if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
240            int origMax = input.max();
241            input.max(firstReadBufferSize);
242            input.mark(firstReadBufferSize);
243            input.allowClose(false); // ignores closes during parse, in case we need to rewind
244            try {
245                Reader reader = new InputStreamReader(input, UTF_8); // input is currently capped to firstReadBufferSize
246                doc = parser.parseInput(reader, baseUri);
247                input.reset();
248                input.max(origMax); // reset for a full read if required
249            } catch (UncheckedIOException e) {
250                throw e.getCause();
251            } finally {
252                input.allowClose(true);
253            }
254
255            // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
256            Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
257            String foundCharset = null; // if not found, will keep utf-8 as best attempt
258            for (Element meta : metaElements) {
259                if (meta.hasAttr("http-equiv"))
260                    foundCharset = getCharsetFromContentType(meta.attr("content"));
261                if (foundCharset == null && meta.hasAttr("charset"))
262                    foundCharset = meta.attr("charset");
263                if (foundCharset != null)
264                    break;
265            }
266
267            // look for <?xml encoding='ISO-8859-1'?>
268            if (foundCharset == null && doc.childNodeSize() > 0) {
269                Node first = doc.childNode(0);
270                XmlDeclaration decl = null;
271                if (first instanceof XmlDeclaration)
272                    decl = (XmlDeclaration) first;
273                else if (first instanceof Comment) {
274                    Comment comment = (Comment) first;
275                    if (comment.isXmlDeclaration())
276                        decl = comment.asXmlDeclaration();
277                }
278                if (decl != null && decl.name().equalsIgnoreCase("xml")) {
279                    foundCharset = decl.attr("encoding");
280                }
281            }
282            foundCharset = validateCharset(foundCharset);
283            if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case-insensitive check here to match how validate works)
284                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
285                charsetName = foundCharset;
286                doc = null;
287            } else if (input.baseReadFully()) { // if we have read fully, and the charset was correct, keep that current parse
288                input.close(); // the parser tried to close it
289            } else {
290                doc = null;
291            }
292        } else { // specified by content type header (or by user on file load)
293            Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
294        }
295
296        // finally: prepare the return struct
297        if (charsetName == null)
298            charsetName = defaultCharsetName;
299        Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
300        return new CharsetDoc(charset, doc, input);
301    }
302
303    static Document parseInputStream(CharsetDoc charsetDoc, String baseUri, Parser parser) throws IOException {
304        // if doc != null it was fully parsed during charset detection; so just return that
305        if (charsetDoc.doc != null)
306            return charsetDoc.doc;
307
308        final InputStream input = charsetDoc.input;
309        Validate.notNull(input);
310        final Document doc;
311        final Charset charset = charsetDoc.charset;
312        try (Reader reader = new InputStreamReader(input, charset)) {
313            try {
314                doc = parser.parseInput(reader, baseUri);
315            } catch (UncheckedIOException e) {
316                // io exception when parsing (not seen before because reading the stream as we go)
317                throw e.getCause();
318            }
319            doc.outputSettings().charset(charset);
320            if (!charset.canEncode()) {
321                // some charsets can read but not encode; switch to an encodable charset and update the meta el
322                doc.charset(UTF_8);
323            }
324        }
325        return doc;
326    }
327
328    /**
329     * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
330     * method is executing on. The data read until being interrupted will be available.
331     * @param inStream the input stream to read from
332     * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
333     * @return the filled byte buffer
334     * @throws IOException if an exception occurs whilst reading from the input stream.
335     */
336    public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
337        return ControllableInputStream.readToByteBuffer(inStream, maxSize);
338    }
339
340    static ByteBuffer emptyByteBuffer() {
341        return ByteBuffer.allocate(0);
342    }
343
344    /**
345     * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
346     * will kick in.)
347     * @param contentType e.g. "text/html; charset=EUC-JP"
348     * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
349     */
350    static @Nullable String getCharsetFromContentType(@Nullable String contentType) {
351        if (contentType == null) return null;
352        Matcher m = charsetPattern.matcher(contentType);
353        if (m.find()) {
354            String charset = m.group(1).trim();
355            charset = charset.replace("charset=", "");
356            return validateCharset(charset);
357        }
358        return null;
359    }
360
361    private @Nullable static String validateCharset(@Nullable String cs) {
362        if (cs == null || cs.length() == 0) return null;
363        cs = cs.trim().replaceAll("[\"']", "");
364        try {
365            if (Charset.isSupported(cs)) return cs;
366            cs = cs.toUpperCase(Locale.ENGLISH);
367            if (Charset.isSupported(cs)) return cs;
368        } catch (IllegalCharsetNameException e) {
369            // if all this charset matching fails.... we just take the default
370        }
371        return null;
372    }
373
374    /**
375     * Creates a random string, suitable for use as a mime boundary
376     */
377    static String mimeBoundary() {
378        final StringBuilder mime = StringUtil.borrowBuilder();
379        final Random rand = new Random();
380        for (int i = 0; i < boundaryLength; i++) {
381            mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
382        }
383        return StringUtil.releaseBuilder(mime);
384    }
385
386    private static @Nullable String detectCharsetFromBom(ControllableInputStream input) throws IOException {
387        byte[] bom = new byte[4];
388        input.mark(bom.length);
389        //noinspection ResultOfMethodCallIgnored
390        input.read(bom, 0, 4);
391        input.reset();
392
393        // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
394        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
395            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
396            return "UTF-32"; // and I hope it's on your system
397        } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
398            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
399            return "UTF-16"; // in all Javas
400        } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
401            input.read(bom, 0, 3); // consume the UTF-8 BOM
402            return "UTF-8"; // in all Javas
403        }
404        return null;
405    }
406}