/*
 * Decompiled with CFR 0.152.
 */
package org.apache.tika.parser.ocr;

import java.awt.Graphics2D;
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.awt.image.RenderedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
import org.apache.tika.config.TikaTaskTimeout;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractExternalProcessParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.ocr.ImagePreprocessor;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;

public class TesseractOCRParser
extends AbstractExternalProcessParser
implements Initializable {
    // Prefix shared by the names of all metadata properties declared below.
    public static final String TESS_META = "tess:";
    // Not written anywhere in this class; presumably populated by ImagePreprocessor -- TODO confirm.
    public static final Property IMAGE_ROTATION = Property.externalRealSeq((String)"tess:rotation");
    // Not written anywhere in this class; presumably populated by ImagePreprocessor -- TODO confirm.
    public static final Property IMAGE_MAGICK = Property.externalBooleanSeq((String)"tess:image_magick_processed");
    // Name of the environment variable set on child processes by setEnv().
    private static final String TESSDATA_PREFIX = "TESSDATA_PREFIX";
    // The PSM0_* properties are filled by extractOSD() from the ".osd" output produced in page segmentation mode "0".
    public static final Property PSM0_PAGE_NUMBER = Property.externalInteger((String)"tess:page_number");
    public static final Property PSM0_ORIENTATION = Property.externalInteger((String)"tess:orientation");
    public static final Property PSM0_ROTATE = Property.externalInteger((String)"tess:rotate");
    public static final Property PSM0_ORIENTATION_CONFIDENCE = Property.externalReal((String)"tess:orientation_confidence");
    public static final Property PSM0_SCRIPT = Property.externalText((String)"tess:script");
    public static final Property PSM0_SCRIPT_CONFIDENCE = Property.externalReal((String)"tess:script_confidence");
    // Not referenced in this class; the "ocr-" subtype prefix is spelled out literally in SUPPORTED_TYPES.
    private static final String OCR = "ocr-";
    private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRParser.class);
    // Monitor object used by hasWarned().
    private static final Object[] LOCK = new Object[0];
    private static final long serialVersionUID = -8167538283213097265L;
    // Media types returned by getSupportedTypes() when tesseract is available and OCR is not skipped.
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(MediaType.image((String)"ocr-png"), MediaType.image((String)"ocr-jpeg"), MediaType.image((String)"ocr-tiff"), MediaType.image((String)"ocr-bmp"), MediaType.image((String)"ocr-gif"), MediaType.image((String)"jp2"), MediaType.image((String)"jpx"), MediaType.image((String)"x-portable-pixmap"), MediaType.image((String)"ocr-jp2"), MediaType.image((String)"ocr-jpx"), MediaType.image((String)"ocr-x-portable-pixmap"))));
    // Process-wide flag: set by warn(), read by hasWarned(), so the notice is logged on the first parse only.
    private static volatile boolean HAS_WARNED = false;
    // Language names collected by getLangs(Process, int) from "tesseract --list-langs"; stays empty unless preloadLangs is enabled.
    private final Set<String> langs = new HashSet<String>();
    // Parser-level configuration; a per-parse TesseractOCRConfig from the ParseContext is merged on top of it.
    private final TesseractOCRConfig defaultConfig = new TesseractOCRConfig();
    // Directory prefixes; the setters normalize them and append a trailing File.separator when non-empty.
    private String tesseractPath = "";
    private String tessdataPath = "";
    private String imageMagickPath = "";
    // When true, initialize() runs "tesseract --list-langs" and validates the default language.
    private boolean preloadLangs = false;
    // Results of the executable checks performed in initialize().
    private boolean hasTesseract;
    private boolean hasImageMagick;
    // Created in initialize() with the ImageMagick executable path.
    private ImagePreprocessor imagePreprocessor;

    /**
     * Returns the ImageMagick executable name for the current operating system.
     *
     * @return {@code "magick"} when the {@code os.name} system property starts with
     *         {@code "Windows"}, otherwise {@code "convert"}
     */
    public static String getImageMagickProg() {
        String osName = System.getProperty("os.name");
        if (osName.startsWith("Windows")) {
            return "magick";
        }
        return "convert";
    }

    /**
     * Returns the tesseract executable name for the current operating system.
     *
     * @return {@code "tesseract.exe"} when the {@code os.name} system property starts with
     *         {@code "Windows"}, otherwise {@code "tesseract"}
     */
    public static String getTesseractProg() {
        String osName = System.getProperty("os.name");
        if (osName.startsWith("Windows")) {
            return "tesseract.exe";
        }
        return "tesseract";
    }

    /**
     * Reports the media types this parser handles for the given context.
     *
     * @param context parse context that may carry a {@link TesseractOCRConfig}
     * @return the supported image types when tesseract was found during initialization and
     *         the context's config (if any) does not ask to skip OCR; otherwise an empty set
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        TesseractOCRConfig config = (TesseractOCRConfig) context.get(TesseractOCRConfig.class);
        if (!this.hasTesseract || (config != null && config.isSkipOcr())) {
            return Collections.emptySet();
        }
        return SUPPORTED_TYPES;
    }

    /**
     * Sets {@code TESSDATA_PREFIX} in the environment of the process to be started.
     * The configured tessdata path wins; otherwise, if a tesseract path is configured,
     * {@code <tesseractPath>tessdata} is used. With neither set the environment is left alone.
     *
     * @param pb process builder whose environment is updated in place
     */
    private void setEnv(ProcessBuilder pb) {
        Map<String, String> environment = pb.environment();
        String tessdata = this.getTessdataPath();
        if (!StringUtils.isBlank(tessdata)) {
            environment.put(TESSDATA_PREFIX, tessdata);
            return;
        }
        String tesseractDir = this.getTesseractPath();
        if (!StringUtils.isBlank(tesseractDir)) {
            environment.put(TESSDATA_PREFIX, tesseractDir + "tessdata");
        }
    }

    /**
     * Checks whether the tesseract executable can be run.
     *
     * @return the result of {@code ExternalParser.check} on the tesseract command
     * @throws TikaConfigException if a non-blank tesseract path is configured but is not
     *                             an existing directory
     */
    public boolean hasTesseract() throws TikaConfigException {
        String executable = this.getTesseractPath() + TesseractOCRParser.getTesseractProg();
        boolean pathConfigured = !StringUtils.isBlank(this.tesseractPath);
        if (pathConfigured && !Files.isDirectory(Paths.get(this.tesseractPath))) {
            throw new TikaConfigException("tesseractPath (" + this.tesseractPath + ") doesn't point to an existing directory");
        }
        String[] checkCmd = {executable};
        boolean found = ExternalParser.check(checkCmd, new int[0]);
        LOG.debug("hasTesseract (path: " + Arrays.toString(checkCmd) + "): " + found);
        return found;
    }

    /**
     * Checks whether the ImageMagick executable can be run.
     *
     * @return the result of {@code ExternalParser.check} on the ImageMagick command
     * @throws TikaConfigException if a non-blank ImageMagick path is configured but is not
     *                             an existing directory
     */
    boolean hasImageMagick() throws TikaConfigException {
        String executable = this.imageMagickPath + TesseractOCRParser.getImageMagickProg();
        boolean pathConfigured = !StringUtils.isBlank(this.imageMagickPath);
        if (pathConfigured && !Files.isDirectory(Paths.get(this.imageMagickPath))) {
            throw new TikaConfigException("imageMagickPath (" + this.imageMagickPath + ") doesn't point to an existing directory");
        }
        boolean found = ExternalParser.check(new String[]{executable}, new int[0]);
        if (found) {
            return true;
        }
        LOG.debug("ImageMagick does not appear to be installed (commandline: " + executable + ")");
        return false;
    }

    /**
     * Runs OCR on an in-memory AWT image by rendering it to a temporary PNG file and
     * delegating to {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}.
     *
     * @param image    image to OCR; its dimensions must already be known
     * @param handler  receiver of the extracted content
     * @param metadata metadata passed through to the stream-based parse
     * @param context  parse context passed through to the stream-based parse
     * @throws IOException   if the temporary PNG cannot be written or read
     * @throws SAXException  if the handler rejects an event
     * @throws TikaException if the image dimensions are not available, or the delegate parse fails
     */
    public void parse(Image image, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        if (w <= 0 || h <= 0) {
            // getWidth/getHeight report -1 while the size is unknown; BufferedImage would
            // otherwise fail with an opaque IllegalArgumentException.
            throw new TikaException("Image dimensions are not available (width=" + w + ", height=" + h + ")");
        }
        BufferedImage bImage = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
        // Copy the source pixels into the buffer; without this step a blank image is written and OCRed.
        Graphics2D graphics = bImage.createGraphics();
        try {
            graphics.drawImage(image, 0, 0, null);
        } finally {
            graphics.dispose();
        }
        try (TemporaryResources tmp = new TemporaryResources()) {
            File file = tmp.createTemporaryFile();
            try (FileOutputStream fos = new FileOutputStream(file)) {
                ImageIO.write((RenderedImage) bImage, "png", fos);
            }
            try (TikaInputStream tis = TikaInputStream.get(file)) {
                this.parse((InputStream) tis, handler, metadata, context);
            }
        }
    }

    /**
     * Runs OCR on the given stream and writes the result to {@code handler} as XHTML.
     * Returns without emitting anything when tesseract was not found during initialization
     * or the effective configuration asks to skip OCR.
     *
     * @param stream       image data
     * @param handler      receiver of the extracted content
     * @param metadata     document metadata
     * @param parseContext context; a {@link TesseractOCRConfig} found here is merged over the defaults
     * @throws IOException   on I/O failure
     * @throws SAXException  if the handler rejects an event
     * @throws TikaException if OCR fails
     */
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        TesseractOCRConfig userConfig = (TesseractOCRConfig) parseContext.get(TesseractOCRConfig.class);
        TesseractOCRConfig config = userConfig == null
                ? this.defaultConfig
                : this.defaultConfig.cloneAndUpdate(userConfig);
        if (!this.hasTesseract) {
            return;
        }
        if (config != null && config.isSkipOcr()) {
            return;
        }
        try (TemporaryResources tmp = new TemporaryResources()) {
            TikaInputStream tikaStream = TikaInputStream.get(stream, tmp, metadata);
            // Called for its side effect only; the returned path is not used here.
            tikaStream.getPath();
            File ocrOutputBase = tmp.createTemporaryFile();
            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();
            this.parse(tikaStream, ocrOutputBase, (ContentHandler) xhtml, metadata, parseContext, config);
            xhtml.endDocument();
        }
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Runs tesseract on the spooled input and forwards its output to the handler or metadata.
     * Files outside the configured min/max size range are silently skipped.
     *
     * @param tikaInputStream  input whose path is handed to tesseract
     * @param tmpOCROutputFile output base path; the actual output file is this path plus
     *                         {@code "." + extension}
     * @param xhtml            receiver of text or hOCR output
     * @param metadata         receiver of OSD values in page segmentation mode "0"
     * @param parseContext     context used for the timeout lookup and hOCR SAX parsing
     * @param config           effective OCR configuration
     * @throws IOException   on I/O failure
     * @throws SAXException  if the handler rejects an event
     * @throws TikaException if the language is invalid or the OCR process fails
     */
    private void parse(TikaInputStream tikaInputStream, File tmpOCROutputFile, ContentHandler xhtml, Metadata metadata, ParseContext parseContext, TesseractOCRConfig config) throws IOException, SAXException, TikaException {
        this.warnOnFirstParse();
        this.validateLangString(config.getLanguage());
        Path input = tikaInputStream.getPath();
        long size = tikaInputStream.getLength();
        if (size < config.getMinFileSizeToOcr() || size > config.getMaxFileSizeToOcr()) {
            return;
        }
        boolean osdOnly = "0".equals(config.getPageSegMode());
        String extension = osdOnly ? "osd" : config.getOutputType().toString().toLowerCase(Locale.US);
        // Resolve the output file before OCR runs so the finally block removes it even when
        // doOCR throws (e.g. on timeout) after tesseract has already created it.
        File tmpTxtOutput = new File(tmpOCROutputFile.getAbsolutePath() + "." + extension);
        try {
            if (config.isEnableImagePreprocessing() || config.isApplyRotation()) {
                if (!this.hasImageMagick) {
                    LOG.warn("User has selected to preprocess images, but I can't find ImageMagick.Backing off to original file.");
                    this.doOCR(input.toFile(), tmpOCROutputFile, config, parseContext);
                } else {
                    try (TemporaryResources tmp = new TemporaryResources()) {
                        // Preprocess a copy in place so the caller's input file is left untouched.
                        Path tmpFile = tmp.createTempFile();
                        Files.copy(input, tmpFile, StandardCopyOption.REPLACE_EXISTING);
                        this.imagePreprocessor.process(tmpFile, tmpFile, metadata, config);
                        this.doOCR(tmpFile.toFile(), tmpOCROutputFile, config, parseContext);
                    }
                }
            } else {
                this.doOCR(input.toFile(), tmpOCROutputFile, config, parseContext);
            }
            if (!tmpTxtOutput.exists()) {
                return;
            }
            try (FileInputStream is = new FileInputStream(tmpTxtOutput)) {
                if (osdOnly) {
                    this.extractOSD(is, metadata);
                } else if (config.getOutputType().equals((Object) TesseractOCRConfig.OUTPUT_TYPE.HOCR)) {
                    this.extractHOCROutput(is, parseContext, xhtml);
                } else {
                    this.extractOutput(is, xhtml);
                }
            }
        } finally {
            tmpTxtOutput.delete();
        }
    }

    /**
     * Reads {@code key: value} lines from the stream (UTF-8) and stores the recognised
     * keys in the corresponding {@code PSM0_*} metadata properties. Lines that do not match
     * the pattern are skipped; unrecognised keys are logged at warn level.
     *
     * @param is       stream to read; closed when this method returns
     * @param metadata receiver of the parsed values
     * @throws IOException if reading fails
     */
    private void extractOSD(InputStream is, Metadata metadata) throws IOException {
        Pattern keyValue = Pattern.compile("^([^:]+):\\s+(.*)");
        Matcher matcher = keyValue.matcher("");
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!matcher.reset(line).find()) {
                    continue;
                }
                String key = matcher.group(1);
                String value = matcher.group(2);
                if ("Page number".equals(key)) {
                    metadata.set(PSM0_PAGE_NUMBER, Integer.parseInt(value));
                } else if ("Orientation in degrees".equals(key)) {
                    metadata.set(PSM0_ORIENTATION, Integer.parseInt(value));
                } else if ("Rotate".equals(key)) {
                    metadata.set(PSM0_ROTATE, Integer.parseInt(value));
                } else if ("Orientation confidence".equals(key)) {
                    metadata.set(PSM0_ORIENTATION_CONFIDENCE, Double.parseDouble(value));
                } else if ("Script".equals(key)) {
                    metadata.set(PSM0_SCRIPT, value);
                } else if ("Script confidence".equals(key)) {
                    metadata.set(PSM0_SCRIPT_CONFIDENCE, Double.parseDouble(value));
                } else {
                    LOG.warn("I regret I don't know how to parse {} with value {}", (Object) key, (Object) value);
                }
            }
        }
    }

    /**
     * Calls {@link #warn()} unless {@link #hasWarned()} already reports true.
     */
    private void warnOnFirstParse() {
        if (this.hasWarned()) {
            return;
        }
        this.warn();
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Builds the tesseract command line, starts the process and waits for it to finish.
     * The command is {@code tesseract <input> <output> --psm <mode>}; unless the mode is
     * {@code "0"} it is followed by the optional {@code -l <language>}, one {@code -c key=value}
     * per extra setting, the page separator and interword-spacing settings, and the
     * lower-cased output type.
     *
     * @param input        image file handed to tesseract
     * @param output       output base path handed to tesseract
     * @param config       effective OCR configuration
     * @param parseContext context consulted for a task timeout override
     * @throws IOException   if the process cannot be started
     * @throws TikaException on timeout, interruption or a positive exit value
     */
    private void doOCR(File input, File output, TesseractOCRConfig config, ParseContext parseContext) throws IOException, TikaException {
        List<String> cmd = new ArrayList<String>(Arrays.asList(this.getTesseractPath() + TesseractOCRParser.getTesseractProg(), input.getPath(), output.getPath(), "--psm", config.getPageSegMode()));
        if (!"0".equals(config.getPageSegMode())) {
            if (!StringUtils.isBlank(config.getLanguage())) {
                cmd.add("-l");
                cmd.add(config.getLanguage());
            }
            for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
                cmd.add("-c");
                cmd.add(entry.getKey() + "=" + entry.getValue());
            }
            cmd.addAll(Arrays.asList("-c", "page_separator=" + config.getPageSeparator(), "-c", config.isPreserveInterwordSpacing() ? "preserve_interword_spaces=1" : "preserve_interword_spaces=0", config.getOutputType().name().toLowerCase(Locale.US)));
        }
        LOG.debug("Tesseract command: " + String.join((CharSequence) " ", cmd));
        ProcessBuilder pb = new ProcessBuilder(cmd);
        this.setEnv(pb);
        Process process = null;
        String id = null;
        // Multiply as long: an int product overflows for timeouts above ~2.1 million seconds.
        long defaultTimeoutMillis = config.getTimeoutSeconds() * 1000L;
        long timeoutMillis = TikaTaskTimeout.getTimeoutMillis(parseContext, defaultTimeoutMillis);
        try {
            process = pb.start();
            id = this.register(process);
            this.runOCRProcess(process, timeoutMillis);
        } finally {
            // Always kill the child and release its registration, including on timeout.
            if (process != null) {
                process.destroyForcibly();
            }
            if (id != null) {
                this.release(id);
            }
        }
    }

    /**
     * Waits for an already started OCR process while draining its stdout and stderr on
     * two helper threads.
     *
     * @param process       running process; its stdin is closed immediately
     * @param timeoutMillis maximum time to wait for the process to exit
     * @throws IOException   if closing the process's stdin fails
     * @throws TikaException if the wait times out or is interrupted, or the exit value is
     *                       positive (the message then includes the captured stderr)
     */
    private void runOCRProcess(Process process, long timeoutMillis) throws IOException, TikaException {
        process.getOutputStream().close();
        InputStream out = process.getInputStream();
        InputStream err = process.getErrorStream();
        StringBuilder outBuilder = new StringBuilder();
        StringBuilder errBuilder = new StringBuilder();
        Thread outThread = this.logStream(out, outBuilder);
        Thread errThread = this.logStream(err, errBuilder);
        outThread.start();
        errThread.start();
        int exitValue = Integer.MIN_VALUE;
        try {
            boolean finished = process.waitFor(timeoutMillis, TimeUnit.MILLISECONDS);
            if (!finished) {
                throw new TikaException("TesseractOCRParser timeout");
            }
            exitValue = process.exitValue();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new TikaException("TesseractOCRParser interrupted", (Throwable) e);
        } catch (IllegalThreadStateException e) {
            throw new TikaException("TesseractOCRParser timeout");
        }
        if (exitValue > 0) {
            try {
                // Give the stderr reader a moment to finish so the message is complete.
                errThread.join(1000L);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to callers; the failure is still reported below.
                Thread.currentThread().interrupt();
            }
            throw new TikaException("TesseractOCRParser bad exit value " + exitValue + " err msg: " + errBuilder.toString());
        }
    }

    /**
     * Copies the stream's UTF-8 text into the handler, wrapped in a
     * {@code <div class="ocr">} element.
     *
     * @param stream text output to read; closed when the copy finishes
     * @param xhtml  receiver of the element and character events
     * @throws SAXException if the handler rejects an event
     * @throws IOException  if reading fails
     */
    private void extractOutput(InputStream stream, ContentHandler xhtml) throws SAXException, IOException {
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "ocr");
        xhtml.startElement("http://www.w3.org/1999/xhtml", "div", "div", divAttributes);
        try (InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8)) {
            char[] chunk = new char[1024];
            for (int read = reader.read(chunk); read != -1; read = reader.read(chunk)) {
                if (read > 0) {
                    xhtml.characters(chunk, 0, read);
                }
            }
        }
        xhtml.endElement("http://www.w3.org/1999/xhtml", "div", "div");
    }

    /**
     * SAX-parses hOCR output and forwards it through a {@code HOCRPassThroughHandler},
     * wrapped in a {@code <div class="ocr">} element.
     *
     * @param is           hOCR output to parse
     * @param parseContext context for the SAX parse; a fresh one is used when {@code null}
     * @param xhtml        receiver of the forwarded events
     * @throws TikaException if the SAX parse fails
     * @throws IOException   if reading fails
     * @throws SAXException  if the handler rejects an event
     */
    private void extractHOCROutput(InputStream is, ParseContext parseContext, ContentHandler xhtml) throws TikaException, IOException, SAXException {
        ParseContext effectiveContext = parseContext != null ? parseContext : new ParseContext();
        AttributesImpl divAttributes = new AttributesImpl();
        divAttributes.addAttribute("", "class", "class", "CDATA", "ocr");
        xhtml.startElement("http://www.w3.org/1999/xhtml", "div", "div", divAttributes);
        ContentHandler passThrough = new HOCRPassThroughHandler(xhtml);
        XMLReaderUtils.parseSAX(is, passThrough, effectiveContext);
        xhtml.endElement("http://www.w3.org/1999/xhtml", "div", "div");
    }

    /**
     * Creates (but does not start) a thread that reads the stream as UTF-8 into {@code out},
     * closes the stream, and finally logs the collected text at debug level.
     *
     * @param stream stream to drain
     * @param out    buffer that receives everything read
     * @return the unstarted reader thread
     */
    private Thread logStream(InputStream stream, StringBuilder out) {
        Runnable drain = () -> {
            InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
            char[] chunk = new char[1024];
            try {
                int read;
                while ((read = reader.read(chunk)) != -1) {
                    out.append(chunk, 0, read);
                }
            } catch (IOException ignored) {
                // Read failures are deliberately ignored; whatever was collected is still logged.
            } finally {
                IOUtils.closeQuietly((InputStream) stream);
            }
            LOG.debug("{}", (Object) out);
        };
        return new Thread(drain);
    }

    /**
     * Probes for the tesseract and ImageMagick executables, optionally preloads the
     * available languages and validates the default language, then creates the image
     * preprocessor.
     *
     * @param params not used by this implementation
     * @throws TikaConfigException if a configured path is not a directory or the default
     *                             language fails validation
     */
    public void initialize(Map<String, Param> params) throws TikaConfigException {
        this.hasTesseract = this.hasTesseract();
        this.hasImageMagick = this.hasImageMagick();
        if (this.preloadLangs) {
            this.preloadLangs();
            String defaultLanguage = this.defaultConfig.getLanguage();
            if (!StringUtils.isBlank(defaultLanguage)) {
                this.validateLangString(defaultLanguage);
            }
        }
        String imageMagickExecutable = this.getImageMagickPath() + TesseractOCRParser.getImageMagickProg();
        this.imagePreprocessor = new ImagePreprocessor(imageMagickExecutable);
    }

    /**
     * Validates a language string: every code must be accepted by
     * {@code TesseractOCRConfig.getLangs} and, when languages have been preloaded, must be
     * among them.
     *
     * @param language language string to validate
     * @throws TikaConfigException if any code is invalid or not available
     */
    private void validateLangString(String language) throws TikaConfigException {
        HashSet<String> rejected = new HashSet<>();
        HashSet<String> accepted = new HashSet<>();
        TesseractOCRConfig.getLangs(language, accepted, rejected);
        if (!rejected.isEmpty()) {
            throw new TikaConfigException("Invalid language code(s): " + rejected);
        }
        if (this.langs.isEmpty()) {
            // Nothing preloaded, so availability cannot be checked.
            return;
        }
        for (String lang : accepted) {
            if (!this.langs.contains(lang)) {
                throw new TikaConfigException("tesseract does not have " + lang + " available. I see only: " + this.langs);
            }
        }
    }

    /**
     * Verifies that the default language is usable with the preloaded tesseract languages.
     * The check is skipped when no languages were preloaded or no default language is set.
     * It delegates to {@code validateLangString} so that it applies the same rules as
     * {@code initialize} and the per-parse validation, rather than comparing the whole
     * language string against the set of individual language names.
     *
     * @param problemHandler not used by this implementation
     * @throws TikaConfigException if the default language is invalid or not available
     */
    public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException {
        String language = this.defaultConfig.getLanguage();
        if (this.langs.isEmpty() || StringUtils.isBlank(language)) {
            return;
        }
        this.validateLangString(language);
    }

    /**
     * Returns the language names collected from tesseract.
     *
     * @return the parser's internal language set itself, not a copy; empty unless languages
     *         were preloaded
     */
    public Set<String> getLangs() {
        return langs;
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Reports whether the first-use notice has already been logged.
     *
     * @return the current value of the shared {@code HAS_WARNED} flag
     */
    protected boolean hasWarned() {
        if (HAS_WARNED) {
            return true;
        }
        // Re-read the flag while holding LOCK.
        synchronized (LOCK) {
            return HAS_WARNED;
        }
    }

    /**
     * Logs the notice that tesseract is being invoked and records that it has been logged.
     */
    protected void warn() {
        final String notice = "Tesseract is installed and is being invoked. This can add greatly to processing time.  If you do not want tesseract to be applied to your files see: https://cwiki.apache.org/confluence/display/TIKA/TikaOCR#TikaOCR-disable-ocr";
        LOG.info(notice);
        HAS_WARNED = true;
    }

    /**
     * @return the configured tesseract directory, as stored by {@code setTesseractPath}
     */
    public String getTesseractPath() {
        return tesseractPath;
    }

    /**
     * Sets the directory containing the tesseract executable. The path is normalized with
     * {@code FilenameUtils.normalize} and a trailing {@code File.separator} is appended
     * when it is non-empty.
     *
     * @param tesseractPath directory path; {@code null} is treated as "not configured"
     * @throws IllegalArgumentException if the path cannot be normalized
     */
    @Field
    public void setTesseractPath(String tesseractPath) {
        if (tesseractPath == null) {
            this.tesseractPath = "";
            return;
        }
        // normalize() returns null for paths it rejects; fail clearly instead of with an NPE.
        String normalized = FilenameUtils.normalize(tesseractPath);
        if (normalized == null) {
            throw new IllegalArgumentException("Invalid tesseractPath: " + tesseractPath);
        }
        if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
            normalized = normalized + File.separator;
        }
        this.tesseractPath = normalized;
    }

    /**
     * @return the configured tessdata directory, as stored by {@code setTessdataPath}
     */
    public String getTessdataPath() {
        return tessdataPath;
    }

    /**
     * Sets the tessdata directory exported as {@code TESSDATA_PREFIX}. The path is
     * normalized with {@code FilenameUtils.normalize} and a trailing {@code File.separator}
     * is appended when it is non-empty.
     *
     * @param tessdataPath directory path; {@code null} is treated as "not configured"
     * @throws IllegalArgumentException if the path cannot be normalized
     */
    @Field
    public void setTessdataPath(String tessdataPath) {
        if (tessdataPath == null) {
            this.tessdataPath = "";
            return;
        }
        // normalize() returns null for paths it rejects; fail clearly instead of with an NPE.
        String normalized = FilenameUtils.normalize(tessdataPath);
        if (normalized == null) {
            throw new IllegalArgumentException("Invalid tessdataPath: " + tessdataPath);
        }
        if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
            normalized = normalized + File.separator;
        }
        this.tessdataPath = normalized;
    }

    /**
     * @return the configured ImageMagick directory, as stored by {@code setImageMagickPath}
     */
    public String getImageMagickPath() {
        return imageMagickPath;
    }

    /**
     * Sets the directory containing the ImageMagick executable. The path is normalized with
     * {@code FilenameUtils.normalize} and a trailing {@code File.separator} is appended
     * when it is non-empty.
     *
     * @param imageMagickPath directory path; {@code null} is treated as "not configured"
     * @throws IllegalArgumentException if the path cannot be normalized
     */
    @Field
    public void setImageMagickPath(String imageMagickPath) {
        if (imageMagickPath == null) {
            this.imageMagickPath = "";
            return;
        }
        // normalize() returns null for paths it rejects; fail clearly instead of with an NPE.
        String normalized = FilenameUtils.normalize(imageMagickPath);
        if (normalized == null) {
            throw new IllegalArgumentException("Invalid imageMagickPath: " + imageMagickPath);
        }
        if (!normalized.isEmpty() && !normalized.endsWith(File.separator)) {
            normalized = normalized + File.separator;
        }
        this.imageMagickPath = normalized;
    }

    /**
     * Adds extra tesseract settings to the default configuration.
     *
     * @param settings entries of the form {@code "key value"}, separated by whitespace
     * @throws TikaConfigException if an entry does not split into exactly two tokens
     */
    @Field
    public void setOtherTesseractSettings(List<String> settings) throws TikaConfigException {
        for (String setting : settings) {
            String[] keyValue = setting.trim().split("\\s+");
            if (keyValue.length == 2) {
                this.defaultConfig.addOtherTesseractConfig(keyValue[0], keyValue[1]);
                continue;
            }
            throw new TikaConfigException("Expected space delimited key value pair. However, I found " + keyValue.length + " bits.");
        }
    }

    /**
     * Returns the extra tesseract settings of the default configuration.
     *
     * @return a new list of {@code "key value"} strings, ordered by key
     */
    public List<String> getOtherTesseractSettings() {
        Map<String, String> sorted = new TreeMap<>(this.defaultConfig.getOtherTesseractConfig());
        List<String> settings = new ArrayList<>();
        sorted.forEach((key, value) -> settings.add(key + " " + value));
        return settings;
    }

    /** Sets the skip-OCR flag on the default configuration. */
    @Field
    public void setSkipOCR(boolean skipOCR) {
        defaultConfig.setSkipOcr(skipOCR);
    }

    /** @return the skip-OCR flag of the default configuration */
    public boolean isSkipOCR() {
        return defaultConfig.isSkipOcr();
    }

    /** Sets the language string on the default configuration. */
    @Field
    public void setLanguage(String language) {
        defaultConfig.setLanguage(language);
    }

    /** @return the language string of the default configuration */
    public String getLanguage() {
        return defaultConfig.getLanguage();
    }

    /** Sets the page segmentation mode (passed to tesseract as {@code --psm}) on the default configuration. */
    @Field
    public void setPageSegMode(String pageSegMode) {
        defaultConfig.setPageSegMode(pageSegMode);
    }

    /** @return the page segmentation mode of the default configuration */
    public String getPageSegMode() {
        return defaultConfig.getPageSegMode();
    }

    /** Sets the maximum input size that will be OCRed on the default configuration. */
    @Field
    public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
        defaultConfig.setMaxFileSizeToOcr(maxFileSizeToOcr);
    }

    /** @return the maximum input size that will be OCRed, from the default configuration */
    public long getMaxFileSizeToOcr() {
        return defaultConfig.getMaxFileSizeToOcr();
    }

    /** Sets the minimum input size that will be OCRed on the default configuration. */
    @Field
    public void setMinFileSizeToOcr(long minFileSizeToOcr) {
        defaultConfig.setMinFileSizeToOcr(minFileSizeToOcr);
    }

    /** @return the minimum input size that will be OCRed, from the default configuration */
    public long getMinFileSizeToOcr() {
        return defaultConfig.getMinFileSizeToOcr();
    }

    /** Sets the OCR timeout, in seconds, on the default configuration. */
    @Field
    public void setTimeout(int timeout) {
        defaultConfig.setTimeoutSeconds(timeout);
    }

    /** @return the OCR timeout in seconds of the default configuration */
    public int getTimeout() {
        return defaultConfig.getTimeoutSeconds();
    }

    /** Sets the output type, given by name, on the default configuration. */
    @Field
    public void setOutputType(String outputType) {
        defaultConfig.setOutputType(outputType);
    }

    /** @return the enum name of the default configuration's output type */
    public String getOutputType() {
        return defaultConfig.getOutputType().name();
    }

    /** Sets the preserve-interword-spacing flag on the default configuration. */
    @Field
    public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
        defaultConfig.setPreserveInterwordSpacing(preserveInterwordSpacing);
    }

    /** @return the preserve-interword-spacing flag of the default configuration */
    public boolean isPreserveInterwordSpacing() {
        return defaultConfig.isPreserveInterwordSpacing();
    }

    /** Sets the image-preprocessing flag on the default configuration. */
    @Field
    public void setEnableImagePreprocessing(boolean enableImagePreprocessing) {
        defaultConfig.setEnableImagePreprocessing(enableImagePreprocessing);
    }

    /** @return the image-preprocessing flag of the default configuration */
    public boolean isEnableImagePreprocessing() {
        return defaultConfig.isEnableImagePreprocessing();
    }

    /** Sets the density value on the default configuration. */
    @Field
    public void setDensity(int density) {
        defaultConfig.setDensity(density);
    }

    /** @return the density value of the default configuration */
    public int getDensity() {
        return defaultConfig.getDensity();
    }

    /** Sets the depth value on the default configuration. */
    @Field
    public void setDepth(int depth) {
        defaultConfig.setDepth(depth);
    }

    /** @return the depth value of the default configuration */
    public int getDepth() {
        return defaultConfig.getDepth();
    }

    /** Sets the colorspace value on the default configuration. */
    @Field
    public void setColorspace(String colorspace) {
        defaultConfig.setColorspace(colorspace);
    }

    /** @return the colorspace value of the default configuration */
    public String getColorspace() {
        return defaultConfig.getColorspace();
    }

    /** Sets the filter value on the default configuration. */
    @Field
    public void setFilter(String filter) {
        defaultConfig.setFilter(filter);
    }

    /** @return the filter value of the default configuration */
    public String getFilter() {
        return defaultConfig.getFilter();
    }

    /** Sets the resize value on the default configuration. */
    @Field
    public void setResize(int resize) {
        defaultConfig.setResize(resize);
    }

    /** @return the resize value of the default configuration */
    public int getResize() {
        return defaultConfig.getResize();
    }

    /** Sets the apply-rotation flag on the default configuration. */
    @Field
    public void setApplyRotation(boolean applyRotation) {
        defaultConfig.setApplyRotation(applyRotation);
    }

    /** @return the apply-rotation flag of the default configuration */
    public boolean isApplyRotation() {
        return defaultConfig.isApplyRotation();
    }

    /** Sets whether {@code initialize} should preload the languages tesseract reports. */
    @Field
    public void setPreloadLangs(boolean preloadLangs) {
        this.preloadLangs = preloadLangs;
    }

    /** @return whether languages are preloaded during {@code initialize} */
    public boolean isPreloadLangs() {
        return preloadLangs;
    }

    /** @return the parser's default configuration object itself, not a copy */
    public TesseractOCRConfig getDefaultConfig() {
        return defaultConfig;
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    /**
     * Runs {@code tesseract --list-langs} and records the reported languages. Failures are
     * logged at warn level and otherwise ignored; the process is always destroyed.
     */
    private void preloadLangs() {
        String executable = this.getTesseractPath() + TesseractOCRParser.getTesseractProg();
        ProcessBuilder pb = new ProcessBuilder(executable, "--list-langs");
        this.setEnv(pb);
        Process process = null;
        try {
            process = pb.start();
            int timeoutSeconds = this.defaultConfig.getTimeoutSeconds();
            this.getLangs(process, timeoutSeconds);
        } catch (IOException | TikaException e) {
            LOG.warn("Problem preloading langs", e);
        } finally {
            if (process != null) {
                process.destroyForcibly();
            }
        }
    }

    /**
     * Waits for a {@code --list-langs} process and adds each language name printed on its
     * stdout to {@code langs}. The "List of available..." header and blank lines are skipped.
     *
     * @param process        running process; its stdin is closed immediately
     * @param timeoutSeconds maximum time to wait for the process to exit
     * @throws IOException   if closing the process's stdin fails
     * @throws TikaException if the wait times out or is interrupted, or the exit value is positive
     */
    private void getLangs(Process process, int timeoutSeconds) throws IOException, TikaException {
        process.getOutputStream().close();
        InputStream out = process.getInputStream();
        InputStream err = process.getErrorStream();
        StringBuilder outBuilder = new StringBuilder();
        StringBuilder errBuilder = new StringBuilder();
        Thread outThread = this.logStream(out, outBuilder);
        Thread errThread = this.logStream(err, errBuilder);
        outThread.start();
        errThread.start();
        int exitValue = Integer.MIN_VALUE;
        try {
            boolean finished = process.waitFor(timeoutSeconds, TimeUnit.SECONDS);
            if (!finished) {
                throw new TikaException("TesseractOCRParser timeout");
            }
            exitValue = process.exitValue();
            // Let the stdout reader finish before its buffer is parsed below.
            outThread.join(1000L);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new TikaException("TesseractOCRParser interrupted", (Throwable) e);
        } catch (IllegalThreadStateException e) {
            throw new TikaException("TesseractOCRParser timeout");
        }
        if (exitValue > 0) {
            throw new TikaException("TesseractOCRParser bad exit value " + exitValue + " err msg: " + errBuilder.toString());
        }
        for (String line : outBuilder.toString().split("[\r\n]+")) {
            String lang = line.trim();
            // Empty output splits to a single "" token; recording it would make langs
            // non-empty and cause every later language validation to fail.
            if (lang.isEmpty() || lang.startsWith("List of available")) {
                continue;
            }
            this.langs.add(lang);
        }
    }

    /**
     * SAX handler that forwards events to a wrapped {@link ContentHandler}, dropping the
     * start and end events of the elements named in {@link #IGNORE}. Character data is
     * always forwarded, including text inside ignored elements.
     */
    private static class HOCRPassThroughHandler
    extends DefaultHandler {
        /** Qualified element names whose start/end events are not forwarded. */
        public static final Set<String> IGNORE = Collections.unmodifiableSet(new HashSet<String>(Arrays.asList("html", "head", "title", "meta", "body")));
        private final ContentHandler delegate;

        /**
         * @param xhtml handler that receives the forwarded events
         */
        public HOCRPassThroughHandler(ContentHandler xhtml) {
            this.delegate = xhtml;
        }

        @Override
        public void startElement(String uri, String local, String name, Attributes attributes) throws SAXException {
            if (IGNORE.contains(name)) {
                return;
            }
            this.delegate.startElement(uri, local, name, attributes);
        }

        @Override
        public void endElement(String uri, String local, String name) throws SAXException {
            if (IGNORE.contains(name)) {
                return;
            }
            this.delegate.endElement(uri, local, name);
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            this.delegate.characters(ch, start, length);
        }
    }
}

