tika-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (TIKA-2584) Tika should have a way to pass arbitrary Tesseract options
Date Thu, 29 Mar 2018 18:59:00 GMT

    [ https://issues.apache.org/jira/browse/TIKA-2584?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16419581#comment-16419581 ]

ASF GitHub Bot commented on TIKA-2584:
--------------------------------------

tballison closed pull request #224: Fix for TIKA-2584 contributed by ewanmellor.
URL: https://github.com/apache/tika/pull/224
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index c8c8bc93e..6ee050156 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -20,7 +20,9 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Serializable;
+import java.util.HashMap;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Properties;
 
 /**
@@ -91,12 +93,18 @@
     // factor by which image is to be scaled.
     private int resize = 900;
 
+    // See setPageSeparator.
+    private String pageSeparator = "";
+
     // whether or not to preserve interword spacing
     private boolean preserveInterwordSpacing = false;
 
     // whether or not to apply rotation calculated by the rotation.py script
     private boolean applyRotation = false;
 
+    // See addOtherTesseractConfig.
+    private Map<String, String> otherTesseractConfig = new HashMap<>();
+
 
     /**
      * Default contructor.
@@ -175,6 +183,7 @@ private void init(InputStream is) {
         setApplyRotation(
         		getProp(props, "applyRotation", getApplyRotation()));
 
+        loadOtherTesseractConfig(props);
     }
 
     /**
@@ -255,6 +264,25 @@ public void setPageSegMode(String pageSegMode) {
         this.pageSegMode = pageSegMode;
     }
 
+    /**
+     * @see #setPageSeparator(String pageSeparator)
+     */
+    public String getPageSeparator() {
+        return pageSeparator;
+    }
+
+    /**
+     * The page separator to use in plain text output.  This corresponds to Tesseract's page_separator config option.
+     * The default here is the empty string (i.e. no page separators).  Note that this is also the default in
+     * Tesseract 3.x, but in Tesseract 4.0 the default is to use the form feed control character.  We are overriding
+     * Tesseract 4.0's default here.
+     *
+     * @param pageSeparator
+     */
+    public void setPageSeparator(String pageSeparator) {
+        this.pageSeparator = pageSeparator;
+    }
+
     /**
      * Whether or not to maintain interword spacing.  Default is <code>false</code>.
      *
@@ -494,6 +522,28 @@ public void setApplyRotation(boolean applyRotation) {
     	this.applyRotation = applyRotation;
     }
 
+    /**
+     * @see #addOtherTesseractConfig(String, String)
+     */
+    public Map<String, String> getOtherTesseractConfig() {
+        return otherTesseractConfig;
+    }
+
+    /**
+     * Add a key-value pair to pass to Tesseract using its -c command line option.
+     * To see the possible options, run tesseract --print-parameters.
+     *
+     * You may also add these parameters in TesseractOCRConfig.properties; any
+     * key-value pair in the properties file where the key contains an underscore
+     * is passed directly to Tesseract.
+     *
+     * @param key
+     * @param value
+     */
+    public void addOtherTesseractConfig(String key, String value) {
+        otherTesseractConfig.put(key, value);
+    }
+
     /**
      * Get property from the properties file passed in.
      *
@@ -543,4 +593,18 @@ private boolean getProp(Properties properties, String property, boolean defaultM
                 property, propVal));
     }
 
+    /**
+     * Populate otherTesseractConfig from the given properties.
+     * This assumes that any key-value pair where the key contains
+     * an underscore is an option to be passed opaquely to Tesseract.
+     *
+     * @param properties properties file to read from.
+     */
+    private void loadOtherTesseractConfig(Properties properties) {
+        for (String k : properties.stringPropertyNames()) {
+            if (k.contains("_")) {
+                otherTesseractConfig.put(k, properties.getProperty(k));
+            }
+        }
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 08847fd74..6bf2ab492 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -34,6 +34,7 @@
 import java.nio.charset.Charset;
 import java.nio.file.Files;
 import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
@@ -465,11 +466,20 @@ public void checkInitialization(InitializableProblemHandler problemHandler)
      *           if an input error occurred
      */
     private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
-        String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
+        ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
+                config.getTesseractPath() + getTesseractProg(), input.getPath(),  output.getPath(), "-l",
                 config.getLanguage(), "-psm", config.getPageSegMode(),
-                config.getOutputType().name().toLowerCase(Locale.US),
+                config.getOutputType().name().toLowerCase(Locale.US)
+        ));
+        for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
+            cmd.add("-c");
+            cmd.add(entry.getKey() + "=" + entry.getValue());
+        }
+        cmd.addAll(Arrays.asList(
+                "-c", "page_separator=" + config.getPageSeparator(),
                 "-c",
-                (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
+                (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"
+        ));
         ProcessBuilder pb = new ProcessBuilder(cmd);
         setEnv(config, pb);
         final Process process = pb.start();


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Tika should have a way to pass arbitrary Tesseract options
> ----------------------------------------------------------
>
>                 Key: TIKA-2584
>                 URL: https://issues.apache.org/jira/browse/TIKA-2584
>             Project: Tika
>          Issue Type: Improvement
>          Components: parser
>    Affects Versions: 1.17
>            Reporter: Ewan Mellor
>            Priority: Minor
>
> Tesseract has a very large number of config options (use tesseract --print-parameters to see them).  There is no mechanism for TesseractOCRParser / TesseractOCRConfig to pass these to Tesseract, and so they cannot be controlled by user code.
> Tika should pass these through as opaque key-value pairs, so that user code can set them as necessary.
>  



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Mime
View raw message