james-server-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From btell...@apache.org
Subject [1/9] james-project git commit: JAMES-2013 Tika version 1.15 might return several objects
Date Thu, 08 Jun 2017 02:46:38 GMT
Repository: james-project
Updated Branches:
  refs/heads/master ae1eac761 -> 255e6cd30


JAMES-2013 Tika version 1.15 might return several objects

Only the first one is pertinent


Project: http://git-wip-us.apache.org/repos/asf/james-project/repo
Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/255e6cd3
Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/255e6cd3
Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/255e6cd3

Branch: refs/heads/master
Commit: 255e6cd3040c157065361276ad68514db51a6078
Parents: 2a6bc08
Author: benwa <btellier@linagora.com>
Authored: Wed Jun 7 18:28:05 2017 +0700
Committer: benwa <btellier@linagora.com>
Committed: Thu Jun 8 09:46:25 2017 +0700

----------------------------------------------------------------------
 .../james/mailbox/tika/TikaTextExtractor.java   |  2 +-
 .../mailbox/tika/TikaTextExtractorTest.java     | 40 ++++++++++----------
 2 files changed, 22 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/james-project/blob/255e6cd3/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
index 3b290a0..e38b2a8 100644
--- a/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
+++ b/mailbox/tika/src/main/java/org/apache/james/mailbox/tika/TikaTextExtractor.java
@@ -85,7 +85,7 @@ public class TikaTextExtractor implements TextExtractor {
         @Override
         public ContentAndMetadata deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException, JsonProcessingException {
             TreeNode treeNode = jsonParser.getCodec().readTree(jsonParser);
-            Preconditions.checkState(treeNode.isArray() && treeNode.size() == 1, "The response should have only one element");
+            Preconditions.checkState(treeNode.isArray() && treeNode.size() >= 1, "The response should be an array with at least one element");
             Preconditions.checkState(treeNode.get(0).isObject(), "The element should be a Json object");
             ObjectNode node = (ObjectNode) treeNode.get(0);
             return ContentAndMetadata.from(ImmutableList.copyOf(node.fields())

http://git-wip-us.apache.org/repos/asf/james-project/blob/255e6cd3/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
----------------------------------------------------------------------
diff --git a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
index a957dbe..7d70444 100644
--- a/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
+++ b/mailbox/tika/src/test/java/org/apache/james/mailbox/tika/TikaTextExtractorTest.java
@@ -28,6 +28,7 @@ import java.io.InputStream;
 import java.util.List;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.james.mailbox.extractor.ParsedContent;
 import org.apache.james.mailbox.extractor.TextExtractor;
 import org.apache.james.mailbox.tika.TikaTextExtractor.ContentAndMetadataDeserializer;
 import org.junit.Before;
@@ -111,7 +112,7 @@ public class TikaTextExtractorTest {
         InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/slides.odp");
         assertThat(inputStream).isNotNull();
         assertThat(textExtractor.extractContent(inputStream, "application/vnd.oasis.opendocument.presentation").getTextualContent())
-            .isEqualTo("James is awesome\n\nIt manages attachments so well !\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n");
+            .isEqualTo("James is awesome\n\nIt manages attachments so well !\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n");
     }
     
     @Test
@@ -153,34 +154,35 @@ public class TikaTextExtractorTest {
     }
 
     @Test
-    public void deserializerShouldThrowWhenMoreThanOneNode() throws Exception {
-        expectedException.expect(IllegalStateException.class);
-        expectedException.expectMessage("The response should have only one element");
-
-        TikaTextExtractor textExtractor = new TikaTextExtractor(new TikaHttpClient() {
-            
-            @Override
-            public InputStream recursiveMetaDataAsJson(InputStream inputStream, String contentType) throws TikaException {
-                return new ByteArrayInputStream("[{\"key1\":\"value1\"},{\"key2\":\"value2\"}]".getBytes(Charsets.UTF_8));
-            }
-        });
+    public void deserializerShouldNotThrowWhenMoreThanOneNode() throws Exception {
+        TikaTextExtractor textExtractor = new TikaTextExtractor(
+            (inputStream, contentType) -> new ByteArrayInputStream(("[{\"X-TIKA:content\": \"This is an awesome LibreOffice document !\"}, " +
+                "{\"Chroma BlackIsZero\": \"true\"}]").getBytes(Charsets.UTF_8)));
 
         InputStream inputStream = null;
         textExtractor.extractContent(inputStream, "text/plain");
     }
 
     @Test
+    public void deserializerShouldTakeFirstNodeWhenSeveral() throws Exception {
+        String expectedExtractedContent = "content A";
+        TikaTextExtractor textExtractor = new TikaTextExtractor(
+            (inputStream, contentType) -> new ByteArrayInputStream(("[{\"X-TIKA:content\": \"" + expectedExtractedContent + "\"}, " +
+                "{\"X-TIKA:content\": \"content B\"}]").getBytes(Charsets.UTF_8)));
+
+        InputStream inputStream = null;
+        ParsedContent parsedContent = textExtractor.extractContent(inputStream, "text/plain");
+
+        assertThat(parsedContent.getTextualContent()).isEqualTo(expectedExtractedContent);
+    }
+
+    @Test
     public void deserializerShouldThrowWhenNodeIsNotAnObject() throws Exception {
         expectedException.expect(IllegalStateException.class);
         expectedException.expectMessage("The element should be a Json object");
 
-        TikaTextExtractor textExtractor = new TikaTextExtractor(new TikaHttpClient() {
-            
-            @Override
-            public InputStream recursiveMetaDataAsJson(InputStream inputStream, String contentType) throws TikaException {
-                return new ByteArrayInputStream("[\"value1\"]".getBytes(Charsets.UTF_8));
-            }
-        });
+        TikaTextExtractor textExtractor = new TikaTextExtractor(
+            (inputStream, contentType) -> new ByteArrayInputStream("[\"value1\"]".getBytes(Charsets.UTF_8)));
 
         InputStream inputStream = null;
         textExtractor.extractContent(inputStream, "text/plain");


---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org


Mime
View raw message