james-server-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From btell...@apache.org
Subject svn commit: r1688145 - in /james/mailbox/trunk/elasticsearch: ./ src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/ src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/ src/test/resources/documents/
Date Mon, 29 Jun 2015 08:45:02 GMT
Author: btellier
Date: Mon Jun 29 08:45:01 2015
New Revision: 1688145

URL: http://svn.apache.org/r1688145
Log:
MAILBOX-245 Adding text extractors

Added:
    james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/
    james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/DefaultTextExtractor.java
    james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/ParsedContent.java
    james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/TextExtractor.java
    james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/TikaTextExtractor.java
    james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/
    james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/DefaultTextExtractorTest.java
    james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/TikaTextExtractorTest.java
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/PDF.pdf
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/Text.txt
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/calc.ods
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/calc.xlsx
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/fake.txt
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/slides.odp
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/slides.pptx
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/writter.docx
    james/mailbox/trunk/elasticsearch/src/test/resources/documents/writter.odt
Modified:
    james/mailbox/trunk/elasticsearch/pom.xml

Modified: james/mailbox/trunk/elasticsearch/pom.xml
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/pom.xml?rev=1688145&r1=1688144&r2=1688145&view=diff
==============================================================================
--- james/mailbox/trunk/elasticsearch/pom.xml (original)
+++ james/mailbox/trunk/elasticsearch/pom.xml Mon Jun 29 08:45:01 2015
@@ -103,6 +103,16 @@
             <scope>test</scope>
         </dependency>
         <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>1.7</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers</artifactId>
+            <version>1.7</version>
+        </dependency>
+        <dependency>
             <groupId>org.assertj</groupId>
             <artifactId>assertj-core</artifactId>
             <version>3.0.0</version>

Added: james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/DefaultTextExtractor.java
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/DefaultTextExtractor.java?rev=1688145&view=auto
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/DefaultTextExtractor.java
(added)
+++ james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/DefaultTextExtractor.java
Mon Jun 29 08:45:01 2015
@@ -0,0 +1,44 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one   *
+ * or more contributor license agreements.  See the NOTICE file *
+ * distributed with this work for additional information        *
+ * regarding copyright ownership.  The ASF licenses this file   *
+ * to you under the Apache License, Version 2.0 (the            *
+ * "License"); you may not use this file except in compliance   *
+ * with the License.  You may obtain a copy of the License at   *
+ *                                                              *
+ *   http://www.apache.org/licenses/LICENSE-2.0                 *
+ *                                                              *
+ * Unless required by applicable law or agreed to in writing,   *
+ * software distributed under the License is distributed on an  *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
+ * KIND, either express or implied.  See the License for the    *
+ * specific language governing permissions and limitations      *
+ * under the License.                                           *
+ ****************************************************************/
+
+package org.apache.james.mailbox.elasticsearch.json.extractor;
+
+import java.io.InputStream;
+import java.util.Optional;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ImmutableMultimap;
+import org.apache.commons.io.IOUtils;
+
+/**
+ * A default text extractor that is directly based on the input file provided.
+ * 
+ * Costs less calculations that TikaTextExtractor, but result is not that good.
+ */
+public class DefaultTextExtractor implements TextExtractor {
+
+    @Override
+    public ParsedContent extractContent(InputStream inputStream, Optional<String> contentType,
Optional<String> fileName) throws Exception {
+        if(contentType.isPresent() && contentType.get().startsWith("text/") ) {
+            return new ParsedContent(Optional.of(IOUtils.toString(inputStream)), ImmutableMultimap.copyOf(ArrayListMultimap.create()));
+        } else {
+            return new ParsedContent(Optional.empty(), ImmutableMultimap.of());
+        }
+    }
+}

Added: james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/ParsedContent.java
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/ParsedContent.java?rev=1688145&view=auto
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/ParsedContent.java
(added)
+++ james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/ParsedContent.java
Mon Jun 29 08:45:01 2015
@@ -0,0 +1,58 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one   *
+ * or more contributor license agreements.  See the NOTICE file *
+ * distributed with this work for additional information        *
+ * regarding copyright ownership.  The ASF licenses this file   *
+ * to you under the Apache License, Version 2.0 (the            *
+ * "License"); you may not use this file except in compliance   *
+ * with the License.  You may obtain a copy of the License at   *
+ *                                                              *
+ *   http://www.apache.org/licenses/LICENSE-2.0                 *
+ *                                                              *
+ * Unless required by applicable law or agreed to in writing,   *
+ * software distributed under the License is distributed on an  *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
+ * KIND, either express or implied.  See the License for the    *
+ * specific language governing permissions and limitations      *
+ * under the License.                                           *
+ ****************************************************************/
+
+package org.apache.james.mailbox.elasticsearch.json.extractor;
+
+import java.util.Objects;
+import java.util.Optional;
+
+import com.google.common.collect.ImmutableMultimap;
+import com.google.common.collect.Multimap;
+
+public class ParsedContent {
+
+    private final Optional<String> textualContent;
+    private final ImmutableMultimap<String, String> metadata;
+
+    public ParsedContent(Optional<String> textualContent, Multimap<String, String>
metadata) {
+        this.textualContent = textualContent;
+        this.metadata = ImmutableMultimap.copyOf(metadata);
+    }
+
+    public Optional<String> getTextualContent() {
+        return textualContent;
+    }
+
+    public  Multimap<String, String> getMetadata() {
+        return metadata;
+    }
+
+    @Override public boolean equals(Object o) {
+        if (o instanceof ParsedContent) {
+            ParsedContent other = (ParsedContent) o;
+            return Objects.equals(textualContent, other.textualContent)
+                && Objects.equals(metadata, other.metadata);
+        }
+        return false;
+    }
+
+    @Override public int hashCode() {
+        return Objects.hash(textualContent, metadata);
+    }
+}

Added: james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/TextExtractor.java
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/TextExtractor.java?rev=1688145&view=auto
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/TextExtractor.java
(added)
+++ james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/TextExtractor.java
Mon Jun 29 08:45:01 2015
@@ -0,0 +1,29 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one   *
+ * or more contributor license agreements.  See the NOTICE file *
+ * distributed with this work for additional information        *
+ * regarding copyright ownership.  The ASF licenses this file   *
+ * to you under the Apache License, Version 2.0 (the            *
+ * "License"); you may not use this file except in compliance   *
+ * with the License.  You may obtain a copy of the License at   *
+ *                                                              *
+ *   http://www.apache.org/licenses/LICENSE-2.0                 *
+ *                                                              *
+ * Unless required by applicable law or agreed to in writing,   *
+ * software distributed under the License is distributed on an  *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
+ * KIND, either express or implied.  See the License for the    *
+ * specific language governing permissions and limitations      *
+ * under the License.                                           *
+ ****************************************************************/
+
+package org.apache.james.mailbox.elasticsearch.json.extractor;
+
+import java.io.InputStream;
+import java.util.Optional;
+
+public interface TextExtractor {
+
+    ParsedContent extractContent(InputStream inputStream, Optional<String> contentType,
Optional<String> fileName) throws Exception;
+
+}

Added: james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/TikaTextExtractor.java
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/TikaTextExtractor.java?rev=1688145&view=auto
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/TikaTextExtractor.java
(added)
+++ james/mailbox/trunk/elasticsearch/src/main/java/org/apache/james/mailbox/elasticsearch/json/extractor/TikaTextExtractor.java
Mon Jun 29 08:45:01 2015
@@ -0,0 +1,88 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one   *
+ * or more contributor license agreements.  See the NOTICE file *
+ * distributed with this work for additional information        *
+ * regarding copyright ownership.  The ASF licenses this file   *
+ * to you under the Apache License, Version 2.0 (the            *
+ * "License"); you may not use this file except in compliance   *
+ * with the License.  You may obtain a copy of the License at   *
+ *                                                              *
+ *   http://www.apache.org/licenses/LICENSE-2.0                 *
+ *                                                              *
+ * Unless required by applicable law or agreed to in writing,   *
+ * software distributed under the License is distributed on an  *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
+ * KIND, either express or implied.  See the License for the    *
+ * specific language governing permissions and limitations      *
+ * under the License.                                           *
+ ****************************************************************/
+
+package org.apache.james.mailbox.elasticsearch.json.extractor;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Optional;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Multimap;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+
+public class TikaTextExtractor implements TextExtractor {
+
+    private static class MetadataEntry {
+
+        private final String name;
+        private final ImmutableList<String> entries;
+
+        public MetadataEntry(String name, List<String> entries) {
+            this.name = name;
+            this.entries = ImmutableList.copyOf(entries);
+        }
+
+        public String getName() {
+            return name;
+        }
+
+        public List<String> getEntries() {
+            return entries;
+        }
+    }
+
+    private final Parser parser;
+    
+    public TikaTextExtractor() {
+        parser = new AutoDetectParser();
+    }
+
+    public ParsedContent extractContent(InputStream inputStream, Optional<String> contentType,
Optional<String> fileName) throws Exception {
+        Metadata metadata = new Metadata();
+        fileName.ifPresent(x -> metadata.set(Metadata.RESOURCE_NAME_KEY, x));
+        contentType.ifPresent(x -> metadata.set(Metadata.CONTENT_TYPE, x));
+
+        StringWriter stringWriter = new StringWriter();
+        BodyContentHandler bodyContentHandler = new BodyContentHandler(stringWriter);
+        parser.parse(inputStream, bodyContentHandler, metadata, new ParseContext());
+
+        return new ParsedContent(Optional.of(stringWriter.toString()), convertMetadataToMultimap(metadata));
+    }
+
+    private Multimap<String, String> convertMetadataToMultimap(Metadata metadata) {
+        return Arrays.stream(metadata.names())
+            .map(name -> new MetadataEntry(name, Arrays.asList(metadata.getValues(name))))
+            .reduce(ArrayListMultimap.create(), (metadataMultiMap, metadataEntry) -> {
+                    metadataMultiMap.putAll(metadataEntry.getName(), metadataEntry.getEntries());
+                    return metadataMultiMap;
+                }, (metadataMultimap1, metadataMultimap2) -> {
+                    metadataMultimap1.putAll(metadataMultimap2);
+                    return metadataMultimap1;
+                });
+    }
+
+}

Added: james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/DefaultTextExtractorTest.java
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/DefaultTextExtractorTest.java?rev=1688145&view=auto
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/DefaultTextExtractorTest.java
(added)
+++ james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/DefaultTextExtractorTest.java
Mon Jun 29 08:45:01 2015
@@ -0,0 +1,58 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one   *
+ * or more contributor license agreements.  See the NOTICE file *
+ * distributed with this work for additional information        *
+ * regarding copyright ownership.  The ASF licenses this file   *
+ * to you under the Apache License, Version 2.0 (the            *
+ * "License"); you may not use this file except in compliance   *
+ * with the License.  You may obtain a copy of the License at   *
+ *                                                              *
+ *   http://www.apache.org/licenses/LICENSE-2.0                 *
+ *                                                              *
+ * Unless required by applicable law or agreed to in writing,   *
+ * software distributed under the License is distributed on an  *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
+ * KIND, either express or implied.  See the License for the    *
+ * specific language governing permissions and limitations      *
+ * under the License.                                           *
+ ****************************************************************/
+
+package org.apache.james.mailbox.elasticsearch.json.extractor;
+
+import java.io.InputStream;
+import java.util.Optional;
+
+import org.junit.Before;
+import org.junit.Test;
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Checks that {@link DefaultTextExtractor} returns the raw text of textual documents and no text
+ * for other content types.
+ */
+public class DefaultTextExtractorTest {
+    private TextExtractor textExtractor;
+
+    @Before
+    public void setUp() {
+        textExtractor = new DefaultTextExtractor();
+    }
+
+    @Test
+    public void textTest() throws Exception {
+        // try-with-resources so the classpath stream is released even when an assertion fails
+        try (InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/Text.txt")) {
+            assertThat(inputStream).isNotNull();
+            assertThat(textExtractor.extractContent(inputStream, Optional.of("text/plain"), Optional.of("Text.txt"))
+                .getTextualContent()
+                .get())
+                .isEqualTo("This is some awesome text text.\n\n");
+        }
+    }
+
+    @Test
+    public void textMicrosoftWorldTest() throws Exception {
+        try (InputStream inputStream = ClassLoader.getSystemResourceAsStream("documents/writter.docx")) {
+            assertThat(inputStream).isNotNull();
+            assertThat(textExtractor.extractContent(
+                inputStream,
+                Optional.of("application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
+                Optional.of("writter.docx"))
+                .getTextualContent())
+                .isEmpty();
+        }
+    }
+}

Added: james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/TikaTextExtractorTest.java
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/TikaTextExtractorTest.java?rev=1688145&view=auto
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/TikaTextExtractorTest.java
(added)
+++ james/mailbox/trunk/elasticsearch/src/test/java/org/apache/james/mailbox/elasticsearch/json/extractor/TikaTextExtractorTest.java
Mon Jun 29 08:45:01 2015
@@ -0,0 +1,179 @@
+/****************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one   *
+ * or more contributor license agreements.  See the NOTICE file *
+ * distributed with this work for additional information        *
+ * regarding copyright ownership.  The ASF licenses this file   *
+ * to you under the Apache License, Version 2.0 (the            *
+ * "License"); you may not use this file except in compliance   *
+ * with the License.  You may obtain a copy of the License at   *
+ *                                                              *
+ *   http://www.apache.org/licenses/LICENSE-2.0                 *
+ *                                                              *
+ * Unless required by applicable law or agreed to in writing,   *
+ * software distributed under the License is distributed on an  *
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY       *
+ * KIND, either express or implied.  See the License for the    *
+ * specific language governing permissions and limitations      *
+ * under the License.                                           *
+ ****************************************************************/
+
+package org.apache.james.mailbox.elasticsearch.json.extractor;
+
+import java.io.InputStream;
+import java.util.Optional;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Checks the text that {@link TikaTextExtractor} produces for the sample documents stored under
+ * the {@code documents/} test resources.
+ */
+public class TikaTextExtractorTest {
+
+    private TextExtractor textExtractor;
+
+    @Before
+    public void setUp() {
+        textExtractor = new TikaTextExtractor();
+    }
+
+    /**
+     * Loads a classpath resource, runs the extractor on it and returns the extracted text.
+     *
+     * The stream is opened in a try-with-resources block so it is closed even when the extraction
+     * or an assertion fails.
+     *
+     * @param resourceName classpath location of the document
+     * @param contentType content type declared to the extractor
+     * @param fileName file name declared to the extractor
+     * @return the extracted text; the test fails when the resource is missing or no text is returned
+     */
+    private String extractText(String resourceName, String contentType, String fileName) throws Exception {
+        try (InputStream inputStream = ClassLoader.getSystemResourceAsStream(resourceName)) {
+            assertThat(inputStream).isNotNull();
+            return textExtractor.extractContent(inputStream, Optional.of(contentType), Optional.of(fileName))
+                .getTextualContent()
+                .orElseThrow(() -> new AssertionError("No textual content extracted from " + resourceName));
+        }
+    }
+
+    @Test
+    public void textTest() throws Exception {
+        assertThat(extractText("documents/Text.txt", "text/plain", "Text.txt"))
+            .isEqualTo("This is some awesome text text.\n\n\n");
+    }
+
+    @Test
+    public void textMicrosoftWorldTest() throws Exception {
+        assertThat(
+            extractText(
+                "documents/writter.docx",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                "writter.docx"))
+            .isEqualTo("This is an awesome document on libroffice writter !\n");
+    }
+
+    @Test
+    public void textOdtTest() throws Exception {
+        assertThat(
+            extractText(
+                "documents/writter.odt",
+                "application/vnd.oasis.opendocument.text",
+                "writter.odt"))
+            .isEqualTo("This is an awesome document on libroffice writter !\n");
+    }
+
+    @Test
+    public void documentWithBadDeclaredMetadataShouldBeWellHandled() throws Exception {
+        // The resource is named fake.txt but is declared to the extractor as an ODT document.
+        assertThat(
+            extractText(
+                "documents/fake.txt",
+                "application/vnd.oasis.opendocument.text",
+                "writter.odt"))
+            .isEqualTo("This is an awesome document on libroffice writter !\n");
+    }
+
+    @Test
+    public void slidePowerPointTest() throws Exception {
+        assertThat(
+            extractText(
+                "documents/slides.pptx",
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                "slides.pptx"))
+            .isEqualTo("James is awesome\nIt manages attachments so well !\n");
+    }
+
+    @Test
+    public void slideOdpTest() throws Exception {
+        assertThat(
+            extractText(
+                "documents/slides.odp",
+                "application/vnd.oasis.opendocument.presentation",
+                "slides.odp"))
+            .isEqualTo("James is awesome\n\nIt manages attachments so well !\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n");
+    }
+
+    @Test
+    public void pdfTest() throws Exception {
+        assertThat(extractText("documents/PDF.pdf", "application/pdf", "PDF.pdf"))
+            .isEqualTo("\nThis is an awesome document on libroffice writter !\n\n\n");
+    }
+
+    @Test
+    public void odsTest() throws Exception {
+        assertThat(
+            extractText(
+                "documents/calc.ods",
+                "application/vnd.oasis.opendocument.spreadsheet",
+                "calc.ods"))
+            .isEqualTo("\tThis is an aesome LibreOffice document !\n" +
+                "\n" +
+                "\n" +
+                "???\n" +
+                "Page \n" +
+                "??? (???)\n" +
+                "00/00/0000, 00:00:00\n" +
+                "Page  / \n");
+    }
+
+    @Test
+    public void excelTest() throws Exception {
+        assertThat(
+            extractText(
+                "documents/calc.xlsx",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                "calc.xlsx"))
+            .isEqualTo("Feuille1\n" +
+                "\tThis is an aesome LibreOffice document !\n" +
+                "\n" +
+                "&A\t\n" +
+                "\n" +
+                "Page &P\t\n" +
+                "\n" +
+                "\n");
+    }
+
+}

Added: james/mailbox/trunk/elasticsearch/src/test/resources/documents/PDF.pdf
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/PDF.pdf?rev=1688145&view=auto
==============================================================================
    (empty)

Added: james/mailbox/trunk/elasticsearch/src/test/resources/documents/Text.txt
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/Text.txt?rev=1688145&view=auto
==============================================================================
--- james/mailbox/trunk/elasticsearch/src/test/resources/documents/Text.txt (added)
+++ james/mailbox/trunk/elasticsearch/src/test/resources/documents/Text.txt Mon Jun 29 08:45:01
2015
@@ -0,0 +1,2 @@
+This is some awesome text text.
+

Added: james/mailbox/trunk/elasticsearch/src/test/resources/documents/calc.ods
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/calc.ods?rev=1688145&view=auto
==============================================================================
    (empty)

Added: james/mailbox/trunk/elasticsearch/src/test/resources/documents/calc.xlsx
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/calc.xlsx?rev=1688145&view=auto
==============================================================================
    (empty)

Added: james/mailbox/trunk/elasticsearch/src/test/resources/documents/fake.txt
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/fake.txt?rev=1688145&view=auto
==============================================================================
    (empty)

Added: james/mailbox/trunk/elasticsearch/src/test/resources/documents/slides.odp
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/slides.odp?rev=1688145&view=auto
==============================================================================
    (empty)

Added: james/mailbox/trunk/elasticsearch/src/test/resources/documents/slides.pptx
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/slides.pptx?rev=1688145&view=auto
==============================================================================
    (empty)

Added: james/mailbox/trunk/elasticsearch/src/test/resources/documents/writter.docx
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/writter.docx?rev=1688145&view=auto
==============================================================================
    (empty)

Added: james/mailbox/trunk/elasticsearch/src/test/resources/documents/writter.odt
URL: http://svn.apache.org/viewvc/james/mailbox/trunk/elasticsearch/src/test/resources/documents/writter.odt?rev=1688145&view=auto
==============================================================================
    (empty)



---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org


Mime
View raw message