tika-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (TIKA-2455) Flag in metadata for alternative email bodies
Date Tue, 24 Oct 2017 00:38:00 GMT

    [ https://issues.apache.org/jira/browse/TIKA-2455?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16216137#comment-16216137 ]

ASF GitHub Bot commented on TIKA-2455:
--------------------------------------

tballison closed pull request #205: TIKA-2455: flag the containing multipart type
URL: https://github.com/apache/tika/pull/205
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Message.java b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
index 0a8fb80c7..38bd7015d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Message.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
@@ -36,6 +36,10 @@
     
     String MESSAGE_BCC = "Message-Bcc";
 
+    String MULTIPART_SUBTYPE = "Multipart-Subtype";
+
+    String MULTIPART_BOUNDARY = "Multipart-Boundary";
+
     /**
      * Where possible, this records the value from the name field.
      * Even in MAPI messages, though, this can be an email address.
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 7476347d5..b3d8644d0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -56,6 +56,7 @@
 import org.apache.tika.metadata.Message;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -131,6 +132,7 @@ private static DateFormat createDateFormat(String format, TimeZone timezone, boo
     private EmbeddedDocumentExtractor extractor;
 
     private boolean inPart = false;
+    private BodyDescriptor part = null;
 
     MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context,
boolean strictParsing) {
         this.handler = xhtml;
@@ -154,6 +156,11 @@ public void body(BodyDescriptor body, InputStream is) throws MimeException,
         submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
         submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
 
+        // TIKA-2455: flag the containing type.
+        if (inPart) {
+            submd.set(Message.MULTIPART_SUBTYPE, part.getSubType());
+            submd.set(Message.MULTIPART_BOUNDARY, part.getBoundary());
+        }   
         if (body instanceof MaximalBodyDescriptor) {
             MaximalBodyDescriptor maximalBody = (MaximalBodyDescriptor) body;
             String contentDispositionType = maximalBody.getContentDispositionType();
@@ -216,6 +223,7 @@ public void endMessage() throws MimeException {
 
     public void endMultipart() throws MimeException {
         inPart = false;
+        part = null;
     }
 
     public void epilogue(InputStream is) throws MimeException, IOException {
@@ -274,6 +282,16 @@ public void field(Field field) throws MimeException {
                 processAddressList(parsedField, "Cc:", Metadata.MESSAGE_CC);
             } else if (fieldname.equalsIgnoreCase("BCC")) {
                 processAddressList(parsedField, "Bcc:", Metadata.MESSAGE_BCC);
+            } else if (fieldname.equalsIgnoreCase("Content-Type")) {
+                final MediaType contentType = MediaType.parse(parsedField.getBody());
+
+                if (contentType.getType().equalsIgnoreCase("multipart")) {
+                    metadata.set(Message.MULTIPART_SUBTYPE, contentType.getSubtype());
+                    metadata.set(Message.MULTIPART_BOUNDARY, contentType.getParameters().get("boundary"));
+                } else {
+                    metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX + parsedField.getName(),
+                            field.getBody());
+                }
             } else if (fieldname.equalsIgnoreCase("Date")) {
                 DateTimeField dateField = (DateTimeField) parsedField;
                 Date date = dateField.getDate();
@@ -373,6 +391,7 @@ public void startHeader() throws MimeException {
 
     public void startMultipart(BodyDescriptor descr) throws MimeException {
         inPart = true;
+        part = descr;
     }
 
     private String stripOutFieldPrefix(Field field, String fieldname) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 03f7f2065..5f939615b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -574,4 +574,59 @@ public void testEmbeddedMetadata() throws Exception {
         assertEquals("logo.gif", seenMetadata.get(3).get(Metadata.RESOURCE_NAME_KEY));
         assertEquals("image/gif", seenMetadata.get(3).get(Metadata.CONTENT_TYPE));
     }
+
+    @Test
+    public void testMultipartFlags() throws Exception {
+        final ContentHandler handler = new BodyContentHandler();
+        final Metadata metadata = new Metadata();
+        final Parser parser = new RFC822Parser();
+        final ParseContext context = new ParseContext();
+        final Parser autoDetectParser = new AutoDetectParser();
+
+        final List<Metadata> metadataList = new ArrayList<Metadata>();
+
+        context.set(EmbeddedDocumentExtractor.class, new EmbeddedDocumentExtractor() {
+
+            @Override
+            public boolean shouldParseEmbedded(Metadata metadata) {
+                return true;
+            }
+
+            @Override
+            public void parseEmbedded(InputStream stream, ContentHandler handler,
+                                      Metadata metadata, boolean outputHtml) throws SAXException,
+                    IOException {
+                try {
+                    autoDetectParser.parse(stream, new BodyContentHandler(), metadata, new
ParseContext());
+                } catch (TikaException e) {
+                    throw new RuntimeException(e);
+                }
+
+                metadataList.add(metadata);
+            }
+        });
+
+        try (InputStream stream = getStream("test-documents/testRFC822-multipart")) {
+            parser.parse(stream, handler, metadata, context);
+        }
+
+        // Check the root metadata.
+        assertTrue(metadata.get(Message.MULTIPART_SUBTYPE).equals("mixed"));
+        assertTrue(metadata.get(Message.MULTIPART_BOUNDARY).equals("0016e64606800312ee04913db790"));
+
+        // Check the metadata of the first alternative.
+        assertTrue(metadataList.get(0).get(Metadata.CONTENT_TYPE).equals("text/plain; charset=UTF-8"));
+        assertTrue(metadataList.get(0).get(Message.MULTIPART_SUBTYPE).equals("alternative"));
+        assertTrue(metadataList.get(0).get(Message.MULTIPART_BOUNDARY).equals("0016e64606800312ea04913db78e"));
+
+        // Check the metadata of the second alternative.
+        assertTrue(metadataList.get(1).get(Metadata.CONTENT_TYPE).equals("text/html; charset=UTF-8"));
+        assertTrue(metadataList.get(1).get(Message.MULTIPART_SUBTYPE).equals("alternative"));
+        assertTrue(metadataList.get(1).get(Message.MULTIPART_BOUNDARY).equals("0016e64606800312ea04913db78e"));
+
+        // Check the metadata of the attached GIF.
+        assertTrue(metadataList.get(2).get(Metadata.CONTENT_TYPE).equals("image/gif"));
+        assertTrue(metadataList.get(2).get(Message.MULTIPART_SUBTYPE) == null);
+        assertTrue(metadataList.get(2).get(Message.MULTIPART_BOUNDARY) == null);
+    }
 }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Flag in metadata for alternative email bodies
> ---------------------------------------------
>
>                 Key: TIKA-2455
>                 URL: https://issues.apache.org/jira/browse/TIKA-2455
>             Project: Tika
>          Issue Type: Improvement
>          Components: parser
>    Affects Versions: 1.16
>            Reporter: Matthew Caruana Galizia
>            Priority: Minor
>              Labels: attachments, multipart, rfc822, rfc822parser
>
> When multipart RFC822 emails are being parsed, there's no way to distinguish between alternative versions of the body and attachments.
> It would be ideal if some kind of flag were set in the metadata passed to the {{EmbeddedDocumentExtractor}} that indicates that the stream is an alternative.
> In GUIs that present the data extracted from the email, alternative bodies can be distinguished from attachments and presented separately.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Mime
View raw message