james-server-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From btell...@apache.org
Subject [4/6] james-project git commit: JAMES-2018 Jsoup text extractor should well format lists
Date Thu, 01 Jun 2017 09:05:02 GMT
JAMES-2018 Jsoup text extractor should well format lists


Project: http://git-wip-us.apache.org/repos/asf/james-project/repo
Commit: http://git-wip-us.apache.org/repos/asf/james-project/commit/0172d4b8
Tree: http://git-wip-us.apache.org/repos/asf/james-project/tree/0172d4b8
Diff: http://git-wip-us.apache.org/repos/asf/james-project/diff/0172d4b8

Branch: refs/heads/master
Commit: 0172d4b8634aba7e8e6e322b7aae0cdc2e1fe155
Parents: 84d7a31
Author: benwa <btellier@linagora.com>
Authored: Thu May 25 10:00:18 2017 +0700
Committer: benwa <btellier@linagora.com>
Committed: Thu Jun 1 16:03:20 2017 +0700

----------------------------------------------------------------------
 .../jmap/utils/JsoupHtmlTextExtractor.java      | 46 +++++++++++++---
 .../jmap/utils/JsoupHtmlTextExtractorTest.java  | 58 ++++++++++++++++++++
 2 files changed, 97 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/james-project/blob/0172d4b8/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java
----------------------------------------------------------------------
diff --git a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java
index 912a617..a3ed036 100644
--- a/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java
+++ b/server/protocols/jmap/src/main/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractor.java
@@ -33,6 +33,10 @@ import org.slf4j.LoggerFactory;
 public class JsoupHtmlTextExtractor implements HtmlTextExtractor {
 
     private static final Logger LOGGER = LoggerFactory.getLogger(JsoupHtmlTextExtractor.class);
+    public static final String BR_TAG = "br";
+    public static final String UL_TAG = "ul";
+    public static final String LI_TAG = "li";
+    public static final String P_TAG = "p";
 
     @Override
     public String toPlainText(String html) {
@@ -57,10 +61,16 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor {
         }
         if (node instanceof Element) {
             Element element = (Element) node;
-            if (element.tagName().equals("br")) {
+            if (element.tagName().equals(BR_TAG)) {
                 return "\n";
             }
-            if (element.tagName().equals("p")) {
+            if (element.tagName().equals(UL_TAG)) {
+                return "\n\n";
+            }
+            if (element.tagName().equals(LI_TAG)) {
+                return "\n - ";
+            }
+            if (element.tagName().equals(P_TAG)) {
                 return "\n\n";
             }
         }
@@ -68,11 +78,33 @@ public class JsoupHtmlTextExtractor implements HtmlTextExtractor {
     }
 
     Stream<Node> flatten(Node base) {
-        return Stream.concat(
-            base.childNodes()
-                .stream()
-                .flatMap(this::flatten),
-            Stream.of(base));
+        Position position = getPosition(base);
+        Stream<Node> flatChildren = base.childNodes()
+            .stream()
+            .flatMap(this::flatten);
+        switch (position) {
+            case PREFIX:
+                return Stream.concat(Stream.of(base), flatChildren);
+            case SUFFIX:
+                return Stream.concat(flatChildren, Stream.of(base));
+            default:
+                throw new RuntimeException("Unexpected POSITION for node element: " + position);
+        }
+    }
+
+    private enum Position {
+        PREFIX,
+        SUFFIX
+    }
+
+    private Position getPosition(Node node) {
+        if (node instanceof Element) {
+            Element element = (Element) node;
+            if (element.tagName().equals(LI_TAG)) {
+                return Position.PREFIX;
+            }
+        }
+        return Position.SUFFIX;
     }
 
 }

http://git-wip-us.apache.org/repos/asf/james-project/blob/0172d4b8/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java
----------------------------------------------------------------------
diff --git a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java
index 28e9d1d..4a413de 100644
--- a/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java
+++ b/server/protocols/jmap/src/test/java/org/apache/james/jmap/utils/JsoupHtmlTextExtractorTest.java
@@ -71,6 +71,64 @@ public class JsoupHtmlTextExtractorTest {
     }
 
     @Test
+    public void toPlainTextShouldHandleListsWell() {
+        String html = "<ul>Here is my awesome list:" +
+            "  <li>JMAP</li>" +
+            "  <li>IMAP</li>" +
+            "</ul>" +
+            "<p>Followed with some text</p>" +
+            "<p>And some other text</p>";
+        String expectedPlainText = "Here is my awesome list:  \n" +
+            " - JMAP  \n" +
+            " - IMAP\n" +
+            "\n" +
+            "Followed with some text\n" +
+            "\n" +
+            "And some other text\n" +
+            "\n";
+        assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText);
+    }
+
+    @Test
+    public void tableShouldBeWellHandled() {
+        String html = " <table style=\"width:100%\">\n" +
+            "  <tr>\n" +
+            "    <th>Firstname</th>\n" +
+            "    <th>Lastname</th>\n" +
+            "    <th>Age</th>\n" +
+            "  </tr>\n" +
+            "  <tr>\n" +
+            "    <td>Jill</td>\n" +
+            "    <td>Smith</td>\n" +
+            "    <td>50</td>\n" +
+            "  </tr>\n" +
+            "  <tr>\n" +
+            "    <td>Eve</td>\n" +
+            "    <td>Jackson</td>\n" +
+            "    <td>94</td>\n" +
+            "  </tr>\n" +
+            "</table> ";
+        String expectedPlainText = "\n" +
+            "  \n" +
+            "    Firstname\n" +
+            "    Lastname\n" +
+            "    Age\n" +
+            "  \n" +
+            "  \n" +
+            "    Jill\n" +
+            "    Smith\n" +
+            "    50\n" +
+            "  \n" +
+            "  \n" +
+            "    Eve\n" +
+            "    Jackson\n" +
+            "    94\n" +
+            "  \n" +
+            " ";
+        assertThat(textExtractor.toPlainText(html)).isEqualTo(expectedPlainText);
+    }
+
+    @Test
     public void nonClosedHtmlShouldBeTranslated() {
         String html = "This is an <b>HTML text !";
         String expectedPlainText = "This is an HTML text !";


---------------------------------------------------------------------
To unsubscribe, e-mail: server-dev-unsubscribe@james.apache.org
For additional commands, e-mail: server-dev-help@james.apache.org


Mime
View raw message