nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (NUTCH-2611) Add line-breaks when parsing HTML block-level elements
Date Thu, 28 Jun 2018 09:09:00 GMT

    [ https://issues.apache.org/jira/browse/NUTCH-2611?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16526140#comment-16526140
] 

ASF GitHub Bot commented on NUTCH-2611:
---------------------------------------

sebastian-nagel closed pull request #354: NUTCH-2611: Add line-breaks when parsing HTML block-level
elements
URL: https://github.com/apache/nutch/pull/354
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index cb2d2df50..ccce56b20 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1388,6 +1388,16 @@
 		be set in the metadata with this name into the outlink</description>
 </property>
 
+<property>
+  <name>parser.html.line.separators</name>
+  <value>article,aside,blockquote,canvas,dd,div,dl,dt,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hr,li,main,nav,noscript,ol,output,p,pre,section,table,tfoot,ul,video</value>
+ <description>Comma separated list of HTML tags. Newline will be added to the
+  parsed text after these tags.
+  The default list above contains the block-level HTML elements.
+  Tags must be in lower case.
+  To disable this feature, leave the list empty.</description>
+</property>
+
 <property>
   <name>htmlparsefilter.order</name>
   <value></value>
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 731003c88..95a419add 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -22,6 +22,8 @@
 import java.util.Collection;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
 
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.util.NodeWalker;
@@ -44,6 +46,7 @@
   
   private String srcTagMetaName;
   private boolean keepNodenames;
+  private Set<String> blockNodes;
 
   public static class LinkParams {
     public String elName;
@@ -99,6 +102,7 @@ public void setConf(Configuration conf) {
     srcTagMetaName = this.conf
         .get("parser.html.outlinks.htmlnode_metadata_name");
     keepNodenames = (srcTagMetaName != null && srcTagMetaName.length() > 0);
+    blockNodes = new HashSet<>(conf.getTrimmedStringCollection("parser.html.line.separators"));
   }
 
   /**
@@ -145,6 +149,13 @@ private boolean getTextHelper(StringBuffer sb, Node node,
       Node currentNode = walker.nextNode();
       String nodeName = currentNode.getNodeName();
       short nodeType = currentNode.getNodeType();
+      Node previousSibling = currentNode.getPreviousSibling();
+      if (previousSibling != null
+          && blockNodes.contains(previousSibling.getNodeName().toLowerCase())) {
+        appendParagraphSeparator(sb);
+      } else if (blockNodes.contains(nodeName.toLowerCase())) {
+        appendParagraphSeparator(sb);
+      }
 
       if ("script".equalsIgnoreCase(nodeName)) {
         walker.skipChildren();
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
index d40958912..34da6a014 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -24,6 +24,7 @@
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.MapWritable;
@@ -47,6 +48,7 @@
 
   private String srcTagMetaName;
   private boolean keepNodenames;
+  private Set<String> blockNodes;
 
   private static class LinkParams {
     private String elName;
@@ -104,6 +106,7 @@ public void setConf(Configuration conf) {
     srcTagMetaName = this.conf
         .get("parser.html.outlinks.htmlnode_metadata_name");
     keepNodenames = (srcTagMetaName != null && srcTagMetaName.length() > 0);
+    blockNodes = new HashSet<>(conf.getTrimmedStringCollection("parser.html.line.separators"));
   }
 
   /**
@@ -150,6 +153,13 @@ private boolean getTextHelper(StringBuffer sb, Node node,
       Node currentNode = walker.nextNode();
       String nodeName = currentNode.getNodeName();
       short nodeType = currentNode.getNodeType();
+      Node previousSibling = currentNode.getPreviousSibling();
+      if (previousSibling != null
+          && blockNodes.contains(previousSibling.getNodeName().toLowerCase())) {
+        appendParagraphSeparator(sb);
+      } else if (blockNodes.contains(nodeName.toLowerCase())) {
+        appendParagraphSeparator(sb);
+      }
 
       if ("script".equalsIgnoreCase(nodeName)) {
         walker.skipChildren();


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Add line-breaks when parsing HTML block-level elements
> ------------------------------------------------------
>
>                 Key: NUTCH-2611
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2611
>             Project: Nutch
>          Issue Type: Improvement
>          Components: parser
>    Affects Versions: 1.14
>            Reporter: Yossi Tamari
>            Priority: Major
>             Fix For: 1.15
>
>
> Currently, the HTML and Tika parsers only add newlines following text nodes that contain
only whitespace (e.g. </span> <span>), but not based on what the tags are, so
for example a </div><div> will not add a new line.
> While some applications do not differentiate between a space and a new line, many others
see the semantic difference (two adjacent words in the same sentence are "near", but in separate
sentences they are not).
> I believe adding newlines after block-level HTML elements, while not a panacea, will
be an improvement on the current behavior.
> NUTCH-2318 is related to this.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Mime
View raw message