nutch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mar...@apache.org
Subject [nutch] 01/03: NUTCH-2692 Subcollection to support case-insensitive white and black lists
Date Fri, 22 Feb 2019 15:49:06 GMT
This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git

commit 89c41e1b5a245322b27e8dd0728b543faa171e9d
Author: Markus Jelsma <markus@apache.org>
AuthorDate: Fri Feb 22 16:44:25 2019 +0100

    NUTCH-2692 Subcollection to support case-insensitive white and black lists
---
 conf/nutch-default.xml                                      |  8 ++++++++
 .../src/java/org/apache/nutch/collection/Subcollection.java | 13 ++++++++++++-
 .../indexer/subcollection/SubcollectionIndexingFilter.java  |  6 ++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index a42e6a9..69fbb7d 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2407,6 +2407,14 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
   </description>
 </property>
 
+<property>
+  <name>subcollection.case.insensitive</name>
+  <value>false</value>
+  <description>
+  Whether the URL prefixes are to be treated case-insensitively.
+  </description>
+</property>
+
 <!-- Headings plugin properties -->
 
 <property>
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
index 13064eb..8478390 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
@@ -69,6 +69,11 @@ public class Subcollection extends Configured implements URLFilter {
    * SubCollection blacklist as String
    */
   String blString;
+  
+  /**
+   * Whether the white and black lists are case insensitive
+   */
+  boolean caseInsensitive = false;
 
   /**
    * public Constructor
@@ -95,10 +100,12 @@ public class Subcollection extends Configured implements URLFilter {
     this.id = id;
     this.key = key;
     this.name = name;
+    caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false);
   }
 
   public Subcollection(Configuration conf) {
     super(conf);
+    caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false);
   }
 
   /**
@@ -231,7 +238,11 @@ public class Subcollection extends Configured implements URLFilter {
 
     while (st.hasMoreElements()) {
       String line = (String) st.nextElement();
-      list.add(line.trim());
+      line = line.trim();
+      if (caseInsensitive) {
+        line = line.toLowerCase();
+      }
+      list.add(line);
     }
   }
 
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
index 898d314..767d54d 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
@@ -36,6 +36,7 @@ public class SubcollectionIndexingFilter extends Configured implements
     IndexingFilter {
 
   private Configuration conf;
+  private boolean caseInsensitive = false;
 
   public SubcollectionIndexingFilter() {
     super(NutchConfiguration.create());
@@ -52,7 +53,9 @@ public class SubcollectionIndexingFilter extends Configured implements
     this.conf = conf;
     fieldName = conf.get("subcollection.default.fieldname", "subcollection");
     metadataSource = conf.get("subcollection.metadata.source", "subcollection");
+    caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false);
   }
+  
 
   /**
    * @return Configuration
@@ -102,6 +105,9 @@ public class SubcollectionIndexingFilter extends Configured implements
     }
     
     String sUrl = url.toString();
+    if (caseInsensitive) {
+      sUrl = sUrl.toLowerCase();
+    }
     addSubCollectionField(doc, sUrl);
     return doc;
   }


Mime
View raw message