nutch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mar...@apache.org
Subject [nutch] branch master updated: NUTCH-2694 HostDB to aggregate by long instead of integer
Date Fri, 22 Feb 2019 13:08:36 GMT
This is an automated email from the ASF dual-hosted git repository.

markus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 33922fe  NUTCH-2694 HostDB to aggregate by long instead of integer
33922fe is described below

commit 33922feb804d740180fb4abd833884dae6d62cc0
Author: Markus Jelsma <markus@apache.org>
AuthorDate: Fri Feb 22 14:08:08 2019 +0100

    NUTCH-2694 HostDB to aggregate by long instead of integer
---
 CHANGES.txt                                        |   9 +-
 src/java/org/apache/nutch/hostdb/HostDatum.java    | 110 ++++++++++-----------
 .../org/apache/nutch/hostdb/ResolverThread.java    |   6 +-
 .../apache/nutch/hostdb/UpdateHostDbReducer.java   |  34 +++----
 4 files changed, 81 insertions(+), 78 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 96bd05a..12f5aad 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,9 +6,12 @@ Comments
 
 Breaking Changes
 
-The value of crawl.gen.delay is now read in milliseconds as stated in the description
-in nutch-default.xml. Previously, the value has been read in days, see NUTCH-1842 for
-further information.
+    -  The value of crawl.gen.delay is now read in milliseconds as stated in the description
+       in nutch-default.xml. Previously, the value has been read in days, see NUTCH-1842
for
+       further information.
+
+    -  HostDB entries have been moved from Integer to Long in order to accomodate very large
+       hosts. Remove your existing HostDB and recreate it with bin/nutch updatehostdb.
 
 
 Nutch 1.15 Release (25/07/2018)
diff --git a/src/java/org/apache/nutch/hostdb/HostDatum.java b/src/java/org/apache/nutch/hostdb/HostDatum.java
index fe3b73e..2bc9244 100644
--- a/src/java/org/apache/nutch/hostdb/HostDatum.java
+++ b/src/java/org/apache/nutch/hostdb/HostDatum.java
@@ -30,7 +30,7 @@ import org.apache.hadoop.io.Writable;
 /**
  */
 public class HostDatum implements Writable, Cloneable {
-  protected int failures = 0;
+  protected long failures = 0;
   protected float score = 0;
   protected Date lastCheck = new Date(0);
   protected String homepageUrl = new String();
@@ -38,17 +38,17 @@ public class HostDatum implements Writable, Cloneable {
   protected MapWritable metaData = new MapWritable();
 
   // Records the number of times DNS look-up failed, may indicate host no longer exists
-  protected int dnsFailures = 0;
+  protected long dnsFailures = 0;
 
   // Records the number of connection failures, may indicate our netwerk being blocked by
firewall
-  protected int connectionFailures = 0;
+  protected long connectionFailures = 0;
 
-  protected int unfetched = 0;
-  protected int fetched = 0;
-  protected int notModified = 0;
-  protected int redirTemp = 0;
-  protected int redirPerm = 0;
-  protected int gone = 0;
+  protected long unfetched = 0;
+  protected long fetched = 0;
+  protected long notModified = 0;
+  protected long redirTemp = 0;
+  protected long redirPerm = 0;
+  protected long gone = 0;
 
   public HostDatum() {
   }
@@ -68,15 +68,15 @@ public class HostDatum implements Writable, Cloneable {
   }
 
   public void resetFailures() {
-    setDnsFailures(0);
-    setConnectionFailures(0);
+    setDnsFailures(0l);
+    setConnectionFailures(0l);
   }
 
-  public void setDnsFailures(Integer dnsFailures) {
+  public void setDnsFailures(Long dnsFailures) {
     this.dnsFailures = dnsFailures;
   }
 
-  public void setConnectionFailures(Integer connectionFailures) {
+  public void setConnectionFailures(Long connectionFailures) {
     this.connectionFailures = connectionFailures;
   }
 
@@ -88,15 +88,15 @@ public class HostDatum implements Writable, Cloneable {
     this.connectionFailures++;
   }
 
-  public Integer numFailures() {
+  public Long numFailures() {
     return getDnsFailures() + getConnectionFailures();
   }
 
-  public Integer getDnsFailures() {
+  public Long getDnsFailures() {
     return dnsFailures;
   }
 
-  public Integer getConnectionFailures() {
+  public Long getConnectionFailures() {
     return connectionFailures;
   }
 
@@ -120,7 +120,7 @@ public class HostDatum implements Writable, Cloneable {
     return score;
   }
 
-  public Integer numRecords() {
+  public Long numRecords() {
     return unfetched + fetched + gone + redirPerm + redirTemp + notModified;
   }
 
@@ -140,51 +140,51 @@ public class HostDatum implements Writable, Cloneable {
     this.homepageUrl = homepageUrl;
   }
 
-  public void setUnfetched(int val) {
+  public void setUnfetched(long val) {
     unfetched = val;
   }
 
-  public int getUnfetched() {
+  public long getUnfetched() {
     return unfetched;
   }
 
-  public void setFetched(int val) {
+  public void setFetched(long val) {
     fetched = val;
   }
 
-  public int getFetched() {
+  public long getFetched() {
     return fetched;
   }
 
-  public void setNotModified(int val) {
+  public void setNotModified(long val) {
     notModified = val;
   }
 
-  public int getNotModified() {
+  public long getNotModified() {
     return notModified;
   }
 
-  public void setRedirTemp(int val) {
+  public void setRedirTemp(long val) {
     redirTemp = val;
   }
 
-  public int getRedirTemp() {
+  public long getRedirTemp() {
     return redirTemp;
   }
 
-  public void setRedirPerm(int val) {
+  public void setRedirPerm(long val) {
     redirPerm = val;
   }
 
-  public int getRedirPerm() {
+  public long getRedirPerm() {
     return redirPerm;
   }
 
-  public void setGone(int val) {
+  public void setGone(long val) {
     gone = val;
   }
 
-  public int getGone() {
+  public long getGone() {
     return gone;
   }
 
@@ -249,15 +249,15 @@ public class HostDatum implements Writable, Cloneable {
     lastCheck = new Date(in.readLong());
     homepageUrl = Text.readString(in);
 
-    dnsFailures = in.readInt();
-    connectionFailures = in.readInt();
+    dnsFailures = in.readLong();
+    connectionFailures = in.readLong();
 
-    unfetched= in.readInt();
-    fetched= in.readInt();
-    notModified= in.readInt();
-    redirTemp= in.readInt();
-    redirPerm = in.readInt();
-    gone = in.readInt();
+    unfetched= in.readLong();
+    fetched= in.readLong();
+    notModified= in.readLong();
+    redirTemp= in.readLong();
+    redirPerm = in.readLong();
+    gone = in.readLong();
 
     metaData = new org.apache.hadoop.io.MapWritable();
     metaData.readFields(in);
@@ -269,15 +269,15 @@ public class HostDatum implements Writable, Cloneable {
     out.writeLong(lastCheck.getTime());
     Text.writeString(out, homepageUrl);
 
-    out.writeInt(dnsFailures);
-    out.writeInt(connectionFailures);
+    out.writeLong(dnsFailures);
+    out.writeLong(connectionFailures);
 
-    out.writeInt(unfetched);
-    out.writeInt(fetched);
-    out.writeInt(notModified);
-    out.writeInt(redirTemp);
-    out.writeInt(redirPerm);
-    out.writeInt(gone);
+    out.writeLong(unfetched);
+    out.writeLong(fetched);
+    out.writeLong(notModified);
+    out.writeLong(redirTemp);
+    out.writeLong(redirPerm);
+    out.writeLong(gone);
 
     metaData.write(out);
   }
@@ -285,25 +285,25 @@ public class HostDatum implements Writable, Cloneable {
   @Override
   public String toString() {
     StringBuilder buf = new StringBuilder();
-    buf.append(Integer.toString(getUnfetched()));
+    buf.append(Long.toString(getUnfetched()));
     buf.append("\t");
-    buf.append(Integer.toString(getFetched()));
+    buf.append(Long.toString(getFetched()));
     buf.append("\t");
-    buf.append(Integer.toString(getGone()));
+    buf.append(Long.toString(getGone()));
     buf.append("\t");
-    buf.append(Integer.toString(getRedirTemp()));
+    buf.append(Long.toString(getRedirTemp()));
     buf.append("\t");
-    buf.append(Integer.toString(getRedirPerm()));
+    buf.append(Long.toString(getRedirPerm()));
     buf.append("\t");
-    buf.append(Integer.toString(getNotModified()));
+    buf.append(Long.toString(getNotModified()));
     buf.append("\t");
-    buf.append(Integer.toString(numRecords()));
+    buf.append(Long.toString(numRecords()));
     buf.append("\t");
-    buf.append(Integer.toString(getDnsFailures()));
+    buf.append(Long.toString(getDnsFailures()));
     buf.append("\t");
-    buf.append(Integer.toString(getConnectionFailures()));
+    buf.append(Long.toString(getConnectionFailures()));
     buf.append("\t");
-    buf.append(Integer.toString(numFailures()));
+    buf.append(Long.toString(numFailures()));
     buf.append("\t");
     buf.append(Float.toString(score));
     buf.append("\t");
diff --git a/src/java/org/apache/nutch/hostdb/ResolverThread.java b/src/java/org/apache/nutch/hostdb/ResolverThread.java
index fe66217..564e5da 100644
--- a/src/java/org/apache/nutch/hostdb/ResolverThread.java
+++ b/src/java/org/apache/nutch/hostdb/ResolverThread.java
@@ -71,7 +71,7 @@ public class ResolverThread implements Runnable {
       } else if (datum.getDnsFailures() > 0) {
         context.getCounter("UpdateHostDb", "rediscovered_host").increment(1);
         datum.setLastCheck();
-        datum.setDnsFailures(0);
+        datum.setDnsFailures(0l);
         LOG.info(host + ": rediscovered_host " + datum);
       } else {
         context.getCounter("UpdateHostDb", "existing_known_host").increment(1);
@@ -86,7 +86,7 @@ public class ResolverThread implements Runnable {
         // If the counter is empty we'll initialize with date = today and 1 failure
         if (datum.isEmpty()) {
           datum.setLastCheck();
-          datum.setDnsFailures(1);
+          datum.setDnsFailures(1l);
           context.write(hostText, datum);
           context.getCounter("UpdateHostDb", "new_unknown_host").increment(1);
           LOG.info(host + ": new_unknown_host " + datum);
@@ -108,7 +108,7 @@ public class ResolverThread implements Runnable {
         }
 
         context.getCounter("UpdateHostDb",
-          Integer.toString(datum.numFailures()) + "_times_failed").increment(1);
+          Long.toString(datum.numFailures()) + "_times_failed").increment(1);
       } catch (Exception ioe) {
         LOG.warn(StringUtils.stringifyException(ioe));
       }
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
index 70ce3eb..a0030f4d 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
@@ -28,7 +28,7 @@ import java.util.Map;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.FloatWritable;
-import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Reducer;
@@ -118,10 +118,10 @@ public class UpdateHostDbReducer
   public void reduce(Text key, Iterable<NutchWritable> values,
     Context context) throws IOException, InterruptedException {
 
-    Map<String,Map<String,Integer>> stringCounts = new HashMap<>();
+    Map<String,Map<String,Long>> stringCounts = new HashMap<>();
     Map<String,Float> maximums = new HashMap<>();
     Map<String,Float> sums = new HashMap<>(); // used to calc averages
-    Map<String,Integer> counts = new HashMap<>(); // used to calc averages
+    Map<String,Long> counts = new HashMap<>(); // used to calc averages
     Map<String,Float> minimums = new HashMap<>();
     Map<String,TDigest> tdigests = new HashMap<String,TDigest>();
     
@@ -146,27 +146,27 @@ public class UpdateHostDbReducer
         // Set the correct status field
         switch (buffer.getStatus()) {
           case CrawlDatum.STATUS_DB_UNFETCHED:
-            hostDatum.setUnfetched(hostDatum.getUnfetched() + 1);
+            hostDatum.setUnfetched(hostDatum.getUnfetched() + 1l);
             break;
 
           case CrawlDatum.STATUS_DB_FETCHED:
-            hostDatum.setFetched(hostDatum.getFetched() + 1);
+            hostDatum.setFetched(hostDatum.getFetched() + 1l);
             break;
 
           case CrawlDatum.STATUS_DB_GONE:
-            hostDatum.setGone(hostDatum.getGone() + 1);
+            hostDatum.setGone(hostDatum.getGone() + 1l);
             break;
 
           case CrawlDatum.STATUS_DB_REDIR_TEMP:
-            hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1);
+            hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1l);
             break;
 
           case CrawlDatum.STATUS_DB_REDIR_PERM:
-            hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1);
+            hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1l);
             break;
 
           case CrawlDatum.STATUS_DB_NOTMODIFIED:
-            hostDatum.setNotModified(hostDatum.getNotModified() + 1);
+            hostDatum.setNotModified(hostDatum.getNotModified() + 1l);
             break;
         }
         
@@ -193,10 +193,10 @@ public class UpdateHostDbReducer
                 // Does the value exist?
                 if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) {
                   // Yes, increment it
-                  stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue)
+ 1);
+                  stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue)
+ 1l);
                 } else {
                   // Create it!
-                  stringCounts.get(stringFields[i]).put(metadataValue, 1);
+                  stringCounts.get(stringFields[i]).put(metadataValue, 1l);
                 }
               }
             }
@@ -247,11 +247,11 @@ public class UpdateHostDbReducer
                   if (sums.containsKey(numericFields[i])) {
                     // Increment
                     sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue);
-                    counts.put(numericFields[i], counts.get(numericFields[i]) + 1);
+                    counts.put(numericFields[i], counts.get(numericFields[i]) + 1l);
                   } else {
                     // Create it!
                     sums.put(numericFields[i], metadataValue);
-                    counts.put(numericFields[i], 1);
+                    counts.put(numericFields[i], 1l);
                   }
                 } catch (Exception e) {
                   LOG.error(e.getMessage() + " when processing values for " + key.toString());
@@ -312,9 +312,9 @@ public class UpdateHostDbReducer
     }
     
     // Set metadata
-    for (Map.Entry<String, Map<String,Integer>> entry : stringCounts.entrySet())
{
-      for (Map.Entry<String,Integer> subEntry : entry.getValue().entrySet()) {
-        hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new
IntWritable(subEntry.getValue()));
+    for (Map.Entry<String, Map<String,Long>> entry : stringCounts.entrySet())
{
+      for (Map.Entry<String,Long> subEntry : entry.getValue().entrySet()) {
+        hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new
LongWritable(subEntry.getValue()));
       }
     }
     for (Map.Entry<String, Float> entry : maximums.entrySet()) {
@@ -326,7 +326,7 @@ public class UpdateHostDbReducer
     for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) {
       // Emit all percentiles
       for (int i = 0; i < percentiles.length; i++) {
-        hostDatum.getMetaData().put(new Text("pct" + Integer.toString(percentiles[i]) + "."
+ entry.getKey()), new FloatWritable((float)entry.getValue().quantile(0.5)));
+        hostDatum.getMetaData().put(new Text("pct" + Long.toString(percentiles[i]) + "."
+ entry.getKey()), new FloatWritable((float)entry.getValue().quantile(0.5)));
       }
     }      
     for (Map.Entry<String, Float> entry : minimums.entrySet()) {


Mime
View raw message