nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (NUTCH-2565) MergeDB incorrectly handles unfetched CrawlDatums
Date Thu, 21 Jun 2018 14:44:00 GMT

    [ https://issues.apache.org/jira/browse/NUTCH-2565?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16519443#comment-16519443 ]

ASF GitHub Bot commented on NUTCH-2565:
---------------------------------------

sebastian-nagel closed pull request #311: - fix for NUTCH-2565 contributed by Jurian Broertjes
URL: https://github.com/apache/nutch/pull/311
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
index 38fde9f02..a3209894b 100755
--- a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
@@ -157,7 +157,11 @@ public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
    * @return the date as a long.
    */
   public long calculateLastFetchTime(CrawlDatum datum) {
-    return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000;
+    if (datum.getStatus() == CrawlDatum.STATUS_DB_UNFETCHED) {
+      return 0L;
+    } else {
+      return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000;
+    }
   }
 
   /**
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
index d8756fd5e..475ee855d 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
@@ -31,6 +31,7 @@
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Job;
@@ -70,8 +71,6 @@
 
   public static class Merger extends
       Reducer<Text, CrawlDatum, Text, CrawlDatum> {
-    private org.apache.hadoop.io.MapWritable meta;
-    private CrawlDatum res = new CrawlDatum();
     private FetchSchedule schedule;
 
     public void close() throws IOException {
@@ -85,40 +84,40 @@ public void setup(Reducer.Context context) {
     public void reduce(Text key, Iterable<CrawlDatum> values,
         Context context)
         throws IOException, InterruptedException {
-      long resTime = 0L;
-      boolean resSet = false;
-      long valTime = 0L;
-      meta = new org.apache.hadoop.io.MapWritable();
+
+      CrawlDatum res = new CrawlDatum();
+      res.setFetchTime(-1); // We want everything to be newer!
+      MapWritable meta = new MapWritable();
+
       for (CrawlDatum val : values) {
-        if (!resSet) {
-          res.set(val);
-          resSet = true;
-          resTime = schedule.calculateLastFetchTime(res);
-          for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) {
-            meta.put(e.getKey(), e.getValue());
-          }
-          continue;
-        }
-        // compute last fetch time, and pick the latest
-        valTime = schedule.calculateLastFetchTime(val);
-        if (valTime > resTime) {
+        if (isNewer(res, val)) {
           // collect all metadata, newer values override older values
-          for (Entry<Writable, Writable> e : val.getMetaData().entrySet()) {
-            meta.put(e.getKey(), e.getValue());
-          }
+          meta = mergeMeta(val.getMetaData(), meta);
           res.set(val);
-          resTime = valTime;
         } else {
-          // insert older metadata before newer
-          for (Entry<Writable, Writable> e : meta.entrySet()) {
-            val.getMetaData().put(e.getKey(), e.getValue());
-          }
-          meta = val.getMetaData();
+          // overwrite older metadata with current metadata
+          meta = mergeMeta(meta, val.getMetaData());
         }
       }
+
       res.setMetaData(meta);
       context.write(key, res);
     }
+
+    // Determine which CrawlDatum is the latest, according to calculateLastFetchTime() 
+    // and getFetchTime() as fallback in case calculateLastFetchTime()s are equal (eg: DB_UNFETCHED)
+    private boolean isNewer(CrawlDatum cd1, CrawlDatum cd2) {
+      return schedule.calculateLastFetchTime(cd2) > schedule.calculateLastFetchTime(cd1)
+        || schedule.calculateLastFetchTime(cd2) == schedule.calculateLastFetchTime(cd1) 
+        && cd2.getFetchTime() > cd1.getFetchTime();
+    }
+
+    private MapWritable mergeMeta(MapWritable from, MapWritable to) {
+      for (Entry<Writable, Writable> e : from.entrySet()) {
+        to.put(e.getKey(), e.getValue());
+      }
+      return to;
+    }
   }
 
   public CrawlDbMerger() {


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> MergeDB incorrectly handles unfetched CrawlDatums
> -------------------------------------------------
>
>                 Key: NUTCH-2565
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2565
>             Project: Nutch
>          Issue Type: Bug
>          Components: crawldb
>    Affects Versions: 1.14
>            Reporter: Jurian Broertjes
>            Priority: Minor
>             Fix For: 1.15
>
>
> I ran into this issue when merging a crawlDB originating from sitemaps into our normal
> crawlDB. CrawlDatums are merged based on the output of AbstractFetchSchedule::calculateLastFetchTime().
> When CrawlDatums are unfetched, this can overwrite fetchTime or other data.
> I assume this is a bug and have a simple fix for it that checks whether the CrawlDatum has status
> db_unfetched.
>  



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Mime
View raw message