nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (NUTCH-2365) HTTP Redirects to SubDomains don't get crawled if db.ignore.external.links.mode == byDomain
Date Mon, 18 Dec 2017 16:30:01 GMT

    [ https://issues.apache.org/jira/browse/NUTCH-2365?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16295210#comment-16295210
] 

ASF GitHub Bot commented on NUTCH-2365:
---------------------------------------

sebastian-nagel closed pull request #264: NUTCH-2365 Fetcher to respect db.ignore.external.links.mode
for redirects
URL: https://github.com/apache/nutch/pull/264
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index f386527a2..3d12be282 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -572,7 +572,7 @@
   <value>false</value>
   <description>If true, outlinks leading from a page to internal hosts or domain
   will be ignored. This is an effective way to limit the crawl to include
-  only initially injected hosts, without creating complex URLFilters.
+  only initially injected hosts or domains, without creating complex URLFilters.
   See 'db.ignore.external.links.mode'.
   </description>
 </property>
@@ -582,11 +582,21 @@
   <value>false</value>
   <description>If true, outlinks leading from a page to external hosts or domain
   will be ignored. This is an effective way to limit the crawl to include
-  only initially injected hosts, without creating complex URLFilters.
+  only initially injected hosts or domains, without creating complex URLFilters.
   See 'db.ignore.external.links.mode'.
   </description>
 </property>
 
+<property>
+  <name>db.ignore.also.redirects</name>
+  <value>true</value>
+  <description>If true, the fetcher checks redirects the same way as
+  links when ignoring internal or external links. Set to false to
+  follow redirects despite the values for db.ignore.external.links and
+  db.ignore.internal.links.
+  </description>
+</property>
+
 <property>
   <name>db.ignore.external.links.mode</name>
   <value>byHost</value>
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 42d5d5077..6c70186a6 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -92,6 +92,7 @@
   private int redirectCount;
   private boolean ignoreInternalLinks;
   private boolean ignoreExternalLinks;
+  private boolean ignoreAlsoRedirects;
   private String ignoreExternalLinksMode;
 
   // Used by fetcher.follow.outlinks.depth in parse
@@ -207,6 +208,7 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
     interval = conf.getInt("db.fetch.interval.default", 2592000);
     ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false);
     ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
+    ignoreAlsoRedirects = conf.getBoolean("db.ignore.also.redirects", true);
     ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost");
     maxOutlinkDepth = conf.getInt("fetcher.follow.outlinks.depth", -1);
     outlinksIgnoreExternal = conf.getBoolean(
@@ -484,69 +486,72 @@ private Text handleRedirect(Text url, CrawlDatum datum, String urlString,
     newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
     newUrl = urlFilters.filter(newUrl);
 
-    try {
-      String origHost = new URL(urlString).getHost().toLowerCase();
-      String newHost = new URL(newUrl).getHost().toLowerCase();
-      if (ignoreExternalLinks) {
-        if (!origHost.equals(newHost)) {
-          if (LOG.isDebugEnabled()) {
-            LOG.debug(" - ignoring redirect " + redirType + " from "
-                + urlString + " to " + newUrl
-                + " because external links are ignored");
+    if (newUrl == null || newUrl.equals(urlString)) {
+      LOG.debug(" - {} redirect skipped: {}", redirType,
+          (newUrl != null ? "to same url" : "filtered"));
+      return null;
+    }
+
+    if (ignoreAlsoRedirects && (ignoreExternalLinks || ignoreInternalLinks)) {
+      try {
+        URL origUrl = new URL(urlString);
+        URL redirUrl = new URL(newUrl);
+        if (ignoreExternalLinks) {
+          String origHostOrDomain, newHostOrDomain;
+          if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+            origHostOrDomain = URLUtil.getDomainName(origUrl).toLowerCase();
+            newHostOrDomain = URLUtil.getDomainName(redirUrl).toLowerCase();
+          } else {
+            // byHost
+            origHostOrDomain = origUrl.getHost().toLowerCase();
+            newHostOrDomain = redirUrl.getHost().toLowerCase();
           }
-          return null;
-        }
-      }
-      
-      if (ignoreInternalLinks) {
-        if (origHost.equals(newHost)) {
-          if (LOG.isDebugEnabled()) {
-            LOG.debug(" - ignoring redirect " + redirType + " from "
-                + urlString + " to " + newUrl
-                + " because internal links are ignored");
+          if (!origHostOrDomain.equals(newHostOrDomain)) {
+            LOG.debug(
+                " - ignoring redirect {} from {} to {} because external links are ignored",
+                redirType, urlString, newUrl);
+            return null;
           }
-          return null;
-        }
-      }
-    } catch (MalformedURLException e) { }
-    
-    if (newUrl != null && !newUrl.equals(urlString)) {
-      reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
-      url = new Text(newUrl);
-      if (maxRedirect > 0) {
-        redirecting = true;
-        redirectCount++;
-        if (LOG.isDebugEnabled()) {
-          LOG.debug(" - " + redirType + " redirect to " + url
-              + " (fetching now)");
-        }
-        return url;
-      } else {
-        CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
-            datum.getFetchInterval(), datum.getScore());
-        // transfer existing metadata
-        newDatum.getMetaData().putAll(datum.getMetaData());
-        try {
-          scfilters.initialScore(url, newDatum);
-        } catch (ScoringFilterException e) {
-          e.printStackTrace();
-        }
-        if (reprUrl != null) {
-          newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
-              new Text(reprUrl));
         }
-        output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
-        if (LOG.isDebugEnabled()) {
-          LOG.debug(" - " + redirType + " redirect to " + url
-              + " (fetching later)");
+
+        if (ignoreInternalLinks) {
+          String origHost = origUrl.getHost().toLowerCase();
+          String newHost = redirUrl.getHost().toLowerCase();
+          if (origHost.equals(newHost)) {
+            LOG.debug(
+                " - ignoring redirect {} from {} to {} because internal links are ignored",
+                redirType, urlString, newUrl);
+            return null;
+          }
         }
+      } catch (MalformedURLException e) {
         return null;
       }
+    }
+
+    reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
+    url = new Text(newUrl);
+    if (maxRedirect > 0) {
+      redirecting = true;
+      redirectCount++;
+      LOG.debug(" - {} redirect to {} (fetching now)", redirType, url);
+      return url;
     } else {
-      if (LOG.isDebugEnabled()) {
-        LOG.debug(" - " + redirType + " redirect skipped: "
-            + (newUrl != null ? "to same url" : "filtered"));
+      CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
+          datum.getFetchInterval(), datum.getScore());
+      // transfer existing metadata
+      newDatum.getMetaData().putAll(datum.getMetaData());
+      try {
+        scfilters.initialScore(url, newDatum);
+      } catch (ScoringFilterException e) {
+        e.printStackTrace();
+      }
+      if (reprUrl != null) {
+        newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+            new Text(reprUrl));
       }
+      output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
+      LOG.debug(" - {} redirect to {} (fetching later)", redirType, url);
       return null;
     }
   }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> HTTP Redirects to SubDomains don't get crawled if db.ignore.external.links.mode == byDomain
> -------------------------------------------------------------------------------------------
>
>                 Key: NUTCH-2365
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2365
>             Project: Nutch
>          Issue Type: Bug
>          Components: fetcher
>    Affects Versions: 1.12
>         Environment: Fedora 25
>            Reporter: Sriram Nookala
>            Assignee: Sebastian Nagel
>             Fix For: 1.14
>
>
> When crawling the domain http://www.mercenarytrader.com, which redirects to
> https://members.mercenarytrader.com, the redirect is not followed by Nutch
> even though 'db.ignore.external.links' is set to 'true' and
> 'db.ignore.external.links.mode' is set to 'byDomain'.
>   The bug is in FetcherThread where the comparison is by host and not by domain
> String origHost = new URL(urlString).getHost().toLowerCase();
> >       String newHost = new URL(newUrl).getHost().toLowerCase();
> >       if (ignoreExternalLinks) {
> >         if (!origHost.equals(newHost)) {
> >           if (LOG.isDebugEnabled()) {
> >             LOG.debug(" - ignoring redirect " + redirType + " from "
> >                 + urlString + " to " + newUrl
> >                 + " because external links are ignored");
> >           }
> >           return null;
> >         }
> >       }



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Mime
View raw message