nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (NUTCH-1465) Support sitemaps in Nutch
Date Thu, 27 Apr 2017 13:30:05 GMT

    [ https://issues.apache.org/jira/browse/NUTCH-1465?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15986615#comment-15986615
] 

ASF GitHub Bot commented on NUTCH-1465:
---------------------------------------

sebastian-nagel commented on a change in pull request #189: NUTCH-1465 Support sitemaps in
Nutch
URL: https://github.com/apache/nutch/pull/189#discussion_r113689552
 
 

 ##########
 File path: src/java/org/apache/nutch/util/SitemapProcessor.java
 ##########
 @@ -0,0 +1,436 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.IOException;
+import java.net.URL;
+import java.text.SimpleDateFormat;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
+import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
+import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.hostdb.HostDatum;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.sitemaps.AbstractSiteMap;
+import crawlercommons.sitemaps.SiteMap;
+import crawlercommons.sitemaps.SiteMapIndex;
+import crawlercommons.sitemaps.SiteMapParser;
+import crawlercommons.sitemaps.SiteMapURL;
+
+/**
+ * <p>Performs Sitemap processing by fetching sitemap links, parsing the content and
merging
+ * the urls from Sitemap (with the metadata) with the existing crawldb.</p>
+ *
+ * <p>There are two use cases supported in Nutch's Sitemap processing:</p>
+ * <ol>
+ *  <li>Sitemaps are considered as "remote seed lists". Crawl administrators can prepare
a
+ *     list of sitemap links and get only those sitemap pages. This suits well for targeted
+ *     crawl of specific hosts.</li>
+ *  <li>For open web crawl, it is not possible to track each host and get the sitemap
links
+ *     manually. Nutch would automatically get the sitemaps for all the hosts seen in the
+ *     crawls and inject the urls from sitemap to the crawldb.</li>
+ * </ol>
+ *
+ * <p>For more details see:
+ *      https://wiki.apache.org/nutch/SitemapFeature </p>
+ */
+public class SitemapProcessor extends Configured implements Tool {
+  public static final Logger LOG = LoggerFactory.getLogger(SitemapProcessor.class);
+  public static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
+
+  public static final String CURRENT_NAME = "current";
+  public static final String LOCK_NAME = ".locked";
+  public static final String SITEMAP_STRICT_PARSING = "sitemap.strict.parsing";
+  public static final String SITEMAP_URL_FILTERING = "sitemap.url.filter";
+  public static final String SITEMAP_URL_NORMALIZING = "sitemap.url.normalize";
+
+  private static class SitemapMapper extends Mapper<Text, Writable, Text, CrawlDatum>
{
+    private ProtocolFactory protocolFactory = null;
+    private boolean strict = true;
+    private boolean filter = true;
+    private boolean normalize = true;
+    private URLFilters filters = null;
+    private URLNormalizers normalizers = null;
+    private CrawlDatum datum = new CrawlDatum();
+    private SiteMapParser parser = null;
+
+    public void setup(Context context) {
+      Configuration conf = context.getConfiguration();
+      this.protocolFactory = new ProtocolFactory(conf);
+      this.filter = conf.getBoolean(SITEMAP_URL_FILTERING, true);
+      this.normalize = conf.getBoolean(SITEMAP_URL_NORMALIZING, true);
+      this.strict = conf.getBoolean(SITEMAP_STRICT_PARSING, true);
+      this.parser = new SiteMapParser(strict);
+
+      if (filter)
+        filters = new URLFilters(conf);
+      if (normalize)
+        normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
+    }
+
+    public void map(Text key, Writable value, Context context) throws IOException, InterruptedException
{
+      String url;
+
+      try {
+        if (value instanceof CrawlDatum) {
+          // If its an entry from CrawlDb, emit it. It will be merged in the reducer
+          context.write(key, (CrawlDatum) value);
+        }
+        else if (value instanceof HostDatum) {
+          // For entry from hostdb, get sitemap url(s) from robots.txt, fetch the sitemap,
+          // extract urls and emit those
+
+          // try different combinations of schemes one by one till we get rejection in all
cases
+          String host = key.toString();
+          if((url = filterNormalize("http://" + host + "/")) == null &&
+              (url = filterNormalize("https://" + host + "/")) == null &&
+              (url = filterNormalize("ftp://" + host + "/")) == null &&
+              (url = filterNormalize("file:/" + host + "/")) == null) {
+            context.getCounter("Sitemap", "filtered_records").increment(1);
+            return;
+          }
+
+          BaseRobotRules rules = protocolFactory.getProtocol(url).getRobotRules(new Text(url),
datum, new LinkedList<>());
 
 Review comment:
   It's safe to pass null unless you want to use the robots.txt content.
 
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Support sitemaps in Nutch
> -------------------------
>
>                 Key: NUTCH-1465
>                 URL: https://issues.apache.org/jira/browse/NUTCH-1465
>             Project: Nutch
>          Issue Type: New Feature
>          Components: parser
>            Reporter: Lewis John McGibbney
>            Assignee: Lewis John McGibbney
>             Fix For: 1.14
>
>         Attachments: NUTCH-1465-sitemapinjector-trunk-v1.patch, NUTCH-1465-trunk.v1.patch,
NUTCH-1465-trunk.v2.patch, NUTCH-1465-trunk.v3.patch, NUTCH-1465-trunk.v4.patch, NUTCH-1465-trunk.v5.patch
>
>
> I recently came across this rather stagnant codebase[0] which is ASL v2.0 licensed and
appears to have been used successfully to parse sitemaps as per the discussion here[1].
> [0] http://sourceforge.net/projects/sitemap-parser/
> [1] http://lucene.472066.n3.nabble.com/Support-for-Sitemap-Protocol-and-Canonical-URLs-td630060.html



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

Mime
View raw message