nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (NUTCH-2451) protocol-ftp to resolve relative URL when following redirects
Date Tue, 05 Dec 2017 11:11:00 GMT

    [ https://issues.apache.org/jira/browse/NUTCH-2451?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16278408#comment-16278408 ]

ASF GitHub Bot commented on NUTCH-2451:
---------------------------------------

sebastian-nagel closed pull request #241: NUTCH-2451 protocol-ftp to resolve relative URL
when following redirects
URL: https://github.com/apache/nutch/pull/241
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
index 309c2a4d2..a4f5786b2 100644
--- a/src/java/org/apache/nutch/plugin/PluginManifestParser.java
+++ b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
@@ -30,6 +30,7 @@
 import javax.xml.parsers.ParserConfigurationException;
 
 import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import org.apache.hadoop.conf.Configuration;
 import org.w3c.dom.Document;
@@ -49,7 +50,8 @@
   private static final String ATTR_CLASS = "class";
   private static final String ATTR_ID = "id";
 
-  public static final Logger LOG = PluginRepository.LOG;
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(PluginManifestParser.class);
 
   private static final boolean WINDOWS = System.getProperty("os.name")
       .startsWith("Windows");
@@ -71,7 +73,8 @@ public PluginManifestParser(Configuration conf,
    *          folders to search plugins from
    * @return A {@link Map} of all found {@link PluginDescriptor}s.
    */
-  public Map<String, PluginDescriptor> parsePluginFolder(String[] pluginFolders) {
+  public Map<String, PluginDescriptor> parsePluginFolder(
+      String[] pluginFolders) {
     Map<String, PluginDescriptor> map = new HashMap<>();
 
     if (pluginFolders == null) {
@@ -158,8 +161,8 @@ private PluginDescriptor parseManifestFile(String pManifestPath)
    * @throws ParserConfigurationException
    * @throws DocumentException
    */
-  private Document parseXML(URL url) throws SAXException, IOException,
-      ParserConfigurationException {
+  private Document parseXML(URL url)
+      throws SAXException, IOException, ParserConfigurationException {
     DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
     DocumentBuilder builder = factory.newDocumentBuilder();
     return builder.parse(url.openStream());
@@ -220,8 +223,8 @@ private void parseRequires(Element pRootElement, PluginDescriptor pDescriptor)
    * @param pDescriptor
    * @throws MalformedURLException
    */
-  private void parseLibraries(Element pRootElement, PluginDescriptor pDescriptor)
-      throws MalformedURLException {
+  private void parseLibraries(Element pRootElement,
+      PluginDescriptor pDescriptor) throws MalformedURLException {
     NodeList nodelist = pRootElement.getElementsByTagName("runtime");
     if (nodelist.getLength() > 0) {
 
diff --git a/src/java/org/apache/nutch/plugin/PluginRepository.java b/src/java/org/apache/nutch/plugin/PluginRepository.java
index 50daa57fd..02ccb92ec 100644
--- a/src/java/org/apache/nutch/plugin/PluginRepository.java
+++ b/src/java/org/apache/nutch/plugin/PluginRepository.java
@@ -21,6 +21,9 @@
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
+import java.net.URL;
+import java.net.URLStreamHandler;
+import java.net.URLStreamHandlerFactory;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -43,8 +46,13 @@
  * descriptor represents all meta information about a plugin. So a plugin
  * instance will be created later when it is required, this allow lazy plugin
  * loading.
+ *
+ * As protocol-plugins need to be registered with the JVM as well, this class
+ * also acts as URLStreamHandlerFactory that registers with the JVM and supports
+ * all the new protocols as if they were native. Details how the JVM creates
+ * URLs can be seen in the API documentation for the URL constructor.
  */
-public class PluginRepository {
+public class PluginRepository implements URLStreamHandlerFactory {
   private static final WeakHashMap<String, PluginRepository> CACHE = new WeakHashMap<>();
 
   private boolean auto;
@@ -90,9 +98,12 @@ public PluginRepository(Configuration conf) throws RuntimeException {
     try {
       installExtensions(fRegisteredPlugins);
     } catch (PluginRuntimeException e) {
-      LOG.error(e.toString());
+      LOG.error("Could not install extensions.", e.toString());
       throw new RuntimeException(e.getMessage());
     }
+
+    registerURLStreamHandlerFactory();
+
     displayStatus();
   }
 
@@ -120,7 +131,7 @@ private void installExtensionPoints(List<PluginDescriptor> plugins) {
     for (PluginDescriptor plugin : plugins) {
       for (ExtensionPoint point : plugin.getExtenstionPoints()) {
         String xpId = point.getId();
-        LOG.debug("Adding extension point " + xpId);
+        LOG.debug("Adding extension point {}", xpId);
         fExtensionPoints.put(xpId, point);
       }
     }
@@ -137,9 +148,8 @@ private void installExtensions(List<PluginDescriptor> pRegisteredPlugins)
         String xpId = extension.getTargetPoint();
         ExtensionPoint point = getExtensionPoint(xpId);
         if (point == null) {
-          throw new PluginRuntimeException("Plugin ("
-              + descriptor.getPluginId() + "), " + "extension point: " + xpId
-              + " does not exist.");
+          throw new PluginRuntimeException("Plugin (" + descriptor.getPluginId()
+              + "), " + "extension point: " + xpId + " does not exist.");
         }
         point.addExtension(extension);
       }
@@ -149,8 +159,8 @@ private void installExtensions(List<PluginDescriptor> pRegisteredPlugins)
   private void getPluginCheckedDependencies(PluginDescriptor plugin,
       Map<String, PluginDescriptor> plugins,
       Map<String, PluginDescriptor> dependencies,
-      Map<String, PluginDescriptor> branch) throws MissingDependencyException,
-      CircularDependencyException {
+      Map<String, PluginDescriptor> branch)
+      throws MissingDependencyException, CircularDependencyException {
 
     if (dependencies == null) {
       dependencies = new HashMap<>();
@@ -164,8 +174,8 @@ private void getPluginCheckedDependencies(PluginDescriptor plugin,
     for (String id : plugin.getDependencies()) {
       PluginDescriptor dependency = plugins.get(id);
       if (dependency == null) {
-        throw new MissingDependencyException("Missing dependency " + id
-            + " for plugin " + plugin.getPluginId());
+        throw new MissingDependencyException(
+            "Missing dependency " + id + " for plugin " + plugin.getPluginId());
       }
       if (branch.containsKey(id)) {
         throw new CircularDependencyException("Circular dependency detected "
@@ -196,7 +206,8 @@ private void getPluginCheckedDependencies(PluginDescriptor plugin,
    * @return List
    */
   private List<PluginDescriptor> getDependencyCheckedPlugins(
-      Map<String, PluginDescriptor> filtered, Map<String, PluginDescriptor> all) {
+      Map<String, PluginDescriptor> filtered,
+      Map<String, PluginDescriptor> all) {
     if (filtered == null) {
       return null;
     }
@@ -223,8 +234,8 @@ private void getPluginCheckedDependencies(PluginDescriptor plugin,
    * @return PluginDescriptor[]
    */
   public PluginDescriptor[] getPluginDescriptors() {
-    return fRegisteredPlugins.toArray(new PluginDescriptor[fRegisteredPlugins
-        .size()]);
+    return fRegisteredPlugins
+        .toArray(new PluginDescriptor[fRegisteredPlugins.size()]);
   }
 
   /**
@@ -278,10 +289,10 @@ public Plugin getPluginInstance(PluginDescriptor pDescriptor)
       synchronized (pDescriptor) {
         Class<?> pluginClass = getCachedClass(pDescriptor,
             pDescriptor.getPluginClass());
-        Constructor<?> constructor = pluginClass.getConstructor(new Class<?>[] {
-            PluginDescriptor.class, Configuration.class });
-        Plugin plugin = (Plugin) constructor.newInstance(new Object[] {
-            pDescriptor, this.conf });
+        Constructor<?> constructor = pluginClass.getConstructor(
+            new Class<?>[] { PluginDescriptor.class, Configuration.class });
+        Plugin plugin = (Plugin) constructor
+            .newInstance(new Object[] { pDescriptor, this.conf });
         plugin.startUp();
         fActivatedPlugins.put(pDescriptor.getPluginId(), plugin);
         return plugin;
@@ -336,14 +347,14 @@ public Class getCachedClass(PluginDescriptor pDescriptor, String className)
   }
 
   private void displayStatus() {
-    LOG.info("Plugin Auto-activation mode: [" + this.auto + "]");
+    LOG.info("Plugin Auto-activation mode: [{}]", this.auto);
     LOG.info("Registered Plugins:");
 
     if ((fRegisteredPlugins == null) || (fRegisteredPlugins.size() == 0)) {
       LOG.info("\tNONE");
     } else {
       for (PluginDescriptor plugin : fRegisteredPlugins) {
-        LOG.info("\t" + plugin.getName() + " (" + plugin.getPluginId() + ")");
+        LOG.info("\t{} ({})", plugin.getName(), plugin.getPluginId());
       }
     }
 
@@ -352,7 +363,7 @@ private void displayStatus() {
       LOG.info("\tNONE");
     } else {
       for (ExtensionPoint ep : fExtensionPoints.values()) {
-        LOG.info("\t" + ep.getName() + " (" + ep.getId() + ")");
+        LOG.info("\t ({})", ep.getName(), ep.getId());
       }
     }
   }
@@ -388,11 +399,11 @@ private void displayStatus() {
       }
 
       if (!includes.matcher(id).matches()) {
-        LOG.debug("not including: " + id);
+        LOG.debug("not including: {}", id);
         continue;
       }
       if (excludes.matcher(id).matches()) {
-        LOG.debug("excluding: " + id);
+        LOG.debug("excluding: {}", id);
         continue;
       }
       map.put(plugin.getPluginId(), plugin);
@@ -431,8 +442,8 @@ private void displayStatus() {
       }
 
       try {
-        ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
-            xPointId);
+        ExtensionPoint point = PluginRepository.get(conf)
+            .getExtensionPoint(xPointId);
         if (point == null)
           throw new RuntimeException(xPointId + " not found.");
         Extension[] extensions = point.getExtensions();
@@ -450,9 +461,8 @@ private void displayStatus() {
         for (String orderedFilter : orderOfFilters) {
           Object f = filterMap.get(orderedFilter);
           if (f == null) {
-            LOG.error(clazz.getSimpleName() + " : " + orderedFilter
-                + " declared in configuration property " + orderProperty
-                + " but not found in an active plugin - ignoring.");
+            LOG.error("{} : {} declared in configuration property {} but not found in an active plugin - ignoring.", clazz.getSimpleName(), orderedFilter
+                , orderProperty);
             continue;
           }
           sorted.add(f);
@@ -461,8 +471,8 @@ private void displayStatus() {
         for (int i = 0; i < sorted.size(); i++) {
           filter[i] = sorted.get(i);
           if (LOG.isTraceEnabled()) {
-            LOG.trace(clazz.getSimpleName() + " : filters[" + i + "] = "
-                + filter[i].getClass());
+            LOG.trace("{} : filters[{}] = {}", clazz.getSimpleName() , i,
+                filter[i].getClass());
           }
         }
         objectCache.setObject(clazz.getName(), filter);
@@ -487,8 +497,8 @@ private void displayStatus() {
    */
   public static void main(String[] args) throws Exception {
     if (args.length < 2) {
-      System.err
-          .println("Usage: PluginRepository pluginId className [arg1 arg2 ...]");
+      System.err.println(
+          "Usage: PluginRepository pluginId className [arg1 arg2 ...]");
       return;
     }
     Configuration conf = NutchConfiguration.create();
@@ -505,8 +515,8 @@ public static void main(String[] args) throws Exception {
     try {
       clazz = Class.forName(args[1], true, cl);
     } catch (Exception e) {
-      System.err.println("Could not load the class '" + args[1] + ": "
-          + e.getMessage());
+      System.err.println(
+          "Could not load the class '" + args[1] + ": " + e.getMessage());
       return;
     }
     Method m = null;
@@ -521,4 +531,103 @@ public static void main(String[] args) throws Exception {
     System.arraycopy(args, 2, subargs, 0, subargs.length);
     m.invoke(null, new Object[] { subargs });
   }
+
+  /**
+   * Registers this PluginRepository to be invoked whenever URLs have to be
+   * parsed. This allows to check the registered protocol plugins for uncommon
+   * protocols.
+   */
+  private void registerURLStreamHandlerFactory() {
+    org.apache.nutch.plugin.URLStreamHandlerFactory.getInstance().registerPluginRepository(this);
+  }
+
+  /**
+   * Invoked whenever a java.net.URL needs to be instantiated. Tries to find a
+   * suitable extension and allow it to provide a URLStreamHandler. This is done
+   * by several attempts:
+   * <ul>
+   * <li>Find a protocol plugin that implements the desired protocol. If found,
+   * instantiate it so eventually the plugin can install a URLStreamHandler
+   * through a static hook.</li>
+   * <li>If the plugin specifies a URLStreamHandler in its <tt>plugin.xml</tt>,
+   * return an instance of this URLStreamHandler. Example:
+   * 
+   * <pre>
+   *  ...
+   *  &lt;implementation id="org.apache.nutch.protocol.foo.Foo" class="org.apache.nutch.protocol.foo.Foo"&gt;
+   *      &lt;parameter name="protocolName" value="foo"/&gt;
+   *      &lt;parameter name="urlStreamHandler" value="org.apache.nutch.protocol.foo.Handler"/&gt;
+   *  &lt;/implementation&gt;
+   *  ...
+   * </pre>
+   * 
+   * </li>
+   * <li>if all else fails, return null. This will fallback to the JVM's method
+   * of evaluating the system property <tt>java.protocol.handler.pkgs</tt>.</li>
+   * </ul>
+   * 
+   * @return the URLStreamHandler found, or null.
+   * @see java.net.URL
+   */
+  public URLStreamHandler createURLStreamHandler(String protocol) {
+    LOG.debug("createURLStreamHandler({})", protocol);
+
+    if (fExtensionPoints != null) {
+      ExtensionPoint ep = fExtensionPoints
+          .get("org.apache.nutch.protocol.Protocol");
+      if (ep != null) {
+        Extension[] extensions = ep.getExtensions();
+        for (Extension extension : extensions) {
+          String p = extension.getAttribute("protocolName");
+          LOG.trace("Found {}", p);
+          if (p.equals(protocol)) {
+            LOG.debug("suitable {}", p);
+
+            // instantiate the plugin. This allows it to execute a static hook,
+            // if present. Extensions and PluginInstances are cached already, so we
+            // should not create too many instances
+            Object extinst = null;
+            try {
+              extinst = extension.getExtensionInstance();
+              LOG.debug("found {}", extinst.getClass().getName());
+            } catch (Exception e) {
+              LOG.warn("Could not find {}", extension.getId(), e);
+            }
+
+            // return the handler here, if possible
+            String handlerClass = extension.getAttribute("urlStreamHandler");
+            LOG.debug("urlStreamHandler={}", handlerClass);
+            if (handlerClass != null) {
+              // instantiate the handler and return it
+              ClassLoader cl = this.getClass().getClassLoader(); // the nutch
+                                                                 // classloader
+              LOG.trace("Using nutch classloader {}", cl);
+              if (extinst != null) {
+                cl = extinst.getClass().getClassLoader(); // the extension's
+                                                          // classloader
+                LOG.trace("Using extension classloader {}", cl);
+              }
+
+              try {
+                Class clazz = cl.loadClass(handlerClass);
+                return (URLStreamHandler) clazz.newInstance();
+              } catch (Exception e) {
+                LOG.error("Could not instantiate protocol {} handler class {} defined by extension {}", protocol, handlerClass, extension.getId(), e);
+                return null;
+              }
+            }
+
+            LOG.debug(
+                "suitable protocol extension found that did not declare a handler");
+            return null;
+          }
+        }
+        LOG.debug("No suitable protocol extensions registered");
+      } else {
+        LOG.debug("No protocol extensions registered?");
+      }
+    }
+
+    return null;
+  }
 }
diff --git a/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java
new file mode 100644
index 000000000..7b05d5885
--- /dev/null
+++ b/src/java/org/apache/nutch/plugin/URLStreamHandlerFactory.java
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.plugin;
+
+import java.lang.ref.WeakReference;
+import java.net.URL;
+import java.net.URLStreamHandler;
+import java.util.ArrayList;
+
+import org.mortbay.log.Log;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This URLStreamHandlerFactory knows about all the plugins
+ * in use and thus can create the correct URLStreamHandler
+ * even if it comes from a plugin classpath.
+ * As the JVM allows only one instance of URLStreamHandlerFactory
+ * to be registered, this class implements a singleton pattern.
+ * @author Hiran Chaudhuri
+ *
+ */
+public class URLStreamHandlerFactory
+    implements java.net.URLStreamHandlerFactory {
+  
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(URLStreamHandlerFactory.class);
+  
+  /** The singleton instance. */
+  private static URLStreamHandlerFactory instance;
+  
+  /** Here we register all PluginRepositories.
+   * In this class we do not know why several instances of PluginRepository
+   * are kept, nor do we know how long they will be used. To prevent
+   * a memory leak, this class must not keep references to PluginRepository
+   * but use WeakReference which allows PluginRepository to still be
+   * garbage collected. The price is that we need to clean the list of
+   * outdated references, which is done in the {@link #removeInvalidRefs} method.
+   */
+  private ArrayList<WeakReference<PluginRepository>> prs;
+  
+  static {
+    instance = new URLStreamHandlerFactory();
+    URL.setURLStreamHandlerFactory(instance);
+    LOG.info("Registered URLStreamHandlerFactory with the JVM.");
+  }
+  
+  private URLStreamHandlerFactory() {
+    LOG.debug("URLStreamHandlerFactory()");
+    prs = new ArrayList<>();
+  }
+
+  /** Return the singleton instance of this class. */
+  public static URLStreamHandlerFactory getInstance() {
+    LOG.debug("getInstance()");
+    return instance;
+  }
+  
+  /** Use this method once a new PluginRepository was created to register it.
+   * 
+   * @param pr The PluginRepository to be registered.
+   */
+  public void registerPluginRepository(PluginRepository pr) {
+    LOG.debug("registerPluginRepository(...)");
+    prs.add(new WeakReference<PluginRepository>(pr));
+    
+    removeInvalidRefs();
+  }
+
+  @Override
+  public URLStreamHandler createURLStreamHandler(String protocol) {
+    LOG.debug("createURLStreamHandler({})", protocol);
+    
+    removeInvalidRefs();
+    
+    // find the 'correct' PluginRepository. For now we simply take the first.
+    // then ask it to return the URLStreamHandler
+
+    for(WeakReference<PluginRepository> ref: prs) {
+      PluginRepository pr = ref.get();
+      if(pr != null) {
+        // found PluginRepository. Let's get the URLStreamHandler...
+        return pr.createURLStreamHandler(protocol);
+      }
+    }
+    return null;
+  }
+
+  /** Maintains the list of PluginRepositories by
+   * removing the references whose referents have been
+   * garbage collected meanwhile.
+   */
+  private void removeInvalidRefs() {
+    LOG.debug("removeInvalidRefs()");
+    ArrayList<WeakReference<PluginRepository>> copy = new ArrayList<>(prs);
+    for(WeakReference<PluginRepository> ref: copy) {
+      if(ref.get() == null) {
+        prs.remove(ref);
+      }
+    }
+    LOG.debug("removed {} references, remaining {}", copy.size()-prs.size(), prs.size());
+  }
+}
diff --git a/src/java/org/apache/nutch/util/NutchTool.java b/src/java/org/apache/nutch/util/NutchTool.java
index bb391d8e2..87c29d334 100644
--- a/src/java/org/apache/nutch/util/NutchTool.java
+++ b/src/java/org/apache/nutch/util/NutchTool.java
@@ -26,6 +26,7 @@
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.plugin.PluginRepository;
 
 public abstract class NutchTool extends Configured {
 
@@ -49,6 +50,14 @@ public NutchTool(Configuration conf){
   public NutchTool(){
     super(null);
   }
+  
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if(conf != null) {
+      PluginRepository.get(conf);
+    }
+  }
 
   /** Returns relative progress of the tool, a float in range [0,1]. */
   public float getProgress() {
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 5402d036c..f9eb9f2fc 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -65,6 +65,7 @@
     <ant dir="parsefilter-naivebayes" target="deploy"/>
     <ant dir="parsefilter-regex" target="deploy"/>
     <ant dir="protocol-file" target="deploy"/>
+    <ant dir="protocol-foo" target="deploy" />
     <ant dir="protocol-ftp" target="deploy"/>
     <ant dir="protocol-htmlunit" target="deploy" />
     <ant dir="protocol-http" target="deploy"/>
@@ -190,6 +191,7 @@
     <ant dir="parsefilter-naivebayes" target="clean" />
     <ant dir="parsefilter-regex" target="clean"/>
     <ant dir="protocol-file" target="clean"/>
+    <ant dir="protocol-foo" target="clean" />
     <ant dir="protocol-ftp" target="clean"/>
     <ant dir="protocol-htmlunit" target="clean" />
     <ant dir="protocol-http" target="clean"/>
diff --git a/src/plugin/protocol-foo/build.xml b/src/plugin/protocol-foo/build.xml
new file mode 100755
index 000000000..240f44864
--- /dev/null
+++ b/src/plugin/protocol-foo/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-foo" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
diff --git a/src/plugin/protocol-foo/ivy.xml b/src/plugin/protocol-foo/ivy.xml
new file mode 100755
index 000000000..1a86d6803
--- /dev/null
+++ b/src/plugin/protocol-foo/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>
diff --git a/src/plugin/protocol-foo/plugin.xml b/src/plugin/protocol-foo/plugin.xml
new file mode 100755
index 000000000..850afe33f
--- /dev/null
+++ b/src/plugin/protocol-foo/plugin.xml
@@ -0,0 +1,48 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<plugin
+   id="protocol-foo"
+   name="Foo Protocol Example Plug-in"
+   version="1.0.0"
+   provider-name="Hiran Chaudhuri">
+
+   <runtime>
+      <library name="protocol-foo.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.protocol.foo"
+              name="FooProtocol"
+              point="org.apache.nutch.protocol.Protocol">
+
+      <implementation id="org.apache.nutch.protocol.foo.Foo"
+                      class="org.apache.nutch.protocol.foo.Foo">
+        <parameter name="protocolName" value="foo"/>
+		<parameter name="urlStreamHandler" value="org.apache.nutch.protocol.foo.Handler"/>
+      </implementation>
+      
+   </extension>
+
+</plugin>
diff --git a/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java
new file mode 100755
index 000000000..6cc01c11e
--- /dev/null
+++ b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Foo.java
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.foo;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.plugin.URLStreamHandlerFactory;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import crawlercommons.robots.BaseRobotRules;
+
+public class Foo implements Protocol {
+  protected static final Logger LOG = LoggerFactory.getLogger(Foo.class);
+
+  private Configuration conf;
+
+  @Override
+  public Configuration getConf() {
+    LOG.debug("getConf()");
+    return conf;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    LOG.debug("setConf(...)");
+    this.conf = conf;
+  }
+
+  /**
+   * This is a dummy implementation only. So what we will do is return this
+   * structure:
+   * 
+   * <pre>
+   * foo://example.com - will contain one directory and one file
+   * foo://example.com/a - directory, will contain two files
+   * foo://example.com/a/aa.txt - text file
+   * foo://example.com/a/ab.txt - text file
+   * foo://example.com/a.txt - text file
+   * </pre>
+   */
+  @Override
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+    LOG.debug("getProtocolOutput({}, {})", url, datum);
+
+    try {
+      String urlstr = String.valueOf(url);
+      URL u = new URL(urlstr);
+      URL base = new URL(u, ".");
+      byte[] bytes = new byte[0];
+      String contentType = "foo/something";
+      ProtocolStatus status = ProtocolStatus.STATUS_GONE;
+
+      switch (urlstr) {
+      case "foo://example.com":
+      case "foo://example.com/": {
+        String time = HttpDateFormat.toString(System.currentTimeMillis());
+        contentType = "text/html";
+        StringBuffer sb = new StringBuffer();
+        sb.append("<html><head>");
+        sb.append("<title>Index of /</title></head>\n");
+        sb.append("<body><h1>Index of /</h1><pre>\n");
+        sb.append("<a href='a/" + "'>a/</a>\t"+ time + "\t-\n"); // add directory
+        sb.append("<a href='a.txt'>a.txt</a>\t" + time + "\t" + 0 + "\n"); // add file
+        sb.append("</pre></html></body>");
+        bytes = sb.toString().getBytes();
+        status = ProtocolStatus.STATUS_SUCCESS;
+        break;
+      }
+      case "foo://example.com/a/": {
+        String time = HttpDateFormat.toString(System.currentTimeMillis());
+        contentType = "text/html";
+        StringBuffer sb = new StringBuffer();
+        sb.append("<html><head>");
+        sb.append("<title>Index of /a/</title></head>\n");
+        sb.append("<body><h1>Index of /a/</h1><pre>\n");
+        sb.append("<a href='aa.txt'>aa.txt</a>\t" + time + "\t" + 0 + "\n"); // add file
+        sb.append("<a href='ab.txt'>ab.txt</a>\t" + time + "\t" + 0 + "\n"); // add file
+        sb.append("</pre></html></body>");
+        bytes = sb.toString().getBytes();
+        status = ProtocolStatus.STATUS_SUCCESS;
+        break;
+      }
+      case "foo://example.com/a.txt":
+      case "foo://example.com/a/aa.txt":
+      case "foo://example.com/a/ab.txt": {
+        contentType = "text/plain";
+        bytes = "In publishing and graphic design, lorem ipsum is a filler text or greeking commonly used to demonstrate the textual elements of a graphic document or visual presentation. Replacing meaningful content with placeholder text allows designers to design the form of the content before the content itself has been produced.".getBytes();
+        status = ProtocolStatus.STATUS_SUCCESS;
+        break;
+      }
+      default:
+        LOG.warn("Unknown url '{}'. This dummy implementation only supports 'foo://example.com'", url);
+        // all our default values are set for URLs that do not exist.
+        break;
+      }
+
+      Metadata metadata = new Metadata();
+      Content content = new Content(String.valueOf(url), String.valueOf(base),
+          bytes, contentType, metadata, getConf());
+
+      return new ProtocolOutput(content, status);
+    } catch (MalformedURLException mue) {
+      LOG.error("Could not retrieve {}", url);
+      LOG.error("", mue);
+      // claim STATUS_GONE to tell nutch to never ever re-request this URL
+      return new ProtocolOutput(null, ProtocolStatus.STATUS_GONE);
+    }
+  }
+
+  @Override
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    LOG.debug(
+        "getRobotRules({}, {}, {})", url, datum, robotsTxtContent);
+    return RobotRulesParser.EMPTY_RULES;
+  }
+}
diff --git a/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java
new file mode 100644
index 000000000..27f18377b
--- /dev/null
+++ b/src/plugin/protocol-foo/src/java/org/apache/nutch/protocol/foo/Handler.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.foo;
+
+import java.net.URL;
+import java.net.URLConnection;
+import java.net.URLStreamHandler;
+
+public class Handler extends URLStreamHandler {
+
+  protected URLConnection openConnection(URL u) {
+    throw new UnsupportedOperationException("not yet implemented");
+  }
+}
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 84aa82368..ae7394139 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -36,6 +36,7 @@
 import crawlercommons.robots.BaseRobotRules;
 
 import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.List;
 import java.io.IOException;
@@ -142,7 +143,16 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
         } else if (code >= 300 && code < 400) { // handle redirect
           if (redirects == MAX_REDIRECTS)
             throw new FtpException("Too many redirects: " + url);
-          u = new URL(response.getHeader("Location"));
+          
+          String loc = response.getHeader("Location");
+          try {
+            u = new URL(u, loc);
+          }
+          catch(MalformedURLException mue) {
+            LOG.error("Could not create redirectURL for {} with {}", url, loc);
+            throw mue;
+          }
+          
           redirects++;
           if (LOG.isTraceEnabled()) {
             LOG.trace("redirect to " + u);
@@ -152,6 +162,7 @@ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
         }
       }
     } catch (Exception e) {
+      LOG.error("Could not get protocol output for {}", url, e);
       return new ProtocolOutput(null, new ProtocolStatus(e));
     }
   }


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> protocol-ftp to resolve relative URL when following redirects
> -------------------------------------------------------------
>
>                 Key: NUTCH-2451
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2451
>             Project: Nutch
>          Issue Type: Bug
>          Components: protocol
>    Affects Versions: 1.13
>         Environment: Ubuntu 16.04.3 LTS
> OpenJDK 1.8.0_131
> nutch 1.14-SNAPSHOT
> Synology RS816
>            Reporter: Hiran Chaudhuri
>
> I tried running Nutch on my Synology NAS. As SMB protocol is not contained in Nutch, I turned on FTP service on the NAS and configured Nutch to crawl ftp://nas.
> The experience gives me varying results which seem to point to problems within Nutch. However this may need further evaluation.
> As some files could not be downloaded and I could not see a good error message I changed the method org.apache.nutch.protocol.ftp.FTP.getProtocolOutput(Text, CrawlDatum) to not only return protocol status but send the full exception and stack trace to the logs:
> {{    } catch (Exception e) {
>     	LOG.warn("Could not get {}", url, e);
>       return new ProtocolOutput(null, new ProtocolStatus(e));
>     }
> }}
> With this modification I suddenly see such messages in the logfile:
> {{2017-10-25 22:09:31,865 TRACE org.apache.nutch.protocol.ftp.Ftp - fetching ftp://nas/MediaPC/usr/lib32/gconv/ARMSCII-8.so
> 2017-10-25 22:09:32,147 WARN  org.apache.nutch.protocol.ftp.Ftp - Could not get ftp://nas/MediaPC/usr/lib32/gconv/ARMSCII-8.so
> java.net.MalformedURLException
> 	at java.net.URL.<init>(URL.java:627)
> 	at java.net.URL.<init>(URL.java:490)
> 	at java.net.URL.<init>(URL.java:439)
> 	at org.apache.nutch.protocol.ftp.Ftp.getProtocolOutput(Ftp.java:145)
> 	at org.apache.nutch.fetcher.FetcherThread.run(FetcherThread.java:340)
> Caused by: java.lang.NullPointerException
> }}
> Please mind the URL was not configured from me. Instead it was obtained by crawling my NAS. Also the URL looks perfectly fine to me. Even if the file did not exist I would not expect a MalformedURLException to occur. Even more, using Firefox and the same authentication data on the same URL retrieves the file successfully.
> How come Nutch cannot get the file?



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


Mime
View raw message