nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (NUTCH-2478) // is not a valid base URL
Date Sun, 17 Dec 2017 11:34:00 GMT

    [ https://issues.apache.org/jira/browse/NUTCH-2478?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16294101#comment-16294101
] 

ASF GitHub Bot commented on NUTCH-2478:
---------------------------------------

sebastian-nagel closed pull request #263: NUTCH-2478 parser resolve base url
URL: https://github.com/apache/nutch/pull/263
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/src/java/org/apache/nutch/util/DomUtil.java b/src/java/org/apache/nutch/util/DomUtil.java
index e93477a43..b4f0eac82 100644
--- a/src/java/org/apache/nutch/util/DomUtil.java
+++ b/src/java/org/apache/nutch/util/DomUtil.java
@@ -31,7 +31,9 @@
 import javax.xml.transform.stream.StreamResult;
 
 import org.apache.xerces.parsers.DOMParser;
+import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
@@ -103,4 +105,11 @@ public static void saveDom(OutputStream os, Element e) {
       LOG.error("Error: ", ex);
     }
   }
+
+  public static void saveDom(OutputStream os, DocumentFragment doc) {
+    NodeList docChildren = doc.getChildNodes();
+    for (int i = 0; i < docChildren.getLength(); i++) {
+      saveDom(os, (Element) docChildren.item(i));
+    }
+  }
 }
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 4527dd7b4..1f1061d39 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -254,7 +254,7 @@ public boolean getTitle(StringBuffer sb, Node node) {
   }
 
   /** If Node contains a BASE tag then it's HREF is returned. */
-  public URL getBase(Node node) {
+  public String getBase(Node node) {
 
     NodeWalker walker = new NodeWalker(node);
 
@@ -276,10 +276,7 @@ public URL getBase(Node node) {
           for (int i = 0; i < attrs.getLength(); i++) {
             Node attr = attrs.item(i);
             if ("href".equalsIgnoreCase(attr.getNodeName())) {
-              try {
-                return new URL(attr.getNodeValue());
-              } catch (MalformedURLException e) {
-              }
+              return attr.getNodeValue();
             }
           }
         }
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index 7f60939ae..9ed9fa4ee 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -207,11 +207,19 @@ public ParseResult getParse(Content content) {
 
     if (!metaTags.getNoFollow()) { // okay to follow links
       ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
-      URL baseTag = utils.getBase(root);
+      URL baseTag = base;
+      String baseTagHref = utils.getBase(root);
+      if (baseTagHref != null) {
+        try {
+          baseTag = new URL(base, baseTagHref);
+        } catch (MalformedURLException e) {
+          baseTag = base;
+        }
+      }
       if (LOG.isTraceEnabled()) {
         LOG.trace("Getting links...");
       }
-      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+      utils.getOutlinks(baseTag, l, root);
       outlinks = l.toArray(new Outlink[l.size()]);
       if (LOG.isTraceEnabled()) {
         LOG.trace("found " + outlinks.length + " outlinks in "
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
index 0b3920667..a4c820674 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -19,10 +19,12 @@
 
 import java.lang.invoke.MethodHandles;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.html.HtmlParser;
+import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
@@ -78,17 +80,26 @@
       { "HTML5, utf-16, BOM", "utf-16",
           "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
 
+  private static final String resolveBaseUrlTestContent = //
+      "<html>\n<head>\n" + //
+      "  <title>Test Resolve Base URLs (NUTCH-2478)</title>\n" + //
+      "  <base href=\"//www.example.com/\">\n" + //
+      "</head>\n<body>\n" + //
+      "  <a href=\"index.html\">outlink</a>\n" + //
+      "</body>\n</html>";
+
   private Configuration conf;
   private Parser parser;
 
   public TestHtmlParser() {
     conf = NutchConfiguration.create();
+    conf.set("plugin.includes", "parse-html");
     parser = new HtmlParser();
     parser.setConf(conf);
   }
 
   protected Parse parse(byte[] contentBytes) {
-    String dummyUrl = "http://dummy.url/";
+    String dummyUrl = "http://example.com/";
     return parser.getParse(
         new Content(dummyUrl, dummyUrl, contentBytes, "text/html",
             new Metadata(), conf)).get(dummyUrl);
@@ -120,4 +131,17 @@ public void testEncodingDetection() {
     }
   }
 
+  @Test
+  public void testResolveBaseUrl() {
+    byte[] contentBytes = resolveBaseUrlTestContent
+        .getBytes(StandardCharsets.UTF_8);
+    // parse using http://example.com/ as "fetch" URL
+    Parse parse = parse(contentBytes);
+    LOG.info(parse.getData().toString());
+    Outlink[] outlinks = parse.getData().getOutlinks();
+    Assert.assertEquals(1, outlinks.length);
+    Assert.assertEquals("http://www.example.com/index.html",
+        outlinks[0].getToUrl());
+  }
+
 }
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
index af85480bc..d40958912 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -259,7 +259,7 @@ public boolean getTitle(StringBuffer sb, Node node) {
   }
 
   /** If Node contains a BASE tag then it's HREF is returned. */
-  URL getBase(Node node) {
+  public String getBase(Node node) {
 
     NodeWalker walker = new NodeWalker(node);
 
@@ -281,10 +281,7 @@ URL getBase(Node node) {
           for (int i = 0; i < attrs.getLength(); i++) {
             Node attr = attrs.item(i);
             if ("href".equalsIgnoreCase(attr.getNodeName())) {
-              try {
-                return new URL(attr.getNodeValue());
-              } catch (MalformedURLException e) {
-              }
+              return attr.getNodeValue();
             }
           }
         }
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index 73cd083bc..ea864bec2 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -52,6 +52,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
 import org.xml.sax.ContentHandler;
 
 /**
@@ -170,16 +171,24 @@ public ParseResult getParse(Content content) {
 
     if (!metaTags.getNoFollow()) { // okay to follow links
       ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
-      URL baseTag = utils.getBase(root);
+      URL baseTag = base;
+      String baseTagHref = tikamd.get("Content-Location");
+      if (baseTagHref != null) {
+        try {
+          baseTag = new URL(base, baseTagHref);
+        } catch (MalformedURLException e) {
+          LOG.trace("Invalid <base href=\"{}\">", baseTagHref);
+        }
+      }
       if (LOG.isTraceEnabled()) {
-        LOG.trace("Getting links...");
+        LOG.trace("Getting links (base URL = {}) ...", baseTag);
       }
       
       // pre-1233 outlink extraction
       //utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
       // Get outlinks from Tika
       List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
-      utils.getOutlinks(baseTag != null ? baseTag : base, l, tikaExtractedOutlinks);
+      utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
       outlinks = l.toArray(new Outlink[l.size()]);
       if (LOG.isTraceEnabled()) {
         LOG.trace("found " + outlinks.length + " outlinks in "
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java
new file mode 100644
index 000000000..d2bc816e1
--- /dev/null
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java
@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import java.lang.invoke.MethodHandles;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.tika.TikaParser;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestHtmlParser {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private static final String encodingTestKeywords = "français, español, русский язык, čeština, ελληνικά";
+  private static final String encodingTestBody = "<ul>\n  <li>français\n  <li>español\n  <li>русский язык\n  <li>čeština\n  <li>ελληνικά\n</ul>";
+  private static final String encodingTestContent = "<title>"
+      + encodingTestKeywords + "</title>\n"
+      + "<meta name=\"keywords\" content=\"" + encodingTestKeywords + "\" />\n"
+      + "</head>\n<body>" + encodingTestBody + "</body>\n</html>";
+
+  private static String[][] encodingTestPages = {
+      {
+          "HTML4, utf-8, meta http-equiv, no quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=Content-Type content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML4, utf-8, meta http-equiv, single quotes",
+          "utf-8",
+          "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" "
+              + "\"http://www.w3.org/TR/html4/loose.dtd\">\n"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"
+              + encodingTestContent },
+      {
+          "XHTML, utf-8, meta http-equiv, double quotes",
+          "utf-8",
+          "<?xml version=\"1.0\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+              + "<html>\n<head>\n"
+              + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
+              + encodingTestContent },
+      {
+          "HTML5, utf-8, meta charset",
+          "utf-8",
+          "<!DOCTYPE html>\n<html>\n<head>\n" + "<meta charset=\"utf-8\">"
+              + encodingTestContent },
+      { "HTML5, utf-8, BOM", "utf-8",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent },
+      { "HTML5, utf-16, BOM", "utf-16",
+          "\ufeff<!DOCTYPE html>\n<html>\n<head>\n" + encodingTestContent } };
+
+  private static final String resolveBaseUrlTestContent = //
+      "<html>\n<head>\n" + //
+      "  <title>Test Resolve Base URLs (NUTCH-2478)</title>\n" + //
+      "  <base href=\"//www.example.com/\">\n" + //
+      "</head>\n<body>\n" + //
+      "  <a href=\"index.html\">outlink</a>\n" + //
+      "</body>\n</html>";
+
+  private Configuration conf;
+  private Parser parser;
+
+  public TestHtmlParser() {
+    conf = NutchConfiguration.create();
+    conf.set("plugin.includes", "parse-tika");
+    parser = new TikaParser();
+    parser.setConf(conf);
+  }
+
+  protected Parse parse(byte[] contentBytes) {
+    String dummyUrl = "http://example.com/";
+    return parser.getParse(
+        new Content(dummyUrl, dummyUrl, contentBytes, "text/html",
+            new Metadata(), conf)).get(dummyUrl);
+  }
+
+  @Test
+  public void testEncodingDetection() {
+    for (String[] testPage : encodingTestPages) {
+      String name = testPage[0];
+      Charset charset = Charset.forName(testPage[1]);
+      byte[] contentBytes = testPage[2].getBytes(charset);
+      Parse parse = parse(contentBytes);
+      String text = parse.getText();
+      String title = parse.getData().getTitle();
+      String keywords = parse.getData().getMeta("keywords");
+      LOG.info(name);
+      LOG.info("title:\t" + title);
+      LOG.info("keywords:\t" + keywords);
+      LOG.info("text:\t" + text);
+      Assert.assertEquals("Title not extracted properly (" + name + ")",
+          encodingTestKeywords, title);
+      for (String keyword : encodingTestKeywords.split(",\\s*")) {
+        Assert.assertTrue(keyword + " not found in text (" + name + ")",
+            text.contains(keyword));
+      }
+      Assert.assertNotNull("No keywords extracted", keywords);
+      Assert.assertEquals("Keywords not extracted properly (" + name + ")",
+          encodingTestKeywords, keywords);
+    }
+  }
+
+  @Test
+  public void testResolveBaseUrl() {
+    byte[] contentBytes = resolveBaseUrlTestContent
+        .getBytes(StandardCharsets.UTF_8);
+    // parse using http://example.com/ as "fetch" URL
+    Parse parse = parse(contentBytes);
+    LOG.info(parse.getData().toString());
+    Outlink[] outlinks = parse.getData().getOutlinks();
+    Assert.assertEquals(1, outlinks.length);
+    Assert.assertEquals("http://www.example.com/index.html",
+        outlinks[0].getToUrl());
+  }
+
+}


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> // is not a valid base URL
> --------------------------
>
>                 Key: NUTCH-2478
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2478
>             Project: Nutch
>          Issue Type: Bug
>    Affects Versions: 1.13
>            Reporter: Markus Jelsma
>            Assignee: Markus Jelsma
>             Fix For: 1.14
>
>
> This test fails:
> {code}
>   @Test
>   public void testBadResolver() throws Exception {
>     URL base = new URL("//www.example.org/");
>     String target = "index/produkt/kanaly/";
>     
>     URL abs = URLUtil.resolveURL(base, target);
>     Assert.assertEquals("http://www.example.org/index/produkt/kanaly/", abs.toString());
>   }
> {code}
> and has to fail because of the invalid base URL, so the current URL is used. If the current
> URL is not /, its path will be prepended, resulting in 404s being crawled.
> This ticket must allow // as base, and resolve the protocol.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Mime
View raw message