nutch-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (NUTCH-2584) Upgrade parse-tika to use Tika 1.18
Date Sat, 02 Jun 2018 11:36:00 GMT

    [ https://issues.apache.org/jira/browse/NUTCH-2584?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16499015#comment-16499015 ]

ASF GitHub Bot commented on NUTCH-2584:
---------------------------------------

sebastian-nagel closed pull request #336: NUTCH-2584 Upgrade parse-tika to use Tika 1.18
URL: https://github.com/apache/nutch/pull/336
 
 
   

This is a PR merged from a forked repository (a foreign pull request).
Because GitHub hides the original diff of such a pull request once it is
merged, the diff is reproduced below for the sake of provenance:

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 2dbe58351..eb29c9ddb 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -34,8 +34,8 @@
 	</publications>
 	
 	<dependencies>
-		<dependency org="org.slf4j" name="slf4j-api" rev="1.6.1" conf="*->master" />
-		<dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1" conf="*->master" />
+		<dependency org="org.slf4j" name="slf4j-api" rev="1.7.25" conf="*->master" />
+		<dependency org="org.slf4j" name="slf4j-log4j12" rev="1.7.25" conf="*->master" />
 		
 		<!--dependency org="log4j" name="log4j" rev="1.2.15" conf="*->default">
 			<exclude org="javax.jms" name="jms" />
@@ -43,14 +43,14 @@
 			<exclude org="com.sun.jmx" name="jmxri" />
 		</dependency-->
 		
-		<dependency org="commons-lang" name="commons-lang" rev="2.6" conf="*->default" />
-		<dependency org="commons-collections" name="commons-collections" rev="3.2.1" conf="*->master" />
-		<dependency org="commons-httpclient" name="commons-httpclient" rev="3.1" conf="*->master" />
-		<dependency org="commons-codec" name="commons-codec" rev="1.10" conf="*->default" />
-		<dependency org="org.apache.commons" name="commons-compress" rev="1.14" conf="*->default" />
+		<dependency org="org.apache.commons" name="commons-lang3" rev="3.7" conf="*->default" />
+		<dependency org="org.apache.commons" name="commons-collections4" rev="4.1" conf="*->master" />
+		<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.5" conf="*->master" />
+		<dependency org="commons-codec" name="commons-codec" rev="1.11" conf="*->default" />
+		<dependency org="org.apache.commons" name="commons-compress" rev="1.16.1" conf="*->default" />
 		<dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
 		<dependency org="com.tdunning" name="t-digest" rev="3.2" />
-
+		
 		<!-- Hadoop Dependencies -->
 		<dependency org="org.apache.hadoop" name="hadoop-common" rev="2.7.4" conf="*->default">
 			<exclude org="hsqldb" name="hsqldb" />
@@ -65,14 +65,14 @@
 		<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.4" conf="*->default"/>
 		<!-- End of Hadoop Dependencies -->
 
-		<dependency org="org.apache.tika" name="tika-core" rev="1.17" />
-		<dependency org="com.ibm.icu" name="icu4j" rev="55.1" />
+		<dependency org="org.apache.tika" name="tika-core" rev="1.18" />
+		<dependency org="com.ibm.icu" name="icu4j" rev="61.1" />
 
 		<dependency org="xerces" name="xercesImpl" rev="2.11.0" />
 		<dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" />
 		<dependency org="oro" name="oro" rev="2.0.8" />
 
-		<dependency org="com.google.guava" name="guava" rev="18.0" />
+		<dependency org="com.google.guava" name="guava" rev="25.0-jre" />
 
 		<dependency org="com.github.crawler-commons" name="crawler-commons" rev="0.9">
 			<exclude org="org.apache.tika"/>
@@ -81,14 +81,14 @@
 		<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0" />
 		
         <!--dependency org="org.apache.cxf" name="cxf" rev="3.0.4" conf="*->default"/-->
-        <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.0.4" conf="*->default"/>
-        <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.0.4" conf="*->default"/>
-        <dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.0.4" conf="*->default"/>
-        <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.0.4"
conf="*->default"/>
-        <dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.0.4" conf="test->default"/>
-        <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.5.1"
 conf="*->default"/> 
-        <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor"
rev="2.5.1" conf="*->default"/>
-        <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider"
rev="2.5.1" conf="*->default"/>	
+        <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.1.15" conf="*->default"/>
+        <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.1.15" conf="*->default"/>
+        <dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.1.15" conf="*->default"/>
+        <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.1.15"
conf="*->default"/>
+        <dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.1.15" conf="test->default"/>
+        <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.5"
 conf="*->default"/> 
+        <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor"
rev="2.9.5" conf="*->default"/>
+        <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider"
rev="2.9.5" conf="*->default"/>	
         
 		<!-- WARC artifacts needed  -->
 		<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default">
@@ -115,33 +115,34 @@
 
 		<!-- web app dependencies -->
 
-    	<dependency org="org.apache.commons" name="commons-collections4" rev="4.0" conf="*->default"
/>
-    	<dependency org="org.springframework" name="spring-core" rev="4.0.4.RELEASE" conf="*->default"
/>
-    	<dependency org="org.springframework" name="spring-context" rev="4.0.4.RELEASE" conf="*->default"
/>
-    	<dependency org="org.springframework" name="spring-web" rev="4.0.4.RELEASE" conf="*->default"
/>
+    	<dependency org="org.apache.commons" name="commons-collections4" rev="4.1" conf="*->default"
/>
+    	<dependency org="org.springframework" name="spring-core" rev="4.0.9.RELEASE" conf="*->default"
/>
+    	<dependency org="org.springframework" name="spring-context" rev="4.0.9.RELEASE" conf="*->default"
/>
+    	<dependency org="org.springframework" name="spring-web" rev="4.0.9.RELEASE" conf="*->default"
/>
 
-    	<dependency org="com.sun.jersey" name="jersey-client" rev="1.8" conf="*->default"
/>
+    	<dependency org="com.sun.jersey" name="jersey-client" rev="1.19.4" conf="*->default"
/>
 	
-    	<dependency org="com.j256.ormlite" name="ormlite-jdbc" rev="4.48" conf="*->default"
/>
-    	<dependency org="com.h2database" name="h2" rev="1.4.180" conf="*->default" />
-    	<dependency org="org.eclipse.persistence" name="javax.persistence" rev="2.0.0" conf="*->default"
/>
+    	<dependency org="com.j256.ormlite" name="ormlite-jdbc" rev="5.1" conf="*->default"
/>
+    	<dependency org="com.h2database" name="h2" rev="1.4.197" conf="*->default" />
+    	<dependency org="org.eclipse.persistence" name="javax.persistence" rev="2.2.0" conf="*->default"
/>
 	
-    	<dependency org="org.apache.wicket" name="wicket-core" rev="6.16.0" conf="*->default"
/>
-    	<dependency org="org.apache.wicket" name="wicket-spring" rev="6.16.0" conf="*->default"
/>
+    	<dependency org="org.apache.wicket" name="wicket-core" rev="6.17.0" conf="*->default"
/>
+    	<dependency org="org.apache.wicket" name="wicket-spring" rev="6.17.0" conf="*->default"
/>
     	<dependency org="de.agilecoders.wicket" name="wicket-bootstrap-core" rev="0.9.2"
conf="*->default" />
 		<dependency org="de.agilecoders.wicket" name="wicket-bootstrap-extensions" rev="0.9.2"
conf="*->default">
 			<exclude org="org.json"/>
 		</dependency>
+
 		
 		<!-- RabbitMQ dependencies -->
-		<dependency org="com.rabbitmq" name="amqp-client" rev="3.6.5" conf="*->default" />
+		<dependency org="com.rabbitmq" name="amqp-client" rev="5.2.0" conf="*->default" />
 
 
 		<!--Added Because of Elasticsearch JEST client-->
 		<!--TODO refactor these to indexer-elastic-rest plugin somehow, currently doesn't resolve correctly-->
-		<dependency org="org.apache.httpcomponents" name="httpcore-nio" rev="4.4.4"/>
-		<dependency org="org.apache.httpcomponents" name="httpcore" rev="4.4.4"/>
-		<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.2"/>
+		<dependency org="org.apache.httpcomponents" name="httpcore-nio" rev="4.4.9"/>
+		<dependency org="org.apache.httpcomponents" name="httpcore" rev="4.4.9"/>
+		<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.5"/>
 
 		<!--global exclusion -->
 		<exclude module="jmxtools" />
diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml
index c759d5f8a..616f4c901 100755
--- a/src/plugin/build-plugin.xml
+++ b/src/plugin/build-plugin.xml
@@ -235,6 +235,10 @@
     <ivy:retrieve pattern="${build.test.lib}/[artifact]-[revision].[ext]" symlink="false" log="quiet"/>
   </target>
 
+  <target name="report" depends="resolve-test" description="--> generates a report of dependencies">
+    <ivy:report todir="${build.dir}"/>
+  </target>
+
   <!-- ================================================================== -->
   <!-- Clean.  Delete the build files, and their directories              -->
   <!-- ================================================================== -->
diff --git a/src/plugin/parse-tika/build.xml b/src/plugin/parse-tika/build.xml
index 4ecb3f878..bda9e893b 100644
--- a/src/plugin/parse-tika/build.xml
+++ b/src/plugin/parse-tika/build.xml
@@ -19,23 +19,10 @@
 
   <import file="../build-plugin.xml"/>
   
-  <!-- Build compilation dependencies -->
-  <target name="deps-jar">
-    <ant target="jar" inheritall="false" dir="../lib-nekohtml" />
-  </target>
-
-  <!-- Add compilation dependencies to classpath -->
-  <path id="plugin.deps">
-    <fileset dir="${nutch.root}/build">
-      <include name="**/lib-nekohtml/*.jar" />
-    </fileset>
-  </path>
-  
-    <!-- Deploy Unit test dependencies -->
+  <!-- Deploy Unit test dependencies -->
   <target name="deps-test">
     <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
     <ant target="deploy" inheritall="false" dir="../protocol-file"/>
-    <ant target="deploy" inheritall="false" dir="../lib-nekohtml" />
   </target>
 
   <!-- for junit test -->
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt
index 63a05a4bf..f8bbae110 100644
--- a/src/plugin/parse-tika/howto_upgrade_tika.txt
+++ b/src/plugin/parse-tika/howto_upgrade_tika.txt
@@ -1,8 +1,18 @@
-1. Upgrade Tika depencency in trunk/ivy/ivy.xml
+1. Upgrade Tika depencency (tika-core) in ivy/ivy.xml
 
 2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
 
 3. Upgrade Tika's own dependencies in src/plugin/parse-tika/plugin.xml
+
    To get the list of dependencies and their versions execute:
-   $ ant -f ./build-ivy.xml
-   $ ls lib | sed 's/^/      <library name="/g' | sed 's/$/"\/>/g'
+    $ cd src/plugin/parse-tika/
+    $ ant -f ./build-ivy.xml
+    $ ls lib | sed 's/^/      <library name="/g' | sed 's/$/"\/>/g'
+
+   In the plugin.xml replace all lines between
+      <!-- dependencies of Tika (tika-parsers) -->
+   and
+      <!-- end of dependencies of Tika (tika-parsers) -->
+   with the output of the command above.
+
+
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index 24ad25b4e..81e7a8038 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.tika" name="tika-parsers" rev="1.17" conf="*->default">
+    <dependency org="org.apache.tika" name="tika-parsers" rev="1.18" conf="*->default">
       <exclude org="org.apache.tika" name="tika-core" />
       <exclude org="org.apache.httpcomponents" name="httpclient" />
       <exclude org="org.apache.httpcomponents" name="httpcore" />
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
index b9055e415..398c0e423 100644
--- a/src/plugin/parse-tika/plugin.xml
+++ b/src/plugin/parse-tika/plugin.xml
@@ -25,6 +25,8 @@
       <library name="parse-tika.jar">
          <export name="*"/>
       </library>
+      <!-- dependencies of Tika (tika-parsers) -->
+      <library name="aopalliance-1.0.jar"/>
       <library name="apache-mime4j-core-0.8.1.jar"/>
       <library name="apache-mime4j-dom-0.8.1.jar"/>
       <library name="asm-5.0.4.jar"/>
@@ -35,53 +37,61 @@
       <library name="bzip2-0.9.1.jar"/>
       <library name="c3p0-0.9.1.1.jar"/>
       <library name="cdm-4.5.5.jar"/>
-      <library name="commons-codec-1.6.jar"/>
+      <library name="commons-codec-1.10.jar"/>
       <library name="commons-collections4-4.1.jar"/>
-      <library name="commons-compress-1.14.jar"/>
+      <library name="commons-compress-1.16.1.jar"/>
       <library name="commons-csv-1.0.jar"/>
       <library name="commons-exec-1.3.jar"/>
-      <library name="commons-io-2.5.jar"/>
+      <library name="commons-io-2.6.jar"/>
+      <library name="commons-logging-1.1.3.jar"/>
+      <library name="commons-logging-1.2.jar"/>
+      <library name="commons-logging-api-1.1.jar"/>
       <library name="curvesapi-1.04.jar"/>
       <library name="cxf-core-3.0.16.jar"/>
       <library name="cxf-rt-frontend-jaxrs-3.0.16.jar"/>
       <library name="cxf-rt-rs-client-3.0.16.jar"/>
       <library name="cxf-rt-transports-http-3.0.16.jar"/>
+      <library name="dec-0.1.2.jar"/>
       <library name="ehcache-core-2.6.2.jar"/>
-      <library name="fontbox-2.0.8.jar"/>
-      <library name="geoapi-3.0.0.jar"/>
+      <library name="fontbox-2.0.9.jar"/>
+      <library name="geoapi-3.0.1.jar"/>
       <library name="grib-4.5.5.jar"/>
       <library name="gson-2.8.1.jar"/>
       <library name="guava-17.0.jar"/>
       <library name="httpmime-4.5.4.jar"/>
       <library name="httpservices-4.5.5.jar"/>
       <library name="isoparser-1.1.18.jar"/>
-      <library name="jackcess-2.1.8.jar"/>
-      <library name="jackcess-encrypt-2.1.2.jar"/>
-      <library name="jackson-core-2.9.2.jar"/>
+      <library name="jackcess-2.1.10.jar"/>
+      <library name="jackcess-encrypt-2.1.4.jar"/>
+      <library name="jackson-annotations-2.9.5.jar"/>
+      <library name="jackson-core-2.9.5.jar"/>
+      <library name="jackson-databind-2.9.5.jar"/>
+      <library name="jai-imageio-core-1.3.1.jar"/>
       <library name="java-libpst-0.8.1.jar"/>
       <library name="javax.annotation-api-1.2.jar"/>
       <library name="javax.ws.rs-api-2.0.1.jar"/>
+      <library name="jbig2-imageio-3.0.0.jar"/>
       <library name="jcip-annotations-1.0.jar"/>
       <library name="jcl-over-slf4j-1.7.24.jar"/>
       <library name="jcommander-1.35.jar"/>
-      <library name="jdom2-2.0.4.jar"/>
+      <library name="jdom2-2.0.6.jar"/>
       <library name="jempbox-1.8.13.jar"/>
       <library name="jhighlight-1.0.2.jar"/>
       <library name="jmatio-1.2.jar"/>
       <library name="jna-4.1.0.jar"/>
       <library name="joda-time-2.2.jar"/>
-      <library name="json-1.8.jar"/>
       <library name="json-simple-1.1.1.jar"/>
-      <library name="jsoup-1.7.2.jar"/>
-      <library name="jsr-275-0.9.3.jar"/>
+      <library name="jsoup-1.11.2.jar"/>
       <library name="jul-to-slf4j-1.7.24.jar"/>
       <library name="juniversalchardet-1.0.3.jar"/>
       <library name="junrar-0.7.jar"/>
       <library name="metadata-extractor-2.10.1.jar"/>
       <library name="netcdf4-4.5.5.jar"/>
-      <library name="opennlp-tools-1.8.3.jar"/>
-      <library name="pdfbox-2.0.8.jar"/>
-      <library name="pdfbox-tools-2.0.8.jar"/>
+      <library name="objenesis-2.6.jar"/>
+      <library name="openjson-1.0.10.jar"/>
+      <library name="opennlp-tools-1.8.4.jar"/>
+      <library name="pdfbox-2.0.9.jar"/>
+      <library name="pdfbox-tools-2.0.9.jar"/>
       <library name="poi-3.17.jar"/>
       <library name="poi-ooxml-3.17.jar"/>
       <library name="poi-ooxml-schemas-3.17.jar"/>
@@ -90,27 +100,36 @@
       <library name="rome-1.5.1.jar"/>
       <library name="rome-utils-1.5.1.jar"/>
       <library name="sentiment-analysis-parser-0.1.jar"/>
-      <library name="sis-metadata-0.6.jar"/>
-      <library name="sis-netcdf-0.6.jar"/>
-      <library name="sis-referencing-0.6.jar"/>
-      <library name="sis-storage-0.6.jar"/>
-      <library name="sis-utility-0.6.jar"/>
+      <library name="sis-feature-0.8.jar"/>
+      <library name="sis-metadata-0.8.jar"/>
+      <library name="sis-netcdf-0.8.jar"/>
+      <library name="sis-referencing-0.8.jar"/>
+      <library name="sis-storage-0.8.jar"/>
+      <library name="sis-utility-0.8.jar"/>
+      <library name="spring-aop-3.2.16.RELEASE.jar"/>
+      <library name="spring-beans-3.2.16.RELEASE.jar"/>
+      <library name="spring-context-3.2.16.RELEASE.jar"/>
+      <library name="spring-core-3.2.16.RELEASE.jar"/>
+      <library name="spring-expression-3.2.16.RELEASE.jar"/>
       <library name="stax2-api-3.1.4.jar"/>
       <library name="tagsoup-1.2.1.jar"/>
-      <library name="tika-parsers-1.17.jar"/>
+      <library name="tika-parsers-1.18.jar"/>
       <library name="udunits-4.5.5.jar"/>
+      <library name="uimafit-core-2.2.0.jar"/>
+      <library name="uimaj-core-2.9.0.jar"/>
+      <library name="unit-api-1.0.jar"/>
       <library name="vorbis-java-core-0.8.jar"/>
       <library name="vorbis-java-tika-0.8.jar"/>
       <library name="woodstox-core-asl-4.4.1.jar"/>
       <library name="xmlbeans-2.6.0.jar"/>
       <library name="xmlschema-core-2.2.2.jar"/>
       <library name="xmpcore-5.1.3.jar"/>
-      <library name="xz-1.6.jar"/>
+      <library name="xz-1.8.jar"/>
+      <!-- end of dependencies of Tika (tika-parsers) -->
    </runtime>
 
    <requires>
       <import plugin="nutch-extensionpoints"/>
-      <import plugin="lib-nekohtml"/>
    </requires>
 
    <extension point="org.apache.nutch.parse.Parser"
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
index 8d5baec24..9c96bd5f6 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
@@ -17,6 +17,7 @@
 
 package org.apache.nutch.parse.tika;
 
+import java.net.MalformedURLException;
 import java.net.URL;
 
 import org.apache.nutch.parse.HTMLMetaTags;
@@ -113,6 +114,29 @@ private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
               }
 
             } // end if (name == robots)
+            // meta names added/transformed by Tika
+            else if (name.equals("pragma")) {
+              String content = contentNode.getNodeValue().toLowerCase();
+              if (content.contains("no-cache")) {
+                metaTags.setNoCache();
+              }
+            } else if (name.equals("refresh")) {
+              String content = contentNode.getNodeValue().toLowerCase();
+              setRefresh(metaTags, content, currURL);
+            } else if (name.equals("content-location")) {
+              String urlString = contentNode.getNodeValue();
+              URL url = null;
+              try {
+                if (currURL == null) {
+                  url = new URL(urlString);
+                } else {
+                  url = new URL(currURL, urlString);
+                }
+                metaTags.setBaseHref(url);
+              } catch (MalformedURLException e) {
+                // ignore, base-href not set
+              }
+            }
           }
         }
 
@@ -127,54 +151,7 @@ private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
               if (index >= 0)
                 metaTags.setNoCache();
             } else if ("refresh".equals(name)) {
-              int idx = content.indexOf(';');
-              String time = null;
-              if (idx == -1) { // just the refresh time
-                time = content;
-              } else
-                time = content.substring(0, idx);
-              try {
-                metaTags.setRefreshTime(Integer.parseInt(time));
-                // skip this if we couldn't parse the time
-                metaTags.setRefresh(true);
-              } catch (Exception e) {
-                ;
-              }
-              URL refreshUrl = null;
-              if (metaTags.getRefresh() && idx != -1) { // set the URL
-                idx = content.toLowerCase().indexOf("url=");
-                if (idx == -1) { // assume a mis-formatted entry with just the
-                                 // url
-                  idx = content.indexOf(';') + 1;
-                } else
-                  idx += 4;
-                if (idx != -1) {
-                  String url = content.substring(idx);
-                  try {
-                    refreshUrl = new URL(url);
-                  } catch (Exception e) {
-                    // XXX according to the spec, this has to be an absolute
-                    // XXX url. However, many websites use relative URLs and
-                    // XXX expect browsers to handle that.
-                    // XXX Unfortunately, in some cases this may create a
-                    // XXX infinitely recursive paths (a crawler trap)...
-                    // if (!url.startsWith("/")) url = "/" + url;
-                    try {
-                      refreshUrl = new URL(currURL, url);
-                    } catch (Exception e1) {
-                      refreshUrl = null;
-                    }
-                  }
-                }
-              }
-              if (metaTags.getRefresh()) {
-                if (refreshUrl == null) {
-                  // apparently only refresh time was present. set the URL
-                  // to the same URL.
-                  refreshUrl = currURL;
-                }
-                metaTags.setRefreshHref(refreshUrl);
-              }
+              setRefresh(metaTags, content, currURL);
             }
           }
         }
@@ -213,4 +190,56 @@ private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
     }
   }
 
+  private static void setRefresh(HTMLMetaTags metaTags, String content,
+      URL currURL) {
+    int idx = content.indexOf(';');
+    String time = null;
+    if (idx == -1) { // just the refresh time
+      time = content;
+    } else
+      time = content.substring(0, idx);
+    try {
+      metaTags.setRefreshTime(Integer.parseInt(time));
+      // skip this if we couldn't parse the time
+      metaTags.setRefresh(true);
+    } catch (Exception e) {
+      ;
+    }
+    URL refreshUrl = null;
+    if (metaTags.getRefresh() && idx != -1) { // set the URL
+      idx = content.toLowerCase().indexOf("url=");
+      if (idx == -1) { // assume a mis-formatted entry with just the
+                       // url
+        idx = content.indexOf(';') + 1;
+      } else
+        idx += 4;
+      if (idx != -1) {
+        String url = content.substring(idx);
+        try {
+          refreshUrl = new URL(url);
+        } catch (Exception e) {
+          // XXX according to the spec, this has to be an absolute
+          // XXX url. However, many websites use relative URLs and
+          // XXX expect browsers to handle that.
+          // XXX Unfortunately, in some cases this may create a
+          // XXX infinitely recursive paths (a crawler trap)...
+          // if (!url.startsWith("/")) url = "/" + url;
+          try {
+            refreshUrl = new URL(currURL, url);
+          } catch (Exception e1) {
+            refreshUrl = null;
+          }
+        }
+      }
+    }
+    if (metaTags.getRefresh()) {
+      if (refreshUrl == null) {
+        // apparently only refresh time was present. set the URL
+        // to the same URL.
+        refreshUrl = currURL;
+      }
+      metaTags.setRefreshHref(refreshUrl);
+    }
+  }
+
 }
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index 8c867d8be..40d82bcaf 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -57,8 +57,7 @@
 /**
  * Wrapper for Tika parsers. Mimics the HTMLParser but using the XHTML
  * representation returned by Tika as SAX events
- ***/
-
+ */
 public class TikaParser implements org.apache.nutch.parse.Parser {
 
   private static final Logger LOG = LoggerFactory
@@ -72,8 +71,17 @@
   private HtmlMapper HTMLMapper;
   private boolean upperCaseElementNames = true;
 
-  @SuppressWarnings("deprecation")
   public ParseResult getParse(Content content) {
+    HTMLDocumentImpl doc = new HTMLDocumentImpl();
+    doc.setErrorChecking(false);
+    DocumentFragment root = doc.createDocumentFragment();
+
+    return getParse(content, doc, root);
+  }
+
+  @SuppressWarnings("deprecation")
+  ParseResult getParse(Content content, HTMLDocumentImpl doc,
+      DocumentFragment root) {
     String mimeType = content.getContentType();
     
     boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
@@ -103,10 +111,6 @@ public ParseResult getParse(Content content) {
 
     Metadata tikamd = new Metadata();
 
-    HTMLDocumentImpl doc = new HTMLDocumentImpl();
-    doc.setErrorChecking(false);
-    DocumentFragment root = doc.createDocumentFragment();
-
     ContentHandler domHandler;
     
     // Check whether to use Tika's BoilerplateContentHandler
@@ -266,7 +270,7 @@ public void setConf(Configuration conf) {
     String htmlmapperClassName = conf.get("tika.htmlmapper.classname");
     if (StringUtils.isNotBlank(htmlmapperClassName)) {
       try {
-        Class HTMLMapperClass = Class.forName(htmlmapperClassName);
+        Class<?> HTMLMapperClass = Class.forName(htmlmapperClassName);
         boolean interfaceOK = HtmlMapper.class
             .isAssignableFrom(HTMLMapperClass);
         if (!interfaceOK) {
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
similarity index 89%
rename from src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
rename to src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
index 2159b9d5a..06828cf7e 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
@@ -15,25 +15,23 @@
  * limitations under the License.
  */
 
-package org.apache.nutch.tika;
+package org.apache.nutch.parse.tika;
 
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.tika.DOMContentUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-
-import java.io.ByteArrayInputStream;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.StringTokenizer;
 
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
+import org.w3c.dom.DocumentFragment;
 
 /**
  * Unit tests for DOMContentUtils.
@@ -55,7 +53,7 @@
           + "<a href=\"/\"> separate this " + "<a href=\"ok\"> from this"
           + "</a></a>" + "</body></html>"),
 
-      // this one relies on certain neko fixup behavior, possibly
+      // this one relies on certain  fixup behavior, possibly
       // distributing the anchors into the LI's-but not the other
       // anchors (outside of them, instead)! So you get a tree that
       // looks like:
@@ -112,7 +110,7 @@
           + "<a href=\"http://www.nutch.org\" rel=\"nofollow\"> ignore </a>"
           + "<a rel=\"nofollow\" href=\"http://www.nutch.org\"> ignore </a>"
           + "</body></html>"),
-      // test that POST form actions are skipped
+      // test that all form actions are skipped
       new String("<html><head></head><body>"
           + "<form method='POST' action='/search.jsp'><input type=text>"
           + "<input type=submit><p>test1</p></form>"
@@ -150,20 +148,17 @@
 
   private static URL[] testBaseHrefURLs = new URL[testPages.length];
 
-  private static final String[] answerText = {
-      "title body anchor",
-      "title body home bots",
-      "separate this from this",
-      "my title body home 1 2",
-      "my title",
-      "my title the bottom",
-      "my title Whitespace test whitespace test "
-          + "This is a whitespace test . Newlines should appear as space too. "
-          + "Tabs are spaces too. This is a break -> and the line after break . "
-          + "one two three space here space there no space "
-          + "one two two three three four put some text here and there. "
-          + "End this madness ! . . . .", "ignore ignore", "test1 test2",
-      "test1 test2", "title anchor1 anchor2 anchor3",
+  private static final String[] answerText = { "title body anchor",
+      "title body home bots", "separate this from this",
+      "my title body home 1 2", "my title", "my title the bottom",
+      "my title\n" + "Whitespace test\n" + "whitespace test\n"
+          + "This is a whitespace test. Newlines should appear as space too.\n"
+          + "Tabs are spaces too. This is a break -> and the line after break.\n"
+          + "one\n" + "two\n" + "three\n" + "space here\n" + "space there\n"
+          + "no space\n" + "one two\n" + "two three\n" + "three four\n"
+          + "put some text here and there. End this madness ! . . . .",
+      "ignore ignore", "test1 test2", "test1 test2",
+      "title anchor1 anchor2 anchor3",
       "title anchor1 anchor2 anchor3 anchor4 anchor5", "" };
 
   private static final String[] answerTitle = { "title", "title", "",
@@ -179,23 +174,26 @@
   @Before
   public void setup() throws Exception {
     conf = NutchConfiguration.create();
-    conf.setBoolean("parser.html.form.use_action", true);
     utils = new DOMContentUtils(conf);
-    DOMFragmentParser parser = new DOMFragmentParser();
-    parser.setFeature(
-        "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe",
-        true);
+    conf.set("plugin.includes", "parse-tika");
+    TikaParser parser = new TikaParser();
+    parser.setConf(conf);
+
     for (int i = 0; i < testPages.length; i++) {
-      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
       try {
-        parser.parse(
-            new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
-            node);
-        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
+        String url = testBaseHrefs[i];
+        testBaseHrefURLs[i] = new URL(url);
+        Content content = new Content(url, url,
+            testPages[i].getBytes(StandardCharsets.UTF_8), "text/html",
+            new Metadata(), conf);
+        HTMLDocumentImpl doc = new HTMLDocumentImpl();
+        doc.setErrorChecking(false);
+        DocumentFragment root = doc.createDocumentFragment();
+        parser.getParse(content, doc, root);
+        testDOMs[i] = root;
       } catch (Exception e) {
         Assert.assertTrue("caught exception: " + e, false);
       }
-      testDOMs[i] = node;
     }
     answerOutlinks = new Outlink[][] {
         { new Outlink("http://www.nutch.org", "anchor"), },
@@ -217,7 +215,7 @@ public void setup() throws Exception {
             new Outlink("http://www.nutch.org/docs/index.html", ""), },
         { new Outlink("http://www.nutch.org/index.html", "whitespace test"), },
         {},
-        { new Outlink("http://www.nutch.org/dummy.jsp", "test2"), },
+        {},
         {},
         { new Outlink("http://www.nutch.org/;x", "anchor1"),
             new Outlink("http://www.nutch.org/g;x", "anchor2"),
@@ -230,7 +228,7 @@ public void setup() throws Exception {
             new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
             new Outlink("http://www.nutch.org/;something?y=1;somethingelse",
                 "anchor5") },
-        { new Outlink("http://www.nutch.org/movie.mp4", "") } };
+        {} };
 
   }
 
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
similarity index 99%
rename from src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
rename to src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
index cdfe2b48c..2677395d8 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nutch.tika;
+package org.apache.nutch.parse.tika;
 
 import java.lang.invoke.MethodHandles;
 
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
similarity index 99%
rename from src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java
rename to src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
index d2bc816e1..f7d01f6fb 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestHtmlParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nutch.tika;
+package org.apache.nutch.parse.tika;
 
 import java.lang.invoke.MethodHandles;
 import java.nio.charset.Charset;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
similarity index 98%
rename from src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
rename to src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
index b1762e603..c688ee492 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestImageMetadata.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nutch.tika;
+package org.apache.nutch.parse.tika;
 
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.Protocol;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
similarity index 98%
rename from src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
rename to src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
index 576b3dfdd..a3c04ca00 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestMSWordParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nutch.tika;
+package org.apache.nutch.parse.tika;
 
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.Protocol;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
similarity index 98%
rename from src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
rename to src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
index 6960bad66..f9ad710c7 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestOOParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nutch.tika;
+package org.apache.nutch.parse.tika;
 
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
similarity index 98%
rename from src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
rename to src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
index 9884f0cf4..c7d7d0abc 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestPdfParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.nutch.tika;
+package org.apache.nutch.parse.tika;
 
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.Protocol;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
similarity index 98%
rename from src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
rename to src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
index f15d821fe..6585d98d6 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRTFParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.nutch.tika;
+package org.apache.nutch.parse.tika;
 
 // Nutch imports
 import org.apache.nutch.crawl.CrawlDatum;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRobotsMetaProcessor.java
similarity index 68%
rename from src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
rename to src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRobotsMetaProcessor.java
index 4224f934d..8a949a65f 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRobotsMetaProcessor.java
@@ -15,20 +15,20 @@
  * limitations under the License.
  */
 
-package org.apache.nutch.tika;
+package org.apache.nutch.parse.tika;
 
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.tika.HTMLMetaProcessor;
-
-import java.io.ByteArrayInputStream;
 import java.net.URL;
 
-import org.xml.sax.*;
-import org.w3c.dom.*;
-import org.apache.html.dom.*;
-import org.cyberneko.html.parsers.DOMFragmentParser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.html.dom.HTMLDocumentImpl;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Assert;
 import org.junit.Test;
+import org.w3c.dom.DocumentFragment;
 
 /** Unit tests for HTMLMetaProcessor. */
 public class TestRobotsMetaProcessor {
@@ -85,9 +85,13 @@
           + "<base href=\"http://www.nutch.org/base/\">" + "</head><body>"
           + " some text" + "</body></html>",
 
+      "<html><head><title>Meta-refresh redirect</title>"
+          + "<meta http-equiv=\"refresh\" content=\"0; url=http://example.com/\"></head><body> "
+          + "Test meta-refresh redirect." + "</body></html>",
   };
 
-  public static final boolean[][] answers = { { true, true, true }, // NONE
+  public static final boolean[][] answers = { //
+      { true, true, true }, // NONE
       { false, false, true }, // all
       { true, true, true }, // nOnE
       { true, true, false }, // none
@@ -96,14 +100,16 @@
       { false, true, false }, // index,nofollow
       { false, false, false }, // index,follow
       { false, false, false }, // missing!
+      { false, false, false }, // NUTCH-2589: test for meta-refresh redirects
   };
 
   private URL[][] currURLsAndAnswers;
 
   @Test
   public void testRobotsMetaProcessor() {
-    DOMFragmentParser parser = new DOMFragmentParser();
-    ;
+    Configuration conf = NutchConfiguration.create();
+    TikaParser parser = new TikaParser();
+    parser.setConf(conf);
 
     try {
       currURLsAndAnswers = new URL[][] {
@@ -117,7 +123,8 @@ public void testRobotsMetaProcessor() {
           { new URL("http://www.nutch.org/foo/"),
               new URL("http://www.nutch.org/") },
           { new URL("http://www.nutch.org"),
-              new URL("http://www.nutch.org/base/") } };
+              new URL("http://www.nutch.org/base/") },
+          { new URL("http://www.nutch.org"), null } };
     } catch (Exception e) {
       Assert.assertTrue("couldn't make test URLs!", false);
     }
@@ -125,23 +132,29 @@ public void testRobotsMetaProcessor() {
     for (int i = 0; i < tests.length; i++) {
       byte[] bytes = tests[i].getBytes();
 
-      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
+      HTMLDocumentImpl doc = new HTMLDocumentImpl();
+      doc.setErrorChecking(false);
+      DocumentFragment root = doc.createDocumentFragment();
+      String url = "http://www.nutch.org";
+      Content content = new Content(url,
+          url, bytes, "text/html", new Metadata(), conf);
+      Parse parse = null;
 
       try {
-        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
+        parse = parser.getParse(content, doc, root).get(url);
       } catch (Exception e) {
         e.printStackTrace();
       }
 
       HTMLMetaTags robotsMeta = new HTMLMetaTags();
-      HTMLMetaProcessor.getMetaTags(robotsMeta, node, currURLsAndAnswers[i][0]);
-
-      Assert.assertTrue("got index wrong on test " + i,
-          robotsMeta.getNoIndex() == answers[i][0]);
-      Assert.assertTrue("got follow wrong on test " + i,
-          robotsMeta.getNoFollow() == answers[i][1]);
-      Assert.assertTrue("got cache wrong on test " + i,
-          robotsMeta.getNoCache() == answers[i][2]);
+      HTMLMetaProcessor.getMetaTags(robotsMeta, root, currURLsAndAnswers[i][0]);
+
+      Assert.assertEquals("got noindex wrong on test " + i,
+          answers[i][0], robotsMeta.getNoIndex());
+      Assert.assertEquals("got nofollow wrong on test " + i,
+          answers[i][1], robotsMeta.getNoFollow());
+      Assert.assertEquals("got nocache wrong on test " + i,
+          answers[i][2], robotsMeta.getNoCache());
       Assert
           .assertTrue(
               "got base href wrong on test " + i + " (got "
@@ -150,6 +163,17 @@ public void testRobotsMetaProcessor() {
                   || ((robotsMeta.getBaseHref() != null) && robotsMeta
                       .getBaseHref().equals(currURLsAndAnswers[i][1])));
 
+      if (tests[i].contains("meta-refresh redirect")) {
+        // test for NUTCH-2589
+        URL metaRefreshUrl = robotsMeta.getRefreshHref();
+        Assert.assertNotNull("failed to get meta-refresh redirect",
+            metaRefreshUrl);
+        Assert.assertEquals("failed to get meta-refresh redirect",
+            "http://example.com/", metaRefreshUrl.toString());
+        Assert.assertEquals(
+            "failed to add meta-refresh redirect to parse status",
+            "http://example.com/", parse.getData().getStatus().getArgs()[0]);
+      }
     }
   }
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Upgrade parse-tika to use Tika 1.18
> -----------------------------------
>
>                 Key: NUTCH-2584
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2584
>             Project: Nutch
>          Issue Type: Improvement
>          Components: parser
>    Affects Versions: 1.14
>            Reporter: Sebastian Nagel
>            Priority: Minor
>             Fix For: 1.15
>
>
> Tika 1.18 is released and NUTCH-2583 includes an upgrade of tika-core.
> See [howto_upgrade_tika|https://github.com/apache/nutch/blob/master/src/plugin/parse-tika/howto_upgrade_tika.txt].




--
This message was sent by Atlassian JIRA
(v7.6.3#76005)


Mime
View raw message