nutch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sna...@apache.org
Subject [nutch] branch master updated: NUTCH-2682 Upgrade to Tika 1.20 - upgrade Tika dependencies to version 1.20 - plugin parse-tika: add exclusions of transitive dependencies already provided as Nutch core dependencies - upgrade Nutch core dependencies to match versions required by Tika 1.20 - apply code formatting template to TikaParser class and replace deprecated method calls
Date Mon, 21 Jan 2019 15:36:29 GMT
This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 784aa5f  NUTCH-2682 Upgrade to Tika 1.20 - upgrade Tika dependencies to version
1.20 - plugin parse-tika: add exclusions of transitive dependencies   already provided as
Nutch core dependencies - upgrade Nutch core dependencies to match versions required   by
Tika 1.20 - apply code formatting template to TikaParser class and replace   deprecated method
calls
     new 6934d52  Merge pull request #424 from sebastian-nagel/NUTCH-2682-upgrade-tika
784aa5f is described below

commit 784aa5f8a5210cdd129a583c1dccdffaad5f9807
Author: Sebastian Nagel <snagel@apache.org>
AuthorDate: Fri Jan 4 17:40:17 2019 +0100

    NUTCH-2682 Upgrade to Tika 1.20
    - upgrade Tika dependencies to version 1.20
    - plugin parse-tika: add exclusions of transitive dependencies
      already provided as Nutch core dependencies
    - upgrade Nutch core dependencies to match versions required
      by Tika 1.20
    - apply code formatting template to TikaParser class and replace
      deprecated method calls
---
 ivy/ivy.xml                                        | 26 +++----
 src/plugin/parse-tika/howto_upgrade_tika.txt       | 19 ++++++
 src/plugin/parse-tika/ivy.xml                      | 16 ++++-
 src/plugin/parse-tika/plugin.xml                   | 79 +++++++++++-----------
 .../org/apache/nutch/parse/tika/TikaParser.java    | 78 +++++++++++----------
 5 files changed, 129 insertions(+), 89 deletions(-)

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index f1e4a80..52826bb 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -43,11 +43,11 @@
 			<exclude org="com.sun.jmx" name="jmxri" />
 		</dependency-->
 
-		<dependency org="org.apache.commons" name="commons-lang3" rev="3.7" conf="*->default"
/>
-		<dependency org="org.apache.commons" name="commons-collections4" rev="4.1" conf="*->master"
/>
-		<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.5" conf="*->master"
/>
+		<dependency org="org.apache.commons" name="commons-lang3" rev="3.8.1" conf="*->default"
/>
+		<dependency org="org.apache.commons" name="commons-collections4" rev="4.2" conf="*->master"
/>
+		<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.6" conf="*->master"
/>
 		<dependency org="commons-codec" name="commons-codec" rev="1.11" conf="*->default"
/>
-		<dependency org="org.apache.commons" name="commons-compress" rev="1.16.1" conf="*->default"
/>
+		<dependency org="org.apache.commons" name="commons-compress" rev="1.18" conf="*->default"
/>
 		<dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
 		<dependency org="com.tdunning" name="t-digest" rev="3.2" />
 
@@ -65,7 +65,7 @@
 		<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.4"
conf="*->default"/>
 		<!-- End of Hadoop Dependencies -->
 
-		<dependency org="org.apache.tika" name="tika-core" rev="1.19.1" />
+		<dependency org="org.apache.tika" name="tika-core" rev="1.20" />
 		<dependency org="com.ibm.icu" name="icu4j" rev="61.1" />
 
 		<dependency org="xerces" name="xercesImpl" rev="2.11.0" />
@@ -78,14 +78,14 @@
 		<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0" />
 
 		<!--dependency org="org.apache.cxf" name="cxf" rev="3.0.4" conf="*->default"/-->
-		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.1.15" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.1.15" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.1.15" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.1.15" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.1.15" conf="test->default"/>
-		<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.5" conf="*->default"/>
-		<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.5"
conf="*->default"/>
-		<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.5"
conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.2.7" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.2.7" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.2.7" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.2.7" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.2.7" conf="test->default"/>
+		<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.7" conf="*->default"/>
+		<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.7"
conf="*->default"/>
+		<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.7"
conf="*->default"/>
 
 		<!-- WARC artifacts needed -->
 		<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default">
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt
index f8bbae1..fbf7207 100644
--- a/src/plugin/parse-tika/howto_upgrade_tika.txt
+++ b/src/plugin/parse-tika/howto_upgrade_tika.txt
@@ -15,4 +15,23 @@
       <!-- end of dependencies of Tika (tika-parsers) -->
    with the output of the command above.
 
+4. (Optionally) remove overlapping dependencies between parse-tika and Nutch core dependencies:
+   - check for libs present both in
+       build/lib
+     and
+       build/plugins/parse-tika/
+     (possibly with different versions)
+   - duplicated libs can be added to the exclusions of transitive dependencies in
+       src/plugin/parse-tika/ivy.xml
+   - but it should be made sure that the library versions in ivy/ivy.xml correspond to
+     those required by Tika
+
+5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/:
+
+    $ rm -rf lib/
+
+6. Build Nutch and run all unit tests:
+
+    $ cd ../../../
+    $ ant clean runtime test
 
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index 53c7775..df06f14 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -36,14 +36,24 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.tika" name="tika-parsers" rev="1.19.1" conf="*->default">
+    <dependency org="org.apache.tika" name="tika-parsers" rev="1.20" conf="*->default">
+      <!-- exclusions of dependencies in Nutch core (ivy/ivy.xml) -->
       <exclude org="org.apache.tika" name="tika-core" />
       <exclude org="org.apache.httpcomponents" name="httpclient" />
       <exclude org="org.apache.httpcomponents" name="httpcore" />
-      <exclude org="org.slf4j" name="slf4j-log4j12" />
-      <exclude org="org.slf4j" name="slf4j-api" />
       <exclude org="commons-lang" name="commons-lang" />
+      <exclude org="org.apache.commons" name="commons-lang3" />
+      <exclude org="org.apache.commons" name="commons-codec" />
+      <exclude org="commons-codec" name="commons-codec" /><!-- older versions are
published with org=commons-codec -->
+      <exclude org="org.apache.commons" name="commons-collections4" />
+      <exclude org="org.apache.commons" name="commons-compress" />
+      <exclude org="org.apache.cxf" name="cxf-core" />
+      <exclude org="org.apache.cxf" name="cxf-rt-transports-http" />
+      <exclude org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" />
+      <exclude org="com.fasterxml.jackson.core" name="jackson-databind" />
       <exclude org="com.google.protobuf" name="protobuf-java" />
+      <exclude org="org.slf4j" name="slf4j-log4j12" />
+      <exclude org="org.slf4j" name="slf4j-api" />
     </dependency>
   </dependencies>
   
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
index 7dbe180..b89f41e 100644
--- a/src/plugin/parse-tika/plugin.xml
+++ b/src/plugin/parse-tika/plugin.xml
@@ -26,10 +26,9 @@
          <export name="*"/>
       </library>
       <!-- dependencies of Tika (tika-parsers) -->
-      <library name="activation-1.1.1.jar"/>
       <library name="apache-mime4j-core-0.8.2.jar"/>
       <library name="apache-mime4j-dom-0.8.2.jar"/>
-      <library name="asm-6.2.jar"/>
+      <library name="asm-7.0.jar"/>
       <library name="bcmail-jdk15on-1.60.jar"/>
       <library name="bcpkix-jdk15on-1.60.jar"/>
       <library name="bcprov-jdk15on-1.60.jar"/>
@@ -37,22 +36,22 @@
       <library name="bzip2-0.9.1.jar"/>
       <library name="c3p0-0.9.1.1.jar"/>
       <library name="cdm-4.5.5.jar"/>
-      <library name="commons-codec-1.11.jar"/>
       <library name="commons-collections4-4.2.jar"/>
       <library name="commons-compress-1.18.jar"/>
-      <library name="commons-csv-1.5.jar"/>
+      <library name="commons-csv-1.6.jar"/>
       <library name="commons-exec-1.3.jar"/>
       <library name="commons-io-2.6.jar"/>
-      <library name="commons-logging-1.2.jar"/>
-      <library name="curvesapi-1.04.jar"/>
-      <library name="cxf-core-3.2.6.jar"/>
-      <library name="cxf-rt-frontend-jaxrs-3.2.6.jar"/>
-      <library name="cxf-rt-rs-client-3.2.6.jar"/>
-      <library name="cxf-rt-transports-http-3.2.6.jar"/>
+      <library name="commons-lang3-3.8.1.jar"/>
+      <library name="commons-math3-3.6.1.jar"/>
+      <library name="curvesapi-1.05.jar"/>
+      <library name="cxf-core-3.2.7.jar"/>
+      <library name="cxf-rt-frontend-jaxrs-3.2.7.jar"/>
+      <library name="cxf-rt-rs-client-3.2.7.jar"/>
+      <library name="cxf-rt-transports-http-3.2.7.jar"/>
       <library name="dec-0.1.2.jar"/>
       <library name="ehcache-core-2.6.2.jar"/>
-      <library name="FastInfoset-1.2.13.jar"/>
-      <library name="fontbox-2.0.12.jar"/>
+      <library name="FastInfoset-1.2.15.jar"/>
+      <library name="fontbox-2.0.13.jar"/>
       <library name="geoapi-3.0.1.jar"/>
       <library name="grib-4.5.5.jar"/>
       <library name="gson-2.8.5.jar"/>
@@ -60,19 +59,19 @@
       <library name="httpmime-4.5.6.jar"/>
       <library name="httpservices-4.5.5.jar"/>
       <library name="isoparser-1.1.22.jar"/>
-      <library name="istack-commons-runtime-3.0.5.jar"/>
+      <library name="istack-commons-runtime-3.0.7.jar"/>
       <library name="jackcess-2.1.12.jar"/>
       <library name="jackcess-encrypt-2.1.4.jar"/>
-      <library name="jackson-annotations-2.9.6.jar"/>
-      <library name="jackson-core-2.9.6.jar"/>
-      <library name="jackson-databind-2.9.6.jar"/>
+      <library name="jackson-annotations-2.9.7.jar"/>
+      <library name="jackson-core-2.9.7.jar"/>
+      <library name="jackson-databind-2.9.7.jar"/>
       <library name="jai-imageio-core-1.4.0.jar"/>
       <library name="java-libpst-0.8.1.jar"/>
-      <library name="javax.annotation-api-1.3.jar"/>
-      <library name="javax.ws.rs-api-2.1.jar"/>
-      <library name="jaxb-api-2.3.0.jar"/>
-      <library name="jaxb-core-2.3.0.1.jar"/>
-      <library name="jaxb-runtime-2.3.0.1.jar"/>
+      <library name="javax.activation-1.2.0.jar"/>
+      <library name="javax.annotation-api-1.3.2.jar"/>
+      <library name="javax.ws.rs-api-2.1.1.jar"/>
+      <library name="jaxb-api-2.3.1.jar"/>
+      <library name="jaxb-runtime-2.3.1.jar"/>
       <library name="jbig2-imageio-3.0.2.jar"/>
       <library name="jcip-annotations-1.0.jar"/>
       <library name="jcl-over-slf4j-1.7.25.jar"/>
@@ -81,7 +80,7 @@
       <library name="jempbox-1.8.16.jar"/>
       <library name="jhighlight-1.0.3.jar"/>
       <library name="jmatio-1.5.jar"/>
-      <library name="jna-4.3.0.jar"/>
+      <library name="jna-5.1.0.jar"/>
       <library name="joda-time-2.2.jar"/>
       <library name="json-simple-1.1.1.jar"/>
       <library name="jsoup-1.11.3.jar"/>
@@ -92,16 +91,18 @@
       <library name="netcdf4-4.5.5.jar"/>
       <library name="openjson-1.0.10.jar"/>
       <library name="opennlp-tools-1.9.0.jar"/>
-      <library name="parso-2.0.9.jar"/>
-      <library name="pdfbox-2.0.12.jar"/>
-      <library name="pdfbox-tools-2.0.12.jar"/>
-      <library name="poi-4.0.0.jar"/>
-      <library name="poi-ooxml-4.0.0.jar"/>
-      <library name="poi-ooxml-schemas-4.0.0.jar"/>
-      <library name="poi-scratchpad-4.0.0.jar"/>
+      <library name="parso-2.0.10.jar"/>
+      <library name="pdfbox-2.0.13.jar"/>
+      <library name="pdfbox-tools-2.0.13.jar"/>
+      <library name="poi-4.0.1.jar"/>
+      <library name="poi-ooxml-4.0.1.jar"/>
+      <library name="poi-ooxml-schemas-4.0.1.jar"/>
+      <library name="poi-scratchpad-4.0.1.jar"/>
+      <library name="procyon-compilertools-0.5.32.jar"/>
+      <library name="procyon-core-0.5.32.jar"/>
       <library name="quartz-2.2.0.jar"/>
-      <library name="rome-1.5.1.jar"/>
-      <library name="rome-utils-1.5.1.jar"/>
+      <library name="rome-1.12.0.jar"/>
+      <library name="rome-utils-1.12.0.jar"/>
       <library name="sentiment-analysis-parser-0.1.jar"/>
       <library name="sis-feature-0.8.jar"/>
       <library name="sis-metadata-0.8.jar"/>
@@ -109,19 +110,19 @@
       <library name="sis-referencing-0.8.jar"/>
       <library name="sis-storage-0.8.jar"/>
       <library name="sis-utility-0.8.jar"/>
-      <library name="stax2-api-4.1.jar"/>
-      <library name="stax-ex-1.7.8.jar"/>
+      <library name="stax2-api-3.1.4.jar"/>
+      <library name="stax-ex-1.8.jar"/>
       <library name="tagsoup-1.2.1.jar"/>
-      <library name="tika-parsers-1.19.1.jar"/>
-      <library name="txw2-2.3.0.1.jar"/>
+      <library name="tika-parsers-1.20.jar"/>
+      <library name="txw2-2.3.1.jar"/>
       <library name="udunits-4.5.5.jar"/>
-      <library name="uimafit-core-2.2.0.jar"/>
-      <library name="uimaj-core-2.9.0.jar"/>
+      <library name="uimafit-core-2.4.0.jar"/>
+      <library name="uimaj-core-3.0.1.jar"/>
       <library name="unit-api-1.0.jar"/>
       <library name="vorbis-java-core-0.8.jar"/>
       <library name="vorbis-java-tika-0.8.jar"/>
-      <library name="woodstox-core-5.1.0.jar"/>
-      <library name="xmlbeans-3.0.1.jar"/>
+      <library name="woodstox-core-5.0.3.jar"/>
+      <library name="xmlbeans-3.0.2.jar"/>
       <library name="xmlschema-core-2.2.3.jar"/>
       <library name="xmpcore-5.1.3.jar"/>
       <library name="xz-1.8.jar"/>
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index e346940..7440333 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -42,6 +42,7 @@ import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.html.HtmlMapper;
@@ -70,6 +71,8 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
   private String cachingPolicy;
   private HtmlMapper HTMLMapper;
   private boolean upperCaseElementNames = true;
+  private String boilerpipeExtractorName;
+  private boolean useBoilerpipe;
 
   public ParseResult getParse(Content content) {
     HTMLDocumentImpl doc = new HTMLDocumentImpl();
@@ -83,59 +86,59 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
   ParseResult getParse(Content content, HTMLDocumentImpl doc,
       DocumentFragment root) {
     String mimeType = content.getContentType();
-    
-    boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
-    String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm",
"ArticleExtractor");
 
     URL base;
     try {
       base = new URL(content.getBaseUrl());
     } catch (MalformedURLException e) {
-      return new ParseStatus(e)
-          .getEmptyParseResult(content.getUrl(), getConf());
+      return new ParseStatus(e).getEmptyParseResult(content.getUrl(),
+          getConf());
     }
 
     // get the right parser using the mime type as a clue
-    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
-    byte[] raw = content.getContent();
-
+    CompositeParser compositeParser = (CompositeParser) tikaConfig.getParser();
+    Parser parser = compositeParser.getParsers().get(MediaType.parse(mimeType));
     if (parser == null) {
       String message = "Can't retrieve Tika parser for mime-type " + mimeType;
       LOG.error(message);
-      return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(
-          content.getUrl(), getConf());
+      return new ParseStatus(ParseStatus.FAILED, message)
+          .getEmptyParseResult(content.getUrl(), getConf());
     }
 
-    LOG.debug("Using Tika parser " + parser.getClass().getName()
-        + " for mime-type " + mimeType);
+    LOG.debug("Using Tika parser {} for mime-type {}.",
+        parser.getClass().getName(), mimeType);
 
+    byte[] raw = content.getContent();
     Metadata tikamd = new Metadata();
 
     ContentHandler domHandler;
-    
+
     // Check whether to use Tika's BoilerplateContentHandler
     if (useBoilerpipe) {
-      BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new
DOMBuilder(doc, root),
-      BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
+      BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(
+          (ContentHandler) new DOMBuilder(doc, root),
+          BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
       bpHandler.setIncludeMarkup(true);
-      domHandler = (ContentHandler)bpHandler;
+      domHandler = (ContentHandler) bpHandler;
     } else {
       DOMBuilder domBuilder = new DOMBuilder(doc, root);
       domBuilder.setUpperCaseElementNames(upperCaseElementNames);
       domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
-      domHandler = (ContentHandler)domBuilder;
+      domHandler = (ContentHandler) domBuilder;
     }
 
     LinkContentHandler linkContentHandler = new LinkContentHandler();
 
     ParseContext context = new ParseContext();
-    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
-    
+    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler,
+        linkContentHandler);
+
     if (HTMLMapper != null)
       context.set(HtmlMapper.class, HTMLMapper);
     tikamd.set(Metadata.CONTENT_TYPE, mimeType);
     try {
-      parser.parse(new ByteArrayInputStream(raw), (ContentHandler)teeContentHandler, tikamd,
context);
+      parser.parse(new ByteArrayInputStream(raw),
+          (ContentHandler) teeContentHandler, tikamd, context);
     } catch (Exception e) {
       LOG.error("Error parsing " + content.getUrl(), e);
       return new ParseStatus(ParseStatus.FAILED, e.getMessage())
@@ -186,16 +189,16 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
       if (LOG.isTraceEnabled()) {
         LOG.trace("Getting links (base URL = {}) ...", baseTag);
       }
-      
+
       // pre-1233 outlink extraction
-      //utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+      // utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
       // Get outlinks from Tika
       List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
       utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
       outlinks = l.toArray(new Outlink[l.size()]);
       if (LOG.isTraceEnabled()) {
-        LOG.trace("found " + outlinks.length + " outlinks in "
-            + content.getUrl());
+        LOG.trace(
+            "found " + outlinks.length + " outlinks in " + content.getUrl());
       }
     }
 
@@ -251,7 +254,8 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
         // see if a Tika config file can be found in the job file
         URL customTikaConfig = conf.getResource(customConfFile);
         if (customTikaConfig != null)
-          tikaConfig = new TikaConfig(customTikaConfig, this.getClass().getClassLoader());
+          tikaConfig = new TikaConfig(customTikaConfig,
+              this.getClass().getClassLoader());
       } catch (Exception e1) {
         String message = "Problem loading custom Tika configuration from "
             + customConfFile;
@@ -277,20 +281,26 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
           throw new RuntimeException("Class " + htmlmapperClassName
               + " does not implement HtmlMapper");
         }
-        HTMLMapper = (HtmlMapper) HTMLMapperClass.getConstructor().newInstance();
+        HTMLMapper = (HtmlMapper) HTMLMapperClass.getConstructor()
+            .newInstance();
       } catch (Exception e) {
-        LOG.error("Can't generate instance for class " + htmlmapperClassName);
-        throw new RuntimeException("Can't generate instance for class "
-            + htmlmapperClassName);
+        String message = "Can't generate instance for class "
+            + htmlmapperClassName;
+        LOG.error(message);
+        throw new RuntimeException(message);
       }
     }
 
-    this.htmlParseFilters = new HtmlParseFilters(getConf());
-    this.utils = new DOMContentUtils(conf);
-    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+    htmlParseFilters = new HtmlParseFilters(getConf());
+    utils = new DOMContentUtils(conf);
+    cachingPolicy = getConf().get("parser.caching.forbidden.policy",
         Nutch.CACHING_FORBIDDEN_CONTENT);
-    this.upperCaseElementNames = getConf().getBoolean(
-        "tika.uppercase.element.names", true);
+    upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names",
+        true);
+    useBoilerpipe = getConf().get("tika.extractor", "none")
+        .equals("boilerpipe");
+    boilerpipeExtractorName = getConf()
+        .get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
   }
 
   public Configuration getConf() {


Mime
View raw message