any23-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From hansbre...@apache.org
Subject [1/6] any23 git commit: ANY23-356 Removed nekohtml dependency
Date Thu, 05 Jul 2018 21:25:38 GMT
Repository: any23
Updated Branches:
  refs/heads/master 99f3f0ad9 -> eb5bd0939


ANY23-356 Removed nekohtml dependency


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/3c8ee56f
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/3c8ee56f
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/3c8ee56f

Branch: refs/heads/master
Commit: 3c8ee56f55c97833d1f6a7a78c78ec15f8b5afd3
Parents: 1d5e0ec
Author: Hans <firedrake93@gmail.com>
Authored: Mon Jul 2 15:26:15 2018 -0500
Committer: Hans <firedrake93@gmail.com>
Committed: Mon Jul 2 18:24:38 2018 -0500

----------------------------------------------------------------------
 .../resources/default-configuration.properties  |   4 -
 cli/pom.xml                                     |   4 -
 core/pom.xml                                    |  10 +-
 .../any23/extractor/html/TagSoupParser.java     | 115 +------------------
 .../html/TagSoupParsingConfiguration.java       |  12 +-
 plugins/html-scraper/pom.xml                    |  10 +-
 pom.xml                                         |   6 +-
 7 files changed, 13 insertions(+), 148 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/api/src/main/resources/default-configuration.properties
----------------------------------------------------------------------
diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties
index a8ca0c2..4f68586 100644
--- a/api/src/main/resources/default-configuration.properties
+++ b/api/src/main/resources/default-configuration.properties
@@ -76,7 +76,3 @@ any23.extraction.csv.comment=#
 # A confidence threshold for the OpenIE extractions
 # Any extractions below this value will not be processed.
 any23.extraction.openie.confidence.threshold=0.5
-
-# Use legacy setting to parse html
-# with NekoHTML instead of Jsoup
-any23.tagsoup.legacy=off

http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/cli/pom.xml
----------------------------------------------------------------------
diff --git a/cli/pom.xml b/cli/pom.xml
index 0cae013..0f04c62 100644
--- a/cli/pom.xml
+++ b/cli/pom.xml
@@ -110,10 +110,6 @@
       <artifactId>commons-codec</artifactId>
     </dependency>
     <dependency>
-      <groupId>net.sourceforge.nekohtml</groupId>
-      <artifactId>nekohtml</artifactId>
-    </dependency>
-    <dependency>
       <groupId>com.beust</groupId>
       <artifactId>jcommander</artifactId>
     </dependency>

http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/core/pom.xml
----------------------------------------------------------------------
diff --git a/core/pom.xml b/core/pom.xml
index e492fb6..377a5ee 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -232,11 +232,6 @@
       <artifactId>commons-lang</artifactId>
     </dependency>
 
-    <dependency>
-      <groupId>net.sourceforge.nekohtml</groupId>
-      <artifactId>nekohtml</artifactId>
-    </dependency>
-
     <dependency> <!-- used by Tika -->
       <groupId>org.jsoup</groupId>
       <artifactId>jsoup</artifactId>
@@ -283,6 +278,11 @@
     </dependency>
     <!-- END: POI -->
 
+    <dependency>
+      <groupId>xerces</groupId>
+      <artifactId>xercesImpl</artifactId>
+    </dependency>
+
     <!-- BEGIN: Test Dependencies -->
     <dependency>
       <groupId>junit</groupId>

http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
index d96a07b..4f54018 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
@@ -20,19 +20,10 @@ package org.apache.any23.extractor.html;
 import org.apache.any23.validator.DefaultValidator;
 import org.apache.any23.validator.Validator;
 import org.apache.any23.validator.ValidatorException;
-import org.apache.xerces.xni.Augmentations;
-import org.apache.xerces.xni.QName;
-import org.apache.xerces.xni.XMLAttributes;
-import org.apache.xerces.xni.XNIException;
-import org.cyberneko.html.parsers.DOMParser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
 
-import javax.xml.transform.TransformerException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URI;
@@ -42,13 +33,12 @@ import java.nio.charset.UnsupportedCharsetException;
 
 /**
  * <p>Parses an {@link java.io.InputStream}
- * into an <i>HTML DOM</i> tree using a <i>TagSoup</i> parser.
+ * into an <i>HTML DOM</i> tree.
  * </p>
  * <p><strong>Note:</strong> The resulting <i>DOM</i> tree will not be namespace
  * aware, and all element names will be upper case, while attributes
- * will be lower case. This is because the
- * <a href="http://nekohtml.sourceforge.net/">NekoHTML</a> based <i>TagSoup</i> parser
- * by default uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
+ * will be lower case. This is because the HTML parser
+ * uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
  * implementation, which doesn't support namespaces and forces uppercase element names. This works
  * with the <i>RDFa XSLT Converter</i> and with <i>XPath</i>, so we left it this way.</p>
  *
@@ -61,8 +51,6 @@ public class TagSoupParser {
 
     public static final String ELEMENT_LOCATION = "Element-Location";
 
-    private static final String AUGMENTATIONS_FEATURE = "http://cyberneko.org/html/features/augmentations";
-
     private final static Logger logger = LoggerFactory.getLogger(TagSoupParser.class);
 
     private final InputStream input;
@@ -139,103 +127,6 @@ public class TagSoupParser {
         return new DocumentReport( validator.validate(dIRI, document, applyFix), document );
     }
 
-
-    static TagSoupParsingConfiguration legacyConfig() {
-        return NekoHTML.instance;
-    }
-
-    private static class NekoHTML extends TagSoupParsingConfiguration {
-
-        private static final NekoHTML instance = new NekoHTML();
-
-        @Override
-        Document parse(InputStream input, String documentIRI, String encoding) throws IOException
{
-            try {
-                return parse(input, encoding);
-            } catch (SAXException ex) {
-                // should not happen, it's a tag soup parser
-                throw new RuntimeException("Should not happen, it's a tag soup parser", ex);
-            } catch (TransformerException ex) {
-                // should not happen, it's a tag soup parser
-                throw new RuntimeException("Should not happen, it's a tag soup parser", ex);
-            } catch (NullPointerException ex) {
-                if (ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) {
-                    throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!",
ex);
-                } else {
-                    throw ex;
-                }
-            }
-        }
-
-        private Document parse(InputStream input, String encoding) throws IOException, SAXException,
TransformerException {
-            final DOMParser parser = new DOMParser() {
-
-                private QName currentQName;
-                private Augmentations currentAugmentations;
-
-                @Override
-                protected Element createElementNode(QName qName) {
-                    final Element created = super.createElementNode(qName);
-                    if (qName.equals(currentQName) && currentAugmentations != null)
{
-                        final ElementLocation elementLocation = createElementLocation(
-                                currentAugmentations.getItem(AUGMENTATIONS_FEATURE)
-                        );
-                        created.setUserData(ELEMENT_LOCATION, elementLocation, null);
-                    }
-                    return created;
-                }
-
-                @Override
-                public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations
augmentations)
-                        throws XNIException {
-                    super.startElement(qName, xmlAttributes, augmentations);
-                    currentQName = qName;
-                    currentAugmentations = augmentations;
-                }
-
-                private ElementLocation createElementLocation(Object obj) {
-                    if(obj == null) return null;
-                    String pattern = null;
-                    try {
-                        pattern = obj.toString();
-                        if( "synthesized".equals(pattern) ) return null;
-                        final String[] parts = pattern.split(":");
-                        return new ElementLocation(
-                                Integer.parseInt(parts[0]),
-                                Integer.parseInt(parts[1]),
-                                Integer.parseInt(parts[3]),
-                                Integer.parseInt(parts[4])
-
-                        );
-                    } catch (Exception e) {
-                        logger.warn(
-                                String.format("Unexpected string format for given augmentation:
[%s]", pattern),
-                                e
-                        );
-                        return null;
-                    }
-                }
-            };
-            parser.setFeature("http://xml.org/sax/features/namespaces", false);
-            parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims",
true);
-            parser.setFeature(AUGMENTATIONS_FEATURE, true);
-            if (encoding != null)
-                parser.setProperty("http://cyberneko.org/html/properties/default-encoding",
encoding);
-
-            /*
-             * NOTE: the SpanCloserInputStream has been added to wrap the stream passed to
the CyberNeko
-             *       parser. This will ensure the correct handling of inline HTML SPAN tags.
-             *       This fix is documented at issue #78.
-             */
-            parser.parse(new InputSource( new SpanCloserInputStream(input)));
-            return parser.getDocument();
-        }
-
-
-    }
-
-
-
     /**
      * Describes a <i>DOM Element</i> location.
      */

http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
index 2aeaac1..018a333 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
@@ -17,7 +17,6 @@
 
 package org.apache.any23.extractor.html;
 
-import org.apache.any23.configuration.DefaultConfiguration;
 import org.jsoup.nodes.Attribute;
 import org.jsoup.select.NodeTraversor;
 import org.jsoup.select.NodeVisitor;
@@ -35,8 +34,6 @@ import java.io.InputStream;
  */
 abstract class TagSoupParsingConfiguration {
 
-    static final String LEGACY_PROPERTY = "any23.tagsoup.legacy";
-
     String name() {
         return getClass().getSimpleName();
     }
@@ -45,14 +42,7 @@ abstract class TagSoupParsingConfiguration {
 
 
     static TagSoupParsingConfiguration getDefault() {
-        return Default.instance;
-    }
-
-    private static class Default {
-
-        private static final TagSoupParsingConfiguration instance = DefaultConfiguration.singleton()
-                .getFlagProperty(LEGACY_PROPERTY) ? TagSoupParser.legacyConfig() : JsoupConfig.instance;
-
+        return JsoupConfig.instance;
     }
 
 

http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/plugins/html-scraper/pom.xml
----------------------------------------------------------------------
diff --git a/plugins/html-scraper/pom.xml b/plugins/html-scraper/pom.xml
index e24f6b6..b651d73 100644
--- a/plugins/html-scraper/pom.xml
+++ b/plugins/html-scraper/pom.xml
@@ -51,19 +51,11 @@
     <dependency>
       <groupId>net.sourceforge.nekohtml</groupId>
       <artifactId>nekohtml</artifactId>
-      <scope>provided</scope>
+      <version>1.9.22</version>
     </dependency>
     <dependency>
       <groupId>xerces</groupId>
       <artifactId>xercesImpl</artifactId>
-      <version>2.12.0</version>
-      <scope>provided</scope>
-      <exclusions>
-        <exclusion>
-          <groupId>xml-apis</groupId>
-          <artifactId>xml-apis</artifactId>
-        </exclusion>
-      </exclusions>
     </dependency>
     <dependency>
       <groupId>de.l3s.boilerpipe</groupId>

http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 8d3d408..1e57b2c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -382,9 +382,9 @@
         <version>1.17</version>
       </dependency>
       <dependency>
-        <groupId>net.sourceforge.nekohtml</groupId>
-        <artifactId>nekohtml</artifactId>
-        <version>1.9.20</version>
+        <groupId>xerces</groupId>
+        <artifactId>xercesImpl</artifactId>
+        <version>2.12.0</version>
       </dependency>
       <dependency>
         <groupId>org.jsoup</groupId>


Mime
View raw message