Repository: any23
Updated Branches:
refs/heads/master 99f3f0ad9 -> eb5bd0939
ANY23-356 Removed nekohtml dependency
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/3c8ee56f
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/3c8ee56f
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/3c8ee56f
Branch: refs/heads/master
Commit: 3c8ee56f55c97833d1f6a7a78c78ec15f8b5afd3
Parents: 1d5e0ec
Author: Hans <firedrake93@gmail.com>
Authored: Mon Jul 2 15:26:15 2018 -0500
Committer: Hans <firedrake93@gmail.com>
Committed: Mon Jul 2 18:24:38 2018 -0500
----------------------------------------------------------------------
.../resources/default-configuration.properties | 4 -
cli/pom.xml | 4 -
core/pom.xml | 10 +-
.../any23/extractor/html/TagSoupParser.java | 115 +------------------
.../html/TagSoupParsingConfiguration.java | 12 +-
plugins/html-scraper/pom.xml | 10 +-
pom.xml | 6 +-
7 files changed, 13 insertions(+), 148 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/api/src/main/resources/default-configuration.properties
----------------------------------------------------------------------
diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties
index a8ca0c2..4f68586 100644
--- a/api/src/main/resources/default-configuration.properties
+++ b/api/src/main/resources/default-configuration.properties
@@ -76,7 +76,3 @@ any23.extraction.csv.comment=#
# A confidence threshold for the OpenIE extractions
# Any extractions below this value will not be processed.
any23.extraction.openie.confidence.threshold=0.5
-
-# Use legacy setting to parse html
-# with NekoHTML instead of Jsoup
-any23.tagsoup.legacy=off
http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/cli/pom.xml
----------------------------------------------------------------------
diff --git a/cli/pom.xml b/cli/pom.xml
index 0cae013..0f04c62 100644
--- a/cli/pom.xml
+++ b/cli/pom.xml
@@ -110,10 +110,6 @@
<artifactId>commons-codec</artifactId>
</dependency>
<dependency>
- <groupId>net.sourceforge.nekohtml</groupId>
- <artifactId>nekohtml</artifactId>
- </dependency>
- <dependency>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
</dependency>
http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/core/pom.xml
----------------------------------------------------------------------
diff --git a/core/pom.xml b/core/pom.xml
index e492fb6..377a5ee 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -232,11 +232,6 @@
<artifactId>commons-lang</artifactId>
</dependency>
- <dependency>
- <groupId>net.sourceforge.nekohtml</groupId>
- <artifactId>nekohtml</artifactId>
- </dependency>
-
<dependency> <!-- used by Tika -->
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
@@ -283,6 +278,11 @@
</dependency>
<!-- END: POI -->
+ <dependency>
+ <groupId>xerces</groupId>
+ <artifactId>xercesImpl</artifactId>
+ </dependency>
+
<!-- BEGIN: Test Dependencies -->
<dependency>
<groupId>junit</groupId>
http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
index d96a07b..4f54018 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
@@ -20,19 +20,10 @@ package org.apache.any23.extractor.html;
import org.apache.any23.validator.DefaultValidator;
import org.apache.any23.validator.Validator;
import org.apache.any23.validator.ValidatorException;
-import org.apache.xerces.xni.Augmentations;
-import org.apache.xerces.xni.QName;
-import org.apache.xerces.xni.XMLAttributes;
-import org.apache.xerces.xni.XNIException;
-import org.cyberneko.html.parsers.DOMParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-import javax.xml.transform.TransformerException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
@@ -42,13 +33,12 @@ import java.nio.charset.UnsupportedCharsetException;
/**
* <p>Parses an {@link java.io.InputStream}
- * into an <i>HTML DOM</i> tree using a <i>TagSoup</i> parser.
+ * into an <i>HTML DOM</i> tree.
* </p>
* <p><strong>Note:</strong> The resulting <i>DOM</i> tree will not be namespace
* aware, and all element names will be upper case, while attributes
- * will be lower case. This is because the
- * <a href="http://nekohtml.sourceforge.net/">NekoHTML</a> based <i>TagSoup</i> parser
- * by default uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
+ * will be lower case. This is because the HTML parser
+ * uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
* implementation, which doesn't support namespaces and forces uppercase element names. This works
* with the <i>RDFa XSLT Converter</i> and with <i>XPath</i>, so we left it this way.</p>
*
@@ -61,8 +51,6 @@ public class TagSoupParser {
public static final String ELEMENT_LOCATION = "Element-Location";
- private static final String AUGMENTATIONS_FEATURE = "http://cyberneko.org/html/features/augmentations";
-
private final static Logger logger = LoggerFactory.getLogger(TagSoupParser.class);
private final InputStream input;
@@ -139,103 +127,6 @@ public class TagSoupParser {
return new DocumentReport( validator.validate(dIRI, document, applyFix), document
);
}
-
- static TagSoupParsingConfiguration legacyConfig() {
- return NekoHTML.instance;
- }
-
- private static class NekoHTML extends TagSoupParsingConfiguration {
-
- private static final NekoHTML instance = new NekoHTML();
-
- @Override
- Document parse(InputStream input, String documentIRI, String encoding) throws IOException
{
- try {
- return parse(input, encoding);
- } catch (SAXException ex) {
- // should not happen, it's a tag soup parser
- throw new RuntimeException("Should not happen, it's a tag soup parser", ex);
- } catch (TransformerException ex) {
- // should not happen, it's a tag soup parser
- throw new RuntimeException("Should not happen, it's a tag soup parser", ex);
- } catch (NullPointerException ex) {
- if (ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) {
- throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!",
ex);
- } else {
- throw ex;
- }
- }
- }
-
- private Document parse(InputStream input, String encoding) throws IOException, SAXException,
TransformerException {
- final DOMParser parser = new DOMParser() {
-
- private QName currentQName;
- private Augmentations currentAugmentations;
-
- @Override
- protected Element createElementNode(QName qName) {
- final Element created = super.createElementNode(qName);
- if (qName.equals(currentQName) && currentAugmentations != null)
{
- final ElementLocation elementLocation = createElementLocation(
- currentAugmentations.getItem(AUGMENTATIONS_FEATURE)
- );
- created.setUserData(ELEMENT_LOCATION, elementLocation, null);
- }
- return created;
- }
-
- @Override
- public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations
augmentations)
- throws XNIException {
- super.startElement(qName, xmlAttributes, augmentations);
- currentQName = qName;
- currentAugmentations = augmentations;
- }
-
- private ElementLocation createElementLocation(Object obj) {
- if(obj == null) return null;
- String pattern = null;
- try {
- pattern = obj.toString();
- if( "synthesized".equals(pattern) ) return null;
- final String[] parts = pattern.split(":");
- return new ElementLocation(
- Integer.parseInt(parts[0]),
- Integer.parseInt(parts[1]),
- Integer.parseInt(parts[3]),
- Integer.parseInt(parts[4])
-
- );
- } catch (Exception e) {
- logger.warn(
- String.format("Unexpected string format for given augmentation:
[%s]", pattern),
- e
- );
- return null;
- }
- }
- };
- parser.setFeature("http://xml.org/sax/features/namespaces", false);
- parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims",
true);
- parser.setFeature(AUGMENTATIONS_FEATURE, true);
- if (encoding != null)
- parser.setProperty("http://cyberneko.org/html/properties/default-encoding",
encoding);
-
- /*
- * NOTE: the SpanCloserInputStream has been added to wrap the stream passed to
the CyberNeko
- * parser. This will ensure the correct handling of inline HTML SPAN tags.
- * This fix is documented at issue #78.
- */
- parser.parse(new InputSource( new SpanCloserInputStream(input)));
- return parser.getDocument();
- }
-
-
- }
-
-
-
/**
* Describes a <i>DOM Element</i> location.
*/
http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
index 2aeaac1..018a333 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
@@ -17,7 +17,6 @@
package org.apache.any23.extractor.html;
-import org.apache.any23.configuration.DefaultConfiguration;
import org.jsoup.nodes.Attribute;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
@@ -35,8 +34,6 @@ import java.io.InputStream;
*/
abstract class TagSoupParsingConfiguration {
- static final String LEGACY_PROPERTY = "any23.tagsoup.legacy";
-
String name() {
return getClass().getSimpleName();
}
@@ -45,14 +42,7 @@ abstract class TagSoupParsingConfiguration {
static TagSoupParsingConfiguration getDefault() {
- return Default.instance;
- }
-
- private static class Default {
-
- private static final TagSoupParsingConfiguration instance = DefaultConfiguration.singleton()
- .getFlagProperty(LEGACY_PROPERTY) ? TagSoupParser.legacyConfig() : JsoupConfig.instance;
-
+ return JsoupConfig.instance;
}
http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/plugins/html-scraper/pom.xml
----------------------------------------------------------------------
diff --git a/plugins/html-scraper/pom.xml b/plugins/html-scraper/pom.xml
index e24f6b6..b651d73 100644
--- a/plugins/html-scraper/pom.xml
+++ b/plugins/html-scraper/pom.xml
@@ -51,19 +51,11 @@
<dependency>
<groupId>net.sourceforge.nekohtml</groupId>
<artifactId>nekohtml</artifactId>
- <scope>provided</scope>
+ <version>1.9.22</version>
</dependency>
<dependency>
<groupId>xerces</groupId>
<artifactId>xercesImpl</artifactId>
- <version>2.12.0</version>
- <scope>provided</scope>
- <exclusions>
- <exclusion>
- <groupId>xml-apis</groupId>
- <artifactId>xml-apis</artifactId>
- </exclusion>
- </exclusions>
</dependency>
<dependency>
<groupId>de.l3s.boilerpipe</groupId>
http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 8d3d408..1e57b2c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -382,9 +382,9 @@
<version>1.17</version>
</dependency>
<dependency>
- <groupId>net.sourceforge.nekohtml</groupId>
- <artifactId>nekohtml</artifactId>
- <version>1.9.20</version>
+ <groupId>xerces</groupId>
+ <artifactId>xercesImpl</artifactId>
+ <version>2.12.0</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
|