Repository: any23
Updated Branches:
refs/heads/master d283d70ce -> 6173637bb
ANY23-376 fix IllegalArgumentException in microdata extractor
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/6173637b
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/6173637b
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/6173637b
Branch: refs/heads/master
Commit: 6173637bb801da62b07b69be64fa2c75f8d54904
Parents: d283d70
Author: Hans <firedrake93@gmail.com>
Authored: Tue Jul 31 15:35:55 2018 -0500
Committer: Hans <firedrake93@gmail.com>
Committed: Tue Jul 31 15:35:55 2018 -0500
----------------------------------------------------------------------
.../extractor/microdata/MicrodataParser.java | 11 +-
.../microdata/MicrodataExtractorTest.java | 15 ++-
.../microdata-bad-properties-expected.nquads | 84 +++++++++++++
.../microdata/microdata-bad-properties.html | 125 +++++++++++++++++++
4 files changed, 231 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
index 32faec3..f305620 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
@@ -17,6 +17,7 @@
package org.apache.any23.extractor.microdata;
import org.apache.any23.extractor.html.DomUtils;
+import org.apache.commons.lang.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
@@ -394,9 +395,15 @@ public class MicrodataParser {
while (treeWalker.nextNode() != null);
final List<ItemProp> result = new ArrayList<>();
- for(Node itemPropNode : accepted) {
+ for (Node itemPropNode : accepted) {
final String itemProp = DomUtils.readAttribute(itemPropNode, ITEMPROP_ATTRIBUTE, null);
- final String[] propertyNames = itemProp.split(" ");
+
+ if (StringUtils.isBlank(itemProp)) {
+ manageError(new MicrodataParserException("invalid property name '" + itemProp + "'", itemPropNode));
+ continue;
+ }
+
+ final String[] propertyNames = itemProp.trim().split("\\s+");
ItemPropValue itemPropValue;
for (String propertyName : propertyNames) {
try {
http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
index 280b3f7..e858ea3 100644
--- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
@@ -19,6 +19,7 @@ package org.apache.any23.extractor.microdata;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.html.AbstractExtractorTestCase;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.SINDICE;
@@ -89,7 +90,6 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
assertExtract("/microdata/microdata-missing-scheme.html");
assertModelNotEmpty();
assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer"));
- System.out.println(dumpHumanReadableTriples());
}
/**
@@ -206,9 +206,20 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
extractAndVerifyAgainstNQuads("microdata-bad-types.html", "microdata-bad-types-expected.nquads");
}
+ @Test
+ public void testBadPropertyNames() throws IOException {
+ extractAndVerifyAgainstNQuads("microdata-bad-properties.html", "microdata-bad-properties-expected.nquads", false);
+ assertIssue(IssueReport.IssueLevel.ERROR, ".*invalid property name ''.*\"path\" : \"/HTML\\[1\\]/BODY\\[1\\]/DIV\\[1\\]/DIV\\[2\\]/DIV\\[1\\]\".*");
+ }
+
private void extractAndVerifyAgainstNQuads(String actual, String expected)
+ throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
+ extractAndVerifyAgainstNQuads(actual, expected, true);
+ }
+
+ private void extractAndVerifyAgainstNQuads(String actual, String expected, boolean assertNoIssues)
throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
- assertExtract("/microdata/" + actual);
+ assertExtract("/microdata/" + actual, assertNoIssues);
assertModelNotEmpty();
logger.debug( dumpModelToNQuads() );
List<Statement> expectedStatements = loadResultStatement("/microdata/" + expected);
http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads b/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads
new file mode 100644
index 0000000..e5b6f29
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/microdata-bad-properties-expected.nquads
@@ -0,0 +1,84 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+_:node1cjov1p83x2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event>
<http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/endDate> "2018-07-29T17:00:00-07:00" <http://bob.example.com/>
.
+_:node1cjov1p83x2 <http://schema.org/name> "Midwest Fire Fest" <http://bob.example.com/>
.
+_:node1cjov1p83x2 <http://schema.org/description> "Come to the most unique festival
in the Midwest" <http://bob.example.com/> .
+_:node1cjov1p83x3 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place>
<http://bob.example.com/> .
+_:node1cjov1p83x3 <http://schema.org/hasMap> "http://maps.google.com/?q=300+Water+St%2C+Cambridge%2C+WI+53523"
<http://bob.example.com/> .
+_:node1cjov1p83x3 <http://schema.org/name> "Westside Park" <http://bob.example.com/>
.
+_:node1cjov1p83x2 <http://schema.org/location> _:node1cjov1p83x3 <http://bob.example.com/>
.
+_:node1cjov1p83x2 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180729090000%2C20180729170000>
<http://bob.example.com/> .
+_:node1cjov1p83x2 <http://schema.org/startDate> "2018-07-29T09:00:00-07:00" <http://bob.example.com/>
.
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x2
<http://bob.example.com/> .
+_:node1cjov1p83x4 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event>
<http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/endDate> "2018-07-31T13:00:00-07:00" <http://bob.example.com/>
.
+_:node1cjov1p83x4 <http://schema.org/name> "Cambridge Senior Meals" <http://bob.example.com/>
.
+_:node1cjov1p83x4 <http://schema.org/description> "Cambridge Senior Meals are served at
Noon every Tuesday and Friday" <http://bob.example.com/> .
+_:node1cjov1p83x5 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place>
<http://bob.example.com/> .
+_:node1cjov1p83x6 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress>
<http://bob.example.com/> .
+_:node1cjov1p83x6 <http://schema.org/streetAddress> "200 Spring Steet" <http://bob.example.com/>
.
+_:node1cjov1p83x6 <http://schema.org/postalCode> "53523" <http://bob.example.com/>
.
+_:node1cjov1p83x6 <http://schema.org/addressLocality> "Cambridge" <http://bob.example.com/>
.
+_:node1cjov1p83x6 <http://schema.org/addressRegion> "WI" <http://bob.example.com/>
.
+_:node1cjov1p83x5 <http://schema.org/address> _:node1cjov1p83x6 <http://bob.example.com/>
.
+_:node1cjov1p83x5 <http://schema.org/hasMap> "http://maps.google.com/?q=200+Spring+Street%2C+Cambridge%2C+WI+53523"
<http://bob.example.com/> .
+_:node1cjov1p83x5 <http://schema.org/name> "Amundson Center" <http://bob.example.com/>
.
+_:node1cjov1p83x4 <http://schema.org/location> _:node1cjov1p83x5 <http://bob.example.com/>
.
+_:node1cjov1p83x4 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180731120000%2C20180731130000>
<http://bob.example.com/> .
+_:node1cjov1p83x4 <http://schema.org/startDate> "2018-07-31T12:00:00-07:00" <http://bob.example.com/>
.
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x4
<http://bob.example.com/> .
+_:node1cjov1p83x7 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event>
<http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/endDate> "2018-07-31T19:00:00-07:00" <http://bob.example.com/>
.
+_:node1cjov1p83x7 <http://schema.org/name> "Begin to Knit Classes" <http://bob.example.com/>
.
+_:node1cjov1p83x7 <http://schema.org/description> "Learn to knit at Kaleidoscope Fibers
- Cambridge's speciality yarn,..." <http://bob.example.com/> .
+_:node1cjov1p83x8 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place>
<http://bob.example.com/> .
+_:node1cjov1p83x9 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress>
<http://bob.example.com/> .
+_:node1cjov1p83x9 <http://schema.org/streetAddress> "Null" <http://bob.example.com/>
.
+_:node1cjov1p83x8 <http://schema.org/address> _:node1cjov1p83x9 <http://bob.example.com/>
.
+_:node1cjov1p83x8 <http://schema.org/name> "Kaleidoscope Fibers (131 W. Main Street"
<http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/location> _:node1cjov1p83x8 <http://bob.example.com/>
.
+_:node1cjov1p83x7 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180731170000%2C20180731190000>
<http://bob.example.com/> .
+_:node1cjov1p83x7 <http://schema.org/startDate> "2018-07-31T17:00:00-07:00" <http://bob.example.com/>
.
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x7
<http://bob.example.com/> .
+_:node1cjov1p83x10 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event>
<http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/endDate> "2018-08-01T15:00:00-07:00" <http://bob.example.com/>
.
+_:node1cjov1p83x10 <http://schema.org/name> "Cambridge Historic School Museum Tour"
<http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/description> "Built in 1906, the Cambridge Historic
School - listed on the..." <http://bob.example.com/> .
+_:node1cjov1p83x11 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place>
<http://bob.example.com/> .
+_:node1cjov1p83x12 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress>
<http://bob.example.com/> .
+_:node1cjov1p83x12 <http://schema.org/streetAddress> "Null" <http://bob.example.com/>
.
+_:node1cjov1p83x11 <http://schema.org/address> _:node1cjov1p83x12 <http://bob.example.com/>
.
+_:node1cjov1p83x11 <http://schema.org/name> "Cambridge Historic School" <http://bob.example.com/>
.
+_:node1cjov1p83x10 <http://schema.org/location> _:node1cjov1p83x11 <http://bob.example.com/>
.
+_:node1cjov1p83x10 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180801123000%2C20180801150000>
<http://bob.example.com/> .
+_:node1cjov1p83x10 <http://schema.org/startDate> "2018-08-01T12:30:00-07:00" <http://bob.example.com/>
.
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x10
<http://bob.example.com/> .
+_:node1cjov1p83x13 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Event>
<http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/endDate> "2018-08-01T15:00:00-07:00" <http://bob.example.com/>
.
+_:node1cjov1p83x13 <http://schema.org/name> "Begin to Knit Classes" <http://bob.example.com/>
.
+_:node1cjov1p83x13 <http://schema.org/description> "Learn to knit at Kaleidoscope Fibers
- Cambridge's speciality yarn,..." <http://bob.example.com/> .
+_:node1cjov1p83x14 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Place>
<http://bob.example.com/> .
+_:node1cjov1p83x15 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/PostalAddress>
<http://bob.example.com/> .
+_:node1cjov1p83x15 <http://schema.org/streetAddress> "Null" <http://bob.example.com/>
.
+_:node1cjov1p83x14 <http://schema.org/address> _:node1cjov1p83x15 <http://bob.example.com/>
.
+_:node1cjov1p83x14 <http://schema.org/name> "Kaleidoscope Fibers (131 W. Main Street"
<http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/location> _:node1cjov1p83x14 <http://bob.example.com/>
.
+_:node1cjov1p83x13 <http://schema.org/url> <https://cambridgewi.com/events-calendar/?event_rdate=20180801130000%2C20180801150000>
<http://bob.example.com/> .
+_:node1cjov1p83x13 <http://schema.org/startDate> "2018-08-01T13:00:00-07:00" <http://bob.example.com/>
.
+<http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node1cjov1p83x13
<http://bob.example.com/> .
http://git-wip-us.apache.org/repos/asf/any23/blob/6173637b/test-resources/src/test/resources/microdata/microdata-bad-properties.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/microdata-bad-properties.html b/test-resources/src/test/resources/microdata/microdata-bad-properties.html
new file mode 100644
index 0000000..23d4e80
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/microdata-bad-properties.html
@@ -0,0 +1,125 @@
+<!DOCTYPE html>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Excerpted from: https://cambridgewi.com/events-calendar/ -->
+<html>
+
+<head></head>
+
+<body>
+<div itemscope="" itemtype="http://schema.org/Event">
+ <div>
+ <div>
+ <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180729090000%2C20180729170000"
itemprop="url"><span itemprop="name">Midwest Fire Fest</span></a>
+ <div><span>Jul 29, 2018</span> <span>9:00am</span></div>
+ </div>
+ <div itemprop="description">Come to the most unique festival in the Midwest</div>
+ </div>
+ <meta itemprop=" startDate " content="2018-07-29T09:00:00-07:00">
+ <meta itemprop=" endDate " content="2018-07-29T17:00:00-07:00">
+ <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+ <meta itemprop=" name" content="Westside Park">
+ <meta itemprop="hasMap " content="http://maps.google.com/?q=300+Water+St%2C+Cambridge%2C+WI+53523">
+ <div itemprop="" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+ <meta itemprop="streetAddress" content="300 Water Street">
+ <meta itemprop="addressLocality" content="Cambridge">
+ <meta itemprop="addressRegion" content="WI">
+ <meta itemprop="postalCode" content="53523">
+ </div>
+ </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+ <div>
+ <div>
+ <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180731120000%2C20180731130000"
itemprop="url"><span itemprop="name">Cambridge Senior Meals</span></a>
+ <div><span>Jul 31, 2018</span> <span>12:00pm</span></div>
+ </div>
+ <div itemprop="description">Cambridge Senior Meals are served at Noon
every Tuesday and Friday</div>
+ </div>
+ <meta itemprop="startDate" content="2018-07-31T12:00:00-07:00">
+ <meta itemprop="endDate" content="2018-07-31T13:00:00-07:00">
+ <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+ <meta itemprop="name" content="Amundson Center">
+ <meta itemprop="hasMap" content="http://maps.google.com/?q=200+Spring+Street%2C+Cambridge%2C+WI+53523">
+ <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+ <meta itemprop="streetAddress" content="200 Spring Steet">
+ <meta itemprop="addressLocality" content="Cambridge">
+ <meta itemprop="addressRegion" content="WI">
+ <meta itemprop="postalCode" content="53523">
+ </div>
+ </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+ <div>
+ <div>
+ <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180731170000%2C20180731190000"
itemprop="url"><span itemprop="name">Begin to Knit Classes</span></a>
+ <div><span>Jul 31, 2018</span> <span>5:00pm</span></div>
+
+ </div>
+ <div itemprop="description">Learn to knit at Kaleidoscope Fibers - Cambridge's
speciality yarn,...</div>
+ </div>
+ <meta itemprop="startDate" content="2018-07-31T17:00:00-07:00">
+ <meta itemprop="endDate" content="2018-07-31T19:00:00-07:00">
+ <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+ <meta itemprop="name" content="Kaleidoscope Fibers (131 W. Main Street">
+ <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+ <meta itemprop="streetAddress" content="">
+ </div>
+ </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+ <div>
+ <div>
+ <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180801123000%2C20180801150000"
itemprop="url"><span itemprop="name">Cambridge Historic School Museum Tour</span></a>
+ <div><span>Aug 1, 2018</span> <span>12:30pm</span></div>
+ </div>
+ <div itemprop="description">Built in 1906, the Cambridge Historic School - listed
on the...</div>
+ </div>
+ <div class="rhc-clear"></div>
+ <meta itemprop="startDate" content="2018-08-01T12:30:00-07:00">
+ <meta itemprop="endDate" content="2018-08-01T15:00:00-07:00">
+ <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+ <meta itemprop="name" content="Cambridge Historic School">
+ <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+ <meta itemprop="streetAddress" content="">
+ </div>
+ </div>
+</div>
+
+<div itemscope="" itemtype="http://schema.org/Event">
+ <div>
+ <div>
+ <a href="https://cambridgewi.com/events-calendar/?event_rdate=20180801130000%2C20180801150000"
itemprop="url"><span itemprop="name">Begin to Knit Classes</span></a>
+ <div><span>Aug 1, 2018</span> <span>1:00pm</span></div>
+ </div>
+ <div itemprop="description">Learn to knit at Kaleidoscope Fibers - Cambridge's
speciality yarn,...</div>
+ </div>
+ <meta itemprop="startDate" content="2018-08-01T13:00:00-07:00">
+ <meta itemprop="endDate" content="2018-08-01T15:00:00-07:00">
+ <div itemprop="location" itemscope="itemscope" itemtype="http://schema.org/Place">
+ <meta itemprop="name" content="Kaleidoscope Fibers (131 W. Main Street">
+ <div itemprop="address" itemscope="itemscope" itemtype="http://schema.org/PostalAddress">
+ <meta itemprop="streetAddress" content="">
+ </div>
+ </div>
+</div>
+
+</body>
+</html>
\ No newline at end of file
|