Repository: any23
Updated Branches:
refs/heads/master 5bc7e46a8 -> b0baa9407
Fix ANY23-308
- validate yaml file
- rename csvutils -> utils
- bring all utility classes into utils module
- update README
Signed-off-by: Jacek Grzebyta <grzebyta.dev@gmail.com>
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/ae036a7a
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/ae036a7a
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/ae036a7a
Branch: refs/heads/master
Commit: ae036a7af2a8c5a5572b6e17832f69bd8f4b4ba4
Parents: bd69aef
Author: Jacek Grzebyta <grzebyta.dev@gmail.com>
Authored: Tue Jul 11 11:57:16 2017 +0100
Committer: Jacek Grzebyta <grzebyta.dev@gmail.com>
Committed: Tue Jul 11 11:57:16 2017 +0100
----------------------------------------------------------------------
README.md | 2 +-
cli/pom.xml | 2 +-
.../org/apache/any23/cli/YAMLRoverTest.java | 76 +++++++++
core/pom.xml | 7 +-
.../any23/extractor/yaml/YAMLExtractor.java | 7 +-
.../any23/extractor/yaml/YAMLExtractorTest.java | 14 +-
.../extractor/yaml/YAMLTikaParserTest.java | 48 ++++++
csvutils/pom.xml | 106 ------------
.../any23/extractor/csv/CSVReaderBuilder.java | 166 -------------------
csvutils/src/test/resources/log4j.properties | 34 ----
mime/pom.xml | 2 +-
.../apache/any23/mime/TikaMIMETypeDetector.java | 17 +-
pom.xml | 7 +-
utils/pom.xml | 123 ++++++++++++++
.../any23/extractor/csv/CSVReaderBuilder.java | 166 +++++++++++++++++++
.../any23/extractor/yaml/YAMLValidator.java | 105 ++++++++++++
.../any23/yaml/utils/YAMLValidatorTest.java | 66 ++++++++
utils/src/test/resources/log4j.properties | 35 ++++
18 files changed, 659 insertions(+), 324 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 9db7126..6c52061 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ Any23 documentation can be found on the [website](http://any23.apache.org)
* [api](https://github.com/lewismc/any23/tree/master/api): Any23 library external API.
* [core](https://github.com/lewismc/any23/tree/master/core): The library core codebase.
- * [csvutils](https://github.com/lewismc/any23/tree/master/csvutils): A CSV specific package
+ * [utils](https://github.com/lewismc/any23/tree/master/utils): A utilities package.
* [encoding](https://github.com/lewismc/any23/tree/master/encoding): Encoding detection library.
* [mime](https://github.com/lewismc/any23/tree/master/mime): MIME Type detection library.
* [nquads](https://github.com/lewismc/any23/tree/master/nquads): NQuads parsing and serialization library.
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/cli/pom.xml
----------------------------------------------------------------------
diff --git a/cli/pom.xml b/cli/pom.xml
index 5acedfb..47b9c06 100644
--- a/cli/pom.xml
+++ b/cli/pom.xml
@@ -50,7 +50,7 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>apache-any23-csvutils</artifactId>
+ <artifactId>apache-any23-utils</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java
----------------------------------------------------------------------
diff --git a/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java b/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java
new file mode 100644
index 0000000..17e8916
--- /dev/null
+++ b/cli/src/test/java/org/apache/any23/cli/YAMLRoverTest.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2017 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.cli;
+
+import com.google.common.io.Files;
+import java.io.File;
+import java.io.IOException;
+import org.apache.pdfbox.util.Charsets;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Unit test for issue ANY23-308.
+ *
+ * Runs the {@link Rover} tool with the <code>yaml</code> and <code>csv</code>
+ * extractors enabled against a simple YAML resource, then checks that the
+ * tool exits with code 0 and that the produced Turtle output mentions the
+ * base URI passed on the command line.
+ *
+ * @author Jacek Grzebyta (grzebyta.dev [at] gmail.com)
+ */
+public class YAMLRoverTest extends ToolTestBase {
+
+ private static final String file1 = "/org/apache/any23/extractor/yaml/simple-load.yml";
+
+ private static final String baseUri = "urn:test";
+
+ private final Logger log = LoggerFactory.getLogger(getClass());
+
+ public YAMLRoverTest() {
+ super(Rover.class);
+ }
+
+ @Test
+ public void simpleTest()
+ throws Exception {
+ File outputFile = File.createTempFile("rover-test", ".ttl", tempDirectory);
+ File logfile = File.createTempFile("test-log", ".txt", tempDirectory);
+
+ int exitCode = runTool(String.format("-l %s -o %s -f turtle -e yaml,csv -d %s %s",
+ logfile.getAbsolutePath(),
+ outputFile.getAbsolutePath(),
+ baseUri,
+ copyResourceToTempFile(file1).getAbsolutePath()));
+
+ Assert.assertTrue(logfile.exists());
+ log.debug("Log file location: {}", logfile.getAbsolutePath());
+ log.info("Log file content: \n{}\n", Files.toString(logfile, Charsets.UTF_8));
+
+ Assert.assertEquals("Unexpected exit code.", 0, exitCode);
+ assertFileContainsString(outputFile, baseUri);
+ }
+
+ /**
+ * Asserts that the content of the given file contains the expected string.
+ * The whole file is read into memory and logged at trace level before the
+ * check is made.
+ *
+ * @param f file whose content is inspected
+ * @param s Expected string in the file
+ * @throws IOException if reading the file fails
+ */
+ public void assertFileContainsString(File f, String s) throws IOException {
+ String fileContent = Files.toString(f, Charsets.UTF_8);
+ log.trace("File content: \n{}\n", fileContent);
+ Assert.assertTrue(fileContent.contains(s));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/pom.xml
----------------------------------------------------------------------
diff --git a/core/pom.xml b/core/pom.xml
index f03c672..c410799 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -38,7 +38,7 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>apache-any23-csvutils</artifactId>
+ <artifactId>apache-any23-utils</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
@@ -78,11 +78,6 @@
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
</dependency>
- <dependency>
- <groupId>org.yaml</groupId>
- <artifactId>snakeyaml</artifactId>
- <version>1.17</version>
- </dependency>
<!-- BEGIN: Tika -->
<dependency>
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
index 64548f1..5c73082 100644
--- a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java
@@ -17,8 +17,6 @@ package org.apache.any23.extractor.yaml;
import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -31,7 +29,6 @@ import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.util.StringUtils;
import org.apache.any23.vocab.YAML;
-import org.apache.commons.lang.WordUtils;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Value;
@@ -54,7 +51,7 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
private int nodeId = 0;
- private IRI documentRoot;
+ private Resource documentRoot;
@Override
public void setStopAtFirstError(boolean f) {
@@ -65,7 +62,7 @@ public class YAMLExtractor implements Extractor.ContentExtractor {
ExtractionResult out)
throws IOException, ExtractionException {
IRI documentURI = context.getDocumentIRI();
- documentRoot = RDFUtils.uri(documentURI.toString() + "root");
+ documentRoot = makeUri("root", documentURI, false);
log.debug("process: {}", documentURI.toString());
out.writeNamespace(vocab.PREFIX, vocab.NS);
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
index 0cf8d14..b265c5f 100644
--- a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java
@@ -27,7 +27,6 @@ import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.RDFS;
import org.eclipse.rdf4j.repository.RepositoryResult;
-import org.semarglproject.vocab.XSD;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -93,4 +92,17 @@ public class YAMLExtractorTest extends AbstractExtractorTestCase {
RepositoryResult<Statement> docs = getStatements(null, null, RDF.NIL);
Assert.assertTrue(Iterations.asList(docs).size() == 2);
}
+
+ /**
+ * Comma separated values are parsed as well: running the extraction on
+ * <code>test-comma.csv</code> is expected to leave a non-empty model.
+ * The resulting model is dumped as Turtle to the debug log.
+ *
+ * @throws Exception if the extraction or one of the assertions fails
+ */
+ @Test
+ public void csvTest()
+ throws Exception {
+ assertExtract("/org/apache/any23/extractor/csv/test-comma.csv");
+ log.debug(dumpModelToTurtle());
+ assertModelNotEmpty();
+ }
}
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
new file mode 100644
index 0000000..4727c84
--- /dev/null
+++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLTikaParserTest.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2017 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.extractor.yaml;
+
+import java.io.InputStream;
+import org.apache.any23.mime.MIMEType;
+import org.apache.any23.mime.TikaMIMETypeDetector;
+import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Checks that {@link TikaMIMETypeDetector}, configured with a
+ * {@link WhiteSpacesPurifier}, reports <code>text/x-yaml</code> for a simple
+ * YAML resource. Only the input stream is supplied to the detector; the other
+ * two arguments of <code>guessMIMEType</code> are passed as <code>null</code>.
+ *
+ * @author jacek
+ */
+public class YAMLTikaParserTest {
+
+ private static final String file1 = "/org/apache/any23/extractor/yaml/simple-load.yml";
+
+ private final Logger log = LoggerFactory.getLogger(getClass());
+
+ @Test
+ public void tikaDetect()
+ throws Exception {
+ InputStream is = YAMLTikaParserTest.class.getResourceAsStream(file1);
+ TikaMIMETypeDetector detector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
+ MIMEType type = detector.guessMIMEType(null, is, null);
+
+ log.info("Type: {}", type.toString());
+
+ Assert.assertEquals("text/x-yaml", type.toString());
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/csvutils/pom.xml
----------------------------------------------------------------------
diff --git a/csvutils/pom.xml b/csvutils/pom.xml
deleted file mode 100644
index 8f5b18d..0000000
--- a/csvutils/pom.xml
+++ /dev/null
@@ -1,106 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <artifactId>apache-any23</artifactId>
- <groupId>org.apache.any23</groupId>
- <version>2.1-SNAPSHOT</version>
- <relativePath>..</relativePath>
- </parent>
-
- <artifactId>apache-any23-csvutils</artifactId>
-
- <name>Apache Any23 :: CSV Utilities</name>
- <description>CSV specific library.</description>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>apache-any23-api</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-csv</artifactId>
- </dependency>
- <!-- Logging -->
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- <version>${slf4j.logger.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <resources>
- <resource>
- <directory>${basedir}/../</directory>
- <targetPath>META-INF</targetPath>
- <includes>
- <include>LICENSE.txt</include>
- <include>NOTICE.txt</include>
- </includes>
- </resource>
- </resources>
- <pluginManagement>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-assembly-plugin</artifactId>
- <version>${maven-assembly-plugin.version}</version>
- <executions>
- <execution>
- <id>assembly</id>
- <phase>package</phase>
- <goals>
- <goal>single</goal>
- </goals>
- </execution>
- </executions>
- <configuration>
- <attach>true</attach>
- <skipAssembly>true</skipAssembly>
- <tarLongFileMode>gnu</tarLongFileMode>
- </configuration>
- </plugin>
- </plugins>
- </pluginManagement>
- </build>
-
- <profiles>
- <profile>
- <id>release</id>
- <build>
- <resources>
- <resource>
- <directory>${basedir}/../</directory>
- <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
- <includes>
- <include>LICENSE.txt</include>
- <include>NOTICE.txt</include>
- </includes>
- </resource>
- </resources>
- </build>
- </profile>
- </profiles>
-
-</project>
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
----------------------------------------------------------------------
diff --git a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java b/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
deleted file mode 100644
index 75bb583..0000000
--- a/csvutils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.any23.extractor.csv;
-
-import org.apache.any23.configuration.DefaultConfiguration;
-import org.apache.commons.csv.CSVParser;
-import org.apache.commons.csv.CSVStrategy;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-
-/**
- * This class is responsible to build a reader first guessing the configuration
- * from the file it self and then, if not successful, from the {@link org.apache.any23.configuration.DefaultConfiguration}.
- *
- * @author Davide Palmisano ( dpalmisano@gmail.com )
- * @author Michele Mostarda ( michele.mostarda@gmail.com )
- */
-public class CSVReaderBuilder {
-
- private static final String DEFAULT_FIELD_DELIMITER = ",";
-
- private static final String DEFAULT_COMMENT_DELIMITER = "#";
-
- public static final char NULL_CHAR = ' ';
-
- private static final char[] popularDelimiters = {'\t', '|', ',', ';'};
-
- private static DefaultConfiguration defaultConfiguration =
- DefaultConfiguration.singleton();
-
- private static final CSVStrategy[] strategies;
-
- static {
- strategies = new CSVStrategy[ popularDelimiters.length + 1 ];
- strategies[0] = CSVStrategy.DEFAULT_STRATEGY;
- int index = 1;
- for(char dlmt : popularDelimiters) {
- strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR);
- }
- }
-
- /**
- * Builds a not <code>null</code> {@link org.apache.commons.csv.CSVParser} guessing
- * from the provided <i>CSV</i> file.
- *
- * @param is {@link InputStream} of the <i>CSV</i> file where guess the configuration.
- * @return a {@link CSVParser}
- * @throws java.io.IOException
- */
- public static CSVParser build(InputStream is) throws IOException {
- CSVStrategy bestStrategy = getBestStrategy(is);
- if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration();
- return new CSVParser( new InputStreamReader(is), bestStrategy );
- }
-
- /**
- * Checks whether the given input stream is a CSV or not.
- *
- * @param is input stream to be verified.
- * @return <code>true</code> if the given <code>is</code> input stream contains a <i>CSV</i> content.
- * <code>false</code> otherwise.
- * @throws IOException
- */
- public static boolean isCSV(InputStream is) throws IOException {
- return getBestStrategy(is) != null;
- }
-
- private static CSVStrategy getBestStrategy(InputStream is) throws IOException {
- for( CSVStrategy strategy : strategies ) {
- if( testStrategy(is, strategy) ) {
- return strategy;
- }
- }
- return null;
- }
-
- private static CSVStrategy getCsvStrategy(char delimiter, char comment) {
- return new CSVStrategy(delimiter, '\'', comment);
- }
-
- private static CSVStrategy getCSVStrategyFromConfiguration() {
- char fieldDelimiter = getCharValueFromConfiguration(
- "any23.extraction.csv.field",
- DEFAULT_FIELD_DELIMITER
- );
- char commentDelimiter = getCharValueFromConfiguration(
- "any23.extraction.csv.comment",
- DEFAULT_COMMENT_DELIMITER
- );
- return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter);
- }
-
- private static char getCharValueFromConfiguration(String property, String defaultValue) {
- String delimiter = defaultConfiguration.getProperty(
- property,
- defaultValue
- );
- if (delimiter.length() != 1 || delimiter.equals("")) {
- throw new RuntimeException(property + " value must be a single character");
- }
- return delimiter.charAt(0);
- }
-
- /**
- * make sure the reader has correct delimiter and quotation set.
- * Check first lines and make sure they have the same amount of columns and at least 2
- *
- * @param is input stream to be checked
- * @param strategy strategy to be verified.
- * @return
- * @throws IOException
- * @param is
- */
- private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException {
- final int MIN_COLUMNS = 2;
-
- is.mark(Integer.MAX_VALUE);
- try {
- final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy);
- int linesToCheck = 5;
- int headerColumnCount = -1;
- while (linesToCheck > 0) {
- String[] row;
- row = parser.getLine();
- if (row == null) {
- break;
- }
- if (row.length < MIN_COLUMNS) {
- return false;
- }
- if (headerColumnCount == -1) { // first row
- headerColumnCount = row.length;
- } else { // make sure rows have the same number of columns or one more than the header
- if (row.length < headerColumnCount) {
- return false;
- } else if (row.length - 1 > headerColumnCount) {
- return false;
- }
- }
- linesToCheck--;
- }
- return true;
- } finally {
- is.reset();
- }
- }
-
-
-}
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/csvutils/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/csvutils/src/test/resources/log4j.properties b/csvutils/src/test/resources/log4j.properties
deleted file mode 100644
index a7ad0af..0000000
--- a/csvutils/src/test/resources/log4j.properties
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-log4j.rootCategory=INFO, R, O
-
-# Stdout
-log4j.appender.O=org.apache.log4j.ConsoleAppender
-
-# File
-#log4j.appender.R=org.apache.log4j.RollingFileAppender
-#log4j.appender.R.File=log4j.log
-
-# Control the maximum log file size
-#log4j.appender.R.MaxFileSize=100KB
-
-# Archive log files (one backup file here)
-log4j.appender.R.MaxBackupIndex=1
-
-log4j.appender.R.layout=org.apache.log4j.PatternLayout
-log4j.appender.O.layout=org.apache.log4j.PatternLayout
-
-log4j.appender.R.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n
-log4j.appender.O.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/mime/pom.xml
----------------------------------------------------------------------
diff --git a/mime/pom.xml b/mime/pom.xml
index 9db7d3b..2014758 100644
--- a/mime/pom.xml
+++ b/mime/pom.xml
@@ -38,7 +38,7 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>apache-any23-csvutils</artifactId>
+ <artifactId>apache-any23-utils</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
----------------------------------------------------------------------
diff --git a/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java b/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
index e0584a1..77955cb 100644
--- a/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
+++ b/mime/src/main/java/org/apache/any23/mime/TikaMIMETypeDetector.java
@@ -36,6 +36,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.regex.Pattern;
+import org.apache.any23.extractor.yaml.YAMLValidator;
/**
* Implementation of {@link MIMETypeDetector} based on
@@ -134,6 +135,17 @@ public class TikaMIMETypeDetector implements MIMETypeDetector {
}
/**
+ * Checks if the stream contains a valid <i>YAML</i> content.
+ * The decision is delegated entirely to {@link YAMLValidator#isYAML(InputStream)}.
+ *
+ * @param is input stream to be verified.
+ * @return the value returned by the validator: presumably <code>true</code> if
+ * <code>is</code> holds <i>YAML</i> content, <code>false</code> otherwise.
+ * @throws IOException if thrown by the underlying validator.
+ */
+ public static boolean checkYAMLFormat(InputStream is) throws IOException {
+ return YAMLValidator.isYAML(is);
+ }
+
+ /**
* Tries to apply one of the given patterns on a sample of the input stream.
*
* @param patterns the patterns to apply.
@@ -263,8 +275,9 @@ public class TikaMIMETypeDetector implements MIMETypeDetector {
type = RDFFormat.TURTLE.getDefaultMIMEType();
} else if( checkCSVFormat(input) ) {
type = CSV_MIMETYPE;
- }
- else {
+ } else if (checkYAMLFormat(input)) { // YAML detection must be at the end
+ type = "text/x-yaml";
+ } else {
type = MimeTypes.OCTET_STREAM;
}
}
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 23ab57f..ac2a9bd 100644
--- a/pom.xml
+++ b/pom.xml
@@ -199,7 +199,7 @@
<modules>
<module>api</module>
<module>test-resources</module>
- <module>csvutils</module>
+ <module>utils</module>
<module>mime</module>
<module>encoding</module>
<module>core</module>
@@ -527,6 +527,11 @@
<artifactId>metainf-services</artifactId>
<version>1.5</version>
</dependency>
+ <dependency>
+ <groupId>org.yaml</groupId>
+ <artifactId>snakeyaml</artifactId>
+ <version>1.17</version>
+ </dependency>
<!-- END: plugins -->
<!-- BEGIN: Test Dependencies -->
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/pom.xml
----------------------------------------------------------------------
diff --git a/utils/pom.xml b/utils/pom.xml
new file mode 100644
index 0000000..a6f34ec
--- /dev/null
+++ b/utils/pom.xml
@@ -0,0 +1,123 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <artifactId>apache-any23</artifactId>
+ <groupId>org.apache.any23</groupId>
+ <version>2.1-SNAPSHOT</version>
+ <relativePath>..</relativePath>
+ </parent>
+
+ <artifactId>apache-any23-utils</artifactId>
+
+ <name>Apache Any23 :: Utilities</name>
+ <description>Utilities library</description>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-api</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-csv</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.yaml</groupId>
+ <artifactId>snakeyaml</artifactId>
+ </dependency>
+ <!-- Logging -->
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <version>${slf4j.logger.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <!-- Testing -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>apache-any23-test-resources</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <resources>
+ <resource>
+ <directory>${basedir}/../</directory>
+ <targetPath>META-INF</targetPath>
+ <includes>
+ <include>LICENSE.txt</include>
+ <include>NOTICE.txt</include>
+ </includes>
+ </resource>
+ </resources>
+ <pluginManagement>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <version>${maven-assembly-plugin.version}</version>
+ <executions>
+ <execution>
+ <id>assembly</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <attach>true</attach>
+ <skipAssembly>true</skipAssembly>
+ <tarLongFileMode>gnu</tarLongFileMode>
+ </configuration>
+ </plugin>
+ </plugins>
+ </pluginManagement>
+ </build>
+
+ <profiles>
+ <profile>
+ <id>release</id>
+ <build>
+ <resources>
+ <resource>
+ <directory>${basedir}/../</directory>
+ <targetPath>${project.build.directory}/apidocs/META-INF</targetPath>
+ <includes>
+ <include>LICENSE.txt</include>
+ <include>NOTICE.txt</include>
+ </includes>
+ </resource>
+ </resources>
+ </build>
+ </profile>
+ </profiles>
+
+</project>
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
----------------------------------------------------------------------
diff --git a/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java b/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
new file mode 100644
index 0000000..75bb583
--- /dev/null
+++ b/utils/src/main/java/org/apache/any23/extractor/csv/CSVReaderBuilder.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.any23.extractor.csv;
+
+import org.apache.any23.configuration.DefaultConfiguration;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVStrategy;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+
+/**
+ * This class is responsible to build a reader first guessing the configuration
+ * from the file it self and then, if not successful, from the {@link org.apache.any23.configuration.DefaultConfiguration}.
+ *
+ * @author Davide Palmisano ( dpalmisano@gmail.com )
+ * @author Michele Mostarda ( michele.mostarda@gmail.com )
+ */
+public class CSVReaderBuilder {
+
+ private static final String DEFAULT_FIELD_DELIMITER = ",";
+
+ private static final String DEFAULT_COMMENT_DELIMITER = "#";
+
+ public static final char NULL_CHAR = ' ';
+
+ private static final char[] popularDelimiters = {'\t', '|', ',', ';'};
+
+ private static DefaultConfiguration defaultConfiguration =
+ DefaultConfiguration.singleton();
+
+ private static final CSVStrategy[] strategies;
+
+ static {
+ strategies = new CSVStrategy[ popularDelimiters.length + 1 ];
+ strategies[0] = CSVStrategy.DEFAULT_STRATEGY;
+ int index = 1;
+ for(char dlmt : popularDelimiters) {
+ strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR);
+ }
+ }
+
+ /**
+ * Builds a not <code>null</code> {@link org.apache.commons.csv.CSVParser} guessing
+ * from the provided <i>CSV</i> file.
+ *
+ * @param is {@link InputStream} of the <i>CSV</i> file where guess the configuration.
+ * @return a {@link CSVParser}
+ * @throws java.io.IOException
+ */
+ public static CSVParser build(InputStream is) throws IOException {
+ CSVStrategy bestStrategy = getBestStrategy(is);
+ if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration();
+ return new CSVParser( new InputStreamReader(is), bestStrategy );
+ }
+
+ /**
+ * Checks whether the given input stream is a CSV or not.
+ *
+ * @param is input stream to be verified.
+ * @return <code>true</code> if the given <code>is</code> input stream contains a <i>CSV</i> content.
+ * <code>false</code> otherwise.
+ * @throws IOException
+ */
+ public static boolean isCSV(InputStream is) throws IOException {
+ return getBestStrategy(is) != null;
+ }
+
+ private static CSVStrategy getBestStrategy(InputStream is) throws IOException {
+ for( CSVStrategy strategy : strategies ) {
+ if( testStrategy(is, strategy) ) {
+ return strategy;
+ }
+ }
+ return null;
+ }
+
+ private static CSVStrategy getCsvStrategy(char delimiter, char comment) {
+ return new CSVStrategy(delimiter, '\'', comment);
+ }
+
+ private static CSVStrategy getCSVStrategyFromConfiguration() {
+ char fieldDelimiter = getCharValueFromConfiguration(
+ "any23.extraction.csv.field",
+ DEFAULT_FIELD_DELIMITER
+ );
+ char commentDelimiter = getCharValueFromConfiguration(
+ "any23.extraction.csv.comment",
+ DEFAULT_COMMENT_DELIMITER
+ );
+ return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter);
+ }
+
+ private static char getCharValueFromConfiguration(String property, String defaultValue) {
+ String delimiter = defaultConfiguration.getProperty(
+ property,
+ defaultValue
+ );
+ if (delimiter.length() != 1 || delimiter.equals("")) {
+ throw new RuntimeException(property + " value must be a single character");
+ }
+ return delimiter.charAt(0);
+ }
+
+ /**
+ * make sure the reader has correct delimiter and quotation set.
+ * Check first lines and make sure they have the same amount of columns and at least 2
+ *
+ * @param is input stream to be checked
+ * @param strategy strategy to be verified.
+ * @return
+ * @throws IOException
+ * @param is
+ */
+ private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException {
+ final int MIN_COLUMNS = 2;
+
+ is.mark(Integer.MAX_VALUE);
+ try {
+ final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy);
+ int linesToCheck = 5;
+ int headerColumnCount = -1;
+ while (linesToCheck > 0) {
+ String[] row;
+ row = parser.getLine();
+ if (row == null) {
+ break;
+ }
+ if (row.length < MIN_COLUMNS) {
+ return false;
+ }
+ if (headerColumnCount == -1) { // first row
+ headerColumnCount = row.length;
+ } else { // make sure rows have the same number of columns or one more than the header
+ if (row.length < headerColumnCount) {
+ return false;
+ } else if (row.length - 1 > headerColumnCount) {
+ return false;
+ }
+ }
+ linesToCheck--;
+ }
+ return true;
+ } finally {
+ is.reset();
+ }
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java
----------------------------------------------------------------------
diff --git a/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java b/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java
new file mode 100644
index 0000000..5a5f63d
--- /dev/null
+++ b/utils/src/main/java/org/apache/any23/extractor/yaml/YAMLValidator.java
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2017 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.extractor.yaml;
+
+import com.google.common.collect.Iterables;
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Scanner;
+import java.util.regex.Pattern;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.yaml.snakeyaml.Yaml;
+
+/**
+ * Utility class provides static methods for YAML validation.
+ *
+ * @author Jacek Grzebyta (grzebyta.dev [at] gmail.com)
+ */
+public class YAMLValidator {
+
+ private static final Logger log = LoggerFactory.getLogger(YAMLValidator.class);
+
+ private static final Pattern YAML_PATTERN = Pattern.compile("^%YAML.*", Pattern.CASE_INSENSITIVE);
+
+ /**
+ * Detects if is contains valid YAML content.
+ * <p>
+ * In the first instance it checks if there is "%YAML" head. If not check
+ * using the brute force method by parsing input stream with yaml parser.
+ * </p>
+ * <p>
+ * NB. Only "false" results are trusted. Even if result is "true" you cannot
+ * be sure that InputStream contains YAML intentional context because
+ * comma-separated-values are pars-able by YAML parser as well.
+ * </p>
+ *
+ * @param is {@link InputStream}
+ * @return
+ * @throws IOException
+ */
+ public static boolean isYAML(InputStream is) throws IOException {
+ if (is == null) {
+ return false;
+ }
+
+ if (!is.markSupported()) {
+ is = new BufferedInputStream(is);
+ }
+
+ boolean result = false;
+
+ // mark the reading frame position. MUST BE FIRST
+ is.mark(Integer.MAX_VALUE);
+
+ while (true) {
+ // if is is empty than return false
+ if (is.available() <= 0) {
+ break;
+ }
+
+ Scanner sc = new Scanner(is);
+ String out = sc.findWithinHorizon(YAML_PATTERN, 0);
+
+ if (out != null && !out.isEmpty()) {
+ log.debug("Head: {}", out);
+ result = true;
+ break;
+ }
+ log.debug("Still not found. output is: {}", out);
+ is.reset();
+
+ try {
+ Yaml yml = new Yaml();
+ Iterable<Object> parsedOut = yml.loadAll(is);
+
+ if (Iterables.size(parsedOut) > 0) {
+ result = true;
+ break;
+ }
+ } catch (Exception ex) {
+ //do nothing
+ }
+
+ // final break
+ break;
+ }
+
+ is.reset(); // MUST BE AT THE END
+ return result;
+ }
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java
----------------------------------------------------------------------
diff --git a/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java b/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java
new file mode 100644
index 0000000..fddf2fb
--- /dev/null
+++ b/utils/src/test/java/org/apache/any23/yaml/utils/YAMLValidatorTest.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2017 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.yaml.utils;
+
+import org.apache.any23.extractor.yaml.YAMLValidator;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collection;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @author Jacek Grzebyta ( grzebyta.dev [at] gmail.com)
+ */
+@RunWith(Parameterized.class)
+public class YAMLValidatorTest {
+
+ private String path;
+
+ private Boolean expected;
+
+ private Logger log = LoggerFactory.getLogger(getClass());
+
+ public YAMLValidatorTest(String path, Boolean expected) {
+ this.path = path;
+ this.expected = expected;
+ }
+
+ @Parameterized.Parameters
+ public static Collection<Object[]> getFiles() {
+ return Arrays.asList(new Object[][]{
+ {"/org/apache/any23/extractor/yaml/simple-load.yml", Boolean.TRUE},
+ {"/org/apache/any23/extractor/yaml/simple-load_no_head.yml", Boolean.TRUE},
+ {"/org/apache/any23/extractor/yaml/different-integers.yml", Boolean.TRUE},
+ {"/org/apache/any23/extractor/yaml/different-float.yml", Boolean.TRUE},
+ {"/org/apache/any23/extractor/csv/test-comma.csv", Boolean.TRUE}});
+ }
+
+ @Test
+ public void runTest()
+ throws Exception {
+ log.info("Try path: {}", path);
+ InputStream is = YAMLValidatorTest.class.getResourceAsStream(path);
+ boolean result = YAMLValidator.isYAML(is);
+ log.debug("Test resutl: {}", result);
+ Assert.assertSame(expected, result);
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/ae036a7a/utils/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/utils/src/test/resources/log4j.properties b/utils/src/test/resources/log4j.properties
new file mode 100644
index 0000000..3860396
--- /dev/null
+++ b/utils/src/test/resources/log4j.properties
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ # Root category. Only the console appender O is attached; the commented-out
+ # line below is the alternative that also attaches the file appender R.
+ # log4j.rootCategory=INFO, R, O
+ log4j.rootCategory=INFO, O
+
+ # Stdout
+ log4j.appender.O=org.apache.log4j.ConsoleAppender
+
+ # File (disabled: the appender definition is commented out)
+ #log4j.appender.R=org.apache.log4j.RollingFileAppender
+ #log4j.appender.R.File=log4j.log
+
+ # Control the maximum log file size
+ #log4j.appender.R.MaxFileSize=100KB
+
+ # Archive log files (one backup file here)
+ # NOTE(review): appender R is neither defined nor attached to the root category
+ # above, so the active R.* settings in this file look like leftovers - confirm
+ # whether they should be commented out as well.
+ log4j.appender.R.MaxBackupIndex=1
+
+ # Layouts
+ log4j.appender.R.layout=org.apache.log4j.PatternLayout
+ log4j.appender.O.layout=org.apache.log4j.PatternLayout
+
+ # Output patterns (identical for both appenders)
+ log4j.appender.R.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n
+ log4j.appender.O.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n
|