datafu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mha...@apache.org
Subject [1/4] git commit: DATAFU-8: Port OpenNLP to DataFu
Date Fri, 31 Jan 2014 04:17:34 GMT
Updated Branches:
  refs/heads/master 9a4264504 -> d8cec6a5c


DATAFU-8: Port OpenNLP to DataFu

https://issues.apache.org/jira/browse/DATAFU-8

Signed-off-by: Matt Hayes <mhayes@linkedin.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/99e46e2c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/99e46e2c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/99e46e2c

Branch: refs/heads/master
Commit: 99e46e2cd80be6d7cecf56308b17a707d61d6660
Parents: 41a0c2c
Author: Russell Jurney <russell.jurney@gmail.com>
Authored: Wed Jan 29 15:04:41 2014 -0800
Committer: Matt Hayes <mhayes@linkedin.com>
Committed: Wed Jan 29 15:04:57 2014 -0800

----------------------------------------------------------------------
 .gitignore                                      |   4 +
 .travis.yml                                     |   1 +
 build.xml                                       |  16 +-
 ivy.xml                                         |   7 +-
 ivy/libraries.properties                        |   2 +
 .../datafu/pig/text/opennlp/CachedFile.java     |  41 ++++
 src/java/datafu/pig/text/opennlp/POSTag.java    | 177 +++++++++++++++++
 .../datafu/pig/text/opennlp/SentenceDetect.java | 122 ++++++++++++
 .../datafu/pig/text/opennlp/TokenizeME.java     | 126 ++++++++++++
 .../datafu/pig/text/opennlp/TokenizeSimple.java | 101 ++++++++++
 .../pig/text/opennlp/TokenizeWhitespace.java    | 102 ++++++++++
 test/pig/datafu/test/pig/text/NLPTests.java     | 195 +++++++++++++++++++
 12 files changed, 889 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/.gitignore
----------------------------------------------------------------------
diff --git a/.gitignore b/.gitignore
index fd6a420..81e9ae8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,7 @@ pigunit-input-overriden.txt
 *.asc
 /bin/
 .ant-targets-build.xml
+/data/
+*.iml
+.idea
+.DS_Store

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/.travis.yml
----------------------------------------------------------------------
diff --git a/.travis.yml b/.travis.yml
index 8fd4bc7..669b97a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,5 +12,6 @@ env:
   - TESTFOLDER=stats
   - TESTFOLDER=urls
   - TESTFOLDER=util
+  - TESTFOLDER=text
 jdk:
   - openjdk6

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/build.xml
----------------------------------------------------------------------
diff --git a/build.xml b/build.xml
index 8be0cbf..e04bc4e 100644
--- a/build.xml
+++ b/build.xml
@@ -27,6 +27,7 @@
   <property name="test.dir" value="${basedir}/test" />
   <property name="pigtestsrc.dir" value="${test.dir}/pig" />
   <property name="dist.dir" value="${basedir}/dist" />
+  <property name="data.dir" value="${basedir}/data" />
   <property name="tools.dir" value="${basedir}/tools" />
   <property name="lib.dir" value="${basedir}/lib" />
   <property name="common.lib.dir" value="${lib.dir}/common" />
@@ -49,6 +50,7 @@
   <property name="commons-math.jar" value="commons-math-jar-${commons-math.version}.jar"
/>
   <property name="guava.jar" value="guava-jar-${guava.version}.jar" />
   <property name="stream.jar" value="stream-jar-${stream.version}.jar" />
+  <property name="opennlp.jar" value="opennlp-tools-bundle-${opennlp.version}.jar" />
 
   <!-- Java configuration -->
   <property name="targetJavaVersion" value="1.5" />
@@ -109,6 +111,13 @@
     <mkdir dir="${ivy.jar.dir}"/>
     <get src="${maven.jar.repo.url}" dest="${maven.jar}" usetimestamp="true"/>
   </target>
+
+  <target name="opennlp-model-download" description="Download OpenNLP models">
+    <mkdir dir="${data.dir}"/>
+    <get src="http://opennlp.sourceforge.net/models-1.5/en-pos-maxent.bin" dest="${data.dir}/en-pos-maxent.bin"
usetimestamp="true"/>
+    <get src="http://opennlp.sourceforge.net/models-1.5/en-sent.bin" dest="${data.dir}/en-sent.bin"
usetimestamp="true"/>
+    <get src="http://opennlp.sourceforge.net/models-1.5/en-token.bin" dest="${data.dir}/en-token.bin"
usetimestamp="true"/>
+  </target>
   
   <target name="maven-taskdef" depends="maven-ant-tasks-jar-download">
     <path id="mvn-ant-task.classpath" path="${maven.jar}"/>
@@ -346,7 +355,7 @@
     <delete file="${dist.dir}/${final.name}-orig.jar" />
     <move file="${dist.dir}/${final.name}.jar" tofile="${dist.dir}/${final.name}-orig.jar"
/>
     <java jar="${tools.dir}/autojar.jar" fork="true">
-      <arg line="-baeq -o ${dist.dir}/${final.name}.jar -c ${packaged.lib.dir}/${fastutil.jar}:${packaged.lib.dir}/${commons-math.jar}:${packaged.lib.dir}/${stream.jar}:${packaged.lib.dir}/${guava.jar}
${dist.dir}/${final.name}-orig.jar" />
+      <arg line="-baeq -o ${dist.dir}/${final.name}.jar -c ${packaged.lib.dir}/*.jar ${dist.dir}/${final.name}-orig.jar"
/>
     </java>
     <delete file="${dist.dir}/${final.name}-orig.jar" />
     
@@ -360,6 +369,7 @@
         <rule pattern="org.apache.commons.math.**" result="datafu.org.apache.commons.math.@1"/>
         <rule pattern="com.clearspring.analytics.**" result="datafu.com.clearspring.analytics.@1"/>
         <rule pattern="com.google.common.**" result="datafu.com.google.common.@1"/>
+        <rule pattern="opennlp.**" result="datafu.opennlp.@1"/>
     </jarjar>
     <delete file="${dist.dir}/${final.name}-orig.jar" />
   </target>
@@ -388,7 +398,7 @@
     </jarjar>
   </target>
   
-  <target name="test" depends="build-pig-tests, jar" description="Runs the pig tests">
+  <target name="test" depends="build-pig-tests, jar, opennlp-model-download" description="Runs
the pig tests">
     <taskdef resource="testngtasks" classpath="${tools.lib.dir}/testng-jar-${testng.version}.jar"/>
     <testng classpathref="run-tests-classpath" methods="${test.methods}"
             outputDir="${report.dir}" verbose="2" haltonfailure="true" haltonskipped="true">
@@ -398,7 +408,7 @@
     </testng>
   </target>
   
-  <target name="test-instrumented" depends="build-pig-tests, jar-instrumented" description="Runs
the tests with instrumented JARs">
+  <target name="test-instrumented" depends="build-pig-tests, jar-instrumented, opennlp-model-download"
description="Runs the tests with instrumented JARs">
     <taskdef resource="testngtasks" classpath="${tools.lib.dir}/testng-jar-${testng.version}.jar"/>
     <testng classpathref="instrumented-test-classpath" methods="${test.methods}"
             outputDir="${report.dir}" haltonfailure="true" haltonskipped="true">

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/ivy.xml
----------------------------------------------------------------------
diff --git a/ivy.xml b/ivy.xml
index 28a99fe..f1f90fa 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -19,10 +19,13 @@
             <!-- don't include fastutil, as we will include it below and don't want it
in the common directory -->
             <exclude org="it.unimi.dsi" name="fastutil" />
         </dependency>
-        <dependency org="com.google.guava" name="guava" rev="${guava.version}" conf="packaged->default"/>
      
+        <dependency org="com.google.guava" name="guava" rev="${guava.version}" conf="packaged->default"/>
+        <dependency org="org.apache.opennlp" name="opennlp-tools" rev="${opennlp.version}"
conf="packaged->default"/>
+        <dependency org="org.apache.opennlp" name="opennlp-uima" rev="${opennlp.version}"
conf="packaged->default"/>
+        <dependency org="org.apache.opennlp" name="opennlp-maxent" rev="${opennlp-maxent.version}"
conf="packaged->default"/>
 
         <!-- hadoop and pig dependencies required for building but which are not included
in the pom because
-             we don't want to require a specific version -->
+we don't want to require a specific version -->
         <dependency org="org.apache.pig" name="pig" rev="${pig.version}" conf="hadoop->default"/>
         <dependency org="org.apache.hadoop" name="hadoop-core" rev="${hadoop.version}"
conf="hadoop->default"/>
 

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/ivy/libraries.properties
----------------------------------------------------------------------
diff --git a/ivy/libraries.properties b/ivy/libraries.properties
index d296101..55a217e 100644
--- a/ivy/libraries.properties
+++ b/ivy/libraries.properties
@@ -14,3 +14,5 @@ pig.version=0.11.1
 testng.version=6.2
 tools.version=1.4.2
 wagon-http.version=1.0-beta-2
+opennlp.version=1.5.3
+opennlp-maxent.version=3.0.3

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/CachedFile.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/CachedFile.java b/src/java/datafu/pig/text/opennlp/CachedFile.java
new file mode 100644
index 0000000..5832c81
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/CachedFile.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
/**
 * Helper for locating an OpenNLP model file that may have been shipped via
 * Pig's distributed cache (as a symlink) or may exist directly on disk.
 */
public class CachedFile {

    /**
     * Resolves the readable path for a model.
     *
     * @param modelLink name of the distributed-cache symlink for the model
     * @param modelFile raw path of the model file
     * @return {@code modelFile} if it exists, otherwise {@code modelLink} if it exists
     * @throws IOException if neither path exists
     */
    public static String getFileName(String modelLink, String modelFile) throws IOException {
        // Prefer the raw file name if present, falling back to the symlink.
        // Note: this helps with testing, as the distributed cache does not
        // appear to work with PigUnit.
        String loadFile = modelFile;
        if (!new File(loadFile).exists()) {
            if (new File(modelLink).exists()) {
                loadFile = modelLink;
            } else {
                // Bug fix: format arguments were swapped — the file path was
                // printed in the symlink slot and vice versa.
                throw new IOException(String.format(
                        "Could not load model, neither symlink %s nor file %s exist",
                        modelLink, modelFile));
            }
        }
        return loadFile;
    }
}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/POSTag.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/POSTag.java b/src/java/datafu/pig/text/opennlp/POSTag.java
new file mode 100644
index 0000000..fb17c63
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/POSTag.java
@@ -0,0 +1,177 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTaggerME;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * The OpenNLP POSTag UDF tags bags of sequential words with parts of speech and confidence
levels using the OpenNLP
+ * toolset, and specifically the POSTaggerME class.
+ * <p>
+ * Example:
+ * <pre>
+ * {@code
+ * define TokenizeME datafu.pig.text.opennlp.TokenizeME('data/en-token.bin');
+ * define POSTag datafu.pig.text.opennlp.POSTag('data/en-pos-maxent.bin');
+ *
+ * -- input:
+ * -- (Appetizers during happy hour range from low to high.)
+ * input = LOAD 'input' AS (text:chararray);
+ * --
+ * -- ({(Appetizers),(during),(happy),(hour),(range),(from),(low),(to),(high),(.)})
+ * tokenized = FOREACH input GENERATE TokenizeME(text) AS tokens;
+ * --
+ * -- output:
+ * -- Tuple schema is: (word, tag, confidence)
+ * -- ({(Appetizers,NNP,0.3619277937390988),(during,IN,0.7945543860326094),(happy,JJ,0.9888504792754391),
+ * -- (hour,NN,0.9427455123502427),(range,NN,0.7335527963654751),(from,IN,0.9911576465589752),(low,JJ,0.9652034031895174),
+ * -- (to,IN,0.7005347487371849),(high,JJ,0.8227771746247106),(.,.,0.9900983495480891)})
+ * output = FOREACH tokenized GENERATE POSTag(tokens) AS tagged;
+ * }
+ * </pre>
+ */
+public class POSTag extends EvalFunc<DataBag>
+{
+    private POSTaggerME tagger = null;
+    private static final String MODEL_FILE = "pos";
+    private TupleFactory tf = TupleFactory.getInstance();
+    private BagFactory bf = BagFactory.getInstance();
+    private String modelPath;
+
+    public POSTag(String modelPath) {
+        this.modelPath = modelPath;
+    }
+
+    @Override
+    public List<String> getCacheFiles() {
+        List<String> list = new ArrayList<String>(1);
+        list.add(this.modelPath + "#" + MODEL_FILE);
+        return list;
+    }
+
+    // Enable multiple languages by specifying the model path. See http://text.sourceforge.net/models-1.5/
+    public DataBag exec(Tuple input) throws IOException
+    {
+        DataBag inputBag = null;
+
+        if(input.size() != 1) {
+            throw new IOException();
+        }
+
+        inputBag = (DataBag)input.get(0);
+        DataBag outBag = bf.newDefaultBag();
+        if(this.tagger == null) {
+            String loadFile = CachedFile.getFileName(MODEL_FILE, this.modelPath);
+            InputStream modelIn = new FileInputStream(loadFile);
+            InputStream buffer = new BufferedInputStream(modelIn);
+            POSModel model = new POSModel(buffer);
+            this.tagger = new POSTaggerME(model);
+        }
+
+        // Form an inputString array thing for tagger to act on
+        int bagLength = (int)inputBag.size();
+        String[] words = new String[bagLength];
+
+        Iterator<Tuple> itr = inputBag.iterator();
+        int i = 0;
+        while(itr.hasNext()) {
+            words[i] = (String)itr.next().get(0);
+            i++;
+        }
+
+        // Compute tags and their probabilities
+        String tags[] = this.tagger.tag(words);
+        double probs[] = this.tagger.probs();
+
+        // Build output bag of 3-tuples
+        for(int j = 0; j < tags.length; j++) {
+            Tuple newTuple = tf.newTuple(3);
+            newTuple.set(0, words[j]);
+            newTuple.set(1, tags[j]);
+            newTuple.set(2, probs[j]);
+            outBag.add(newTuple);
+        }
+
+        return outBag;
+    }
+
+    @Override
+    public Schema outputSchema(Schema input)
+    {
+        try
+        {
+            Schema.FieldSchema inputFieldSchema = input.getField(0);
+
+            if (inputFieldSchema.type != DataType.BAG)
+            {
+                throw new RuntimeException("Expected a BAG as input");
+            }
+
+            Schema inputBagSchema = inputFieldSchema.schema;
+
+            if(inputBagSchema == null) {
+                return null;
+            }
+
+            if (inputBagSchema.getField(0).type != DataType.TUPLE)
+            {
+                throw new RuntimeException(String.format("Expected input bag to contain a
TUPLE, but instead found %s",
+                        DataType.findTypeName(inputBagSchema.getField(0).type)));
+            }
+
+            Schema inputTupleSchema = inputBagSchema.getField(0).schema;
+
+            if (inputTupleSchema.size() != 1)
+            {
+                throw new RuntimeException("Expected one field for the token data");
+            }
+
+            if (inputTupleSchema.getField(0).type != DataType.CHARARRAY)
+            {
+                throw new RuntimeException(String.format("Expected source to be a CHARARRAY,
but instead found %s",
+                        DataType.findTypeName(inputTupleSchema.getField(0).type)));
+            }
+
+            Schema tupleSchema = new Schema();
+            tupleSchema.add(new Schema.FieldSchema("token",DataType.CHARARRAY));
+            tupleSchema.add(new Schema.FieldSchema("tag",DataType.CHARARRAY));
+            tupleSchema.add(new Schema.FieldSchema("probability",DataType.DOUBLE));
+
+            return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
+                    .getName()
+                    .toLowerCase(), input),
+                    tupleSchema,
+                    DataType.BAG));
+        }
+        catch (FrontendException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/SentenceDetect.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/SentenceDetect.java b/src/java/datafu/pig/text/opennlp/SentenceDetect.java
new file mode 100644
index 0000000..50537fd
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/SentenceDetect.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * The OpenNLP SentenceDectectors segment an input paragraph into sentences.
+ * <p>
+ * Example:
+ * <pre>
+ * {@code
+ * define SentenceDetect datafu.pig.text.SentenceDetect('data/en-sent.bin');
+ *
+ * -- input:
+ * -- ("I believe the Masons have infiltrated the Apache PMC. I believe laser beams control
cat brains.")
+ * infoo = LOAD 'input' AS (text:chararray);
+
+ * -- output:
+ * -- ({(I believe the Masons have infiltrated the Apache PMC.)(I believe laser beams control
cat brains.)})
+ * outfoo = FOREACH input GENERATE SentenceDetect(text) as sentences;
+ * }
+ * </pre>
+ */
+public class SentenceDetect extends EvalFunc<DataBag>
+{
+    private SentenceDetectorME sdetector = null;
+    private static final String MODEL_FILE = "sentences";
+    private TupleFactory tf = TupleFactory.getInstance();
+    private BagFactory bf = BagFactory.getInstance();
+    private String modelPath = null;
+
+    public SentenceDetect(String modelPath) {
+        this.modelPath = modelPath;
+    }
+
+    @Override
+    public List<String> getCacheFiles() {
+        List<String> list = new ArrayList<String>(1);
+        list.add(this.modelPath + "#" + MODEL_FILE);
+        return list;
+    }
+
+    // Enable multiple languages by specifying the model path. See http://text.sourceforge.net/models-1.5/
+    public DataBag exec(Tuple input) throws IOException
+    {
+        if(input.size() != 1) {
+            throw new IOException();
+        }
+
+        String inputString = input.get(0).toString();
+        if(inputString == null || inputString == "") {
+            return null;
+        }
+        DataBag outBag = bf.newDefaultBag();
+        if(sdetector == null) {
+            String loadFile = CachedFile.getFileName(MODEL_FILE, this.modelPath);
+            InputStream is = new FileInputStream(modelPath);
+            InputStream buffer = new BufferedInputStream(is);
+            SentenceModel model = new SentenceModel(buffer);
+            this.sdetector = new SentenceDetectorME(model);
+        }
+        String sentences[] = this.sdetector.sentDetect(inputString);
+        for(String sentence : sentences) {
+            Tuple outTuple = tf.newTuple(sentence);
+            outBag.add(outTuple);
+        }
+        return outBag;
+    }
+
+    @Override
+    public Schema outputSchema(Schema input)
+    {
+        try
+        {
+            Schema.FieldSchema inputFieldSchema = input.getField(0);
+
+            if (inputFieldSchema.type != DataType.CHARARRAY)
+            {
+                throw new RuntimeException("Expected a CHARARRAY as input, but got a " +
inputFieldSchema.toString());
+            }
+
+            Schema tupleSchema = new Schema();
+            tupleSchema.add(new Schema.FieldSchema("sentence",DataType.CHARARRAY));
+
+            return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
+                    .getName()
+                    .toLowerCase(), input),
+                    tupleSchema,
+                    DataType.BAG));
+        }
+        catch (FrontendException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/TokenizeME.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/TokenizeME.java b/src/java/datafu/pig/text/opennlp/TokenizeME.java
new file mode 100644
index 0000000..f1f4257
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/TokenizeME.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * The OpenNLP Tokenizers segment an input character sequence into tokens using the OpenNLP
TokenizeME class, which is
+ * a probabilistic, 'maximum entropy' classifier.
+ * <p>
+ * Example:
+ * <pre>
+ * {@code
+ * define TokenizeME datafu.pig.text.opennlp.TokenizeME('data/en-token.bin');
+ *
+ * -- input:
+ * -- ("I believe the Masons have infiltrated the Apache PMC.")
+ * infoo = LOAD 'input' AS (text:chararray);
+
+ * -- output:
+ * -- ({(I),(believe),(the),(Masons),(have),(infiltrated),(the),(Apache),(PMC),(.)})
+ * outfoo = FOREACH input GENERATE TokenizeME(text) as tokens;
+ * }
+ * </pre>
+ */
+
+
+
+public class TokenizeME extends EvalFunc<DataBag>
+{
+    private TokenizerME tokenizer = null;
+    private static final String MODEL_FILE = "tokens";
+    private TupleFactory tf = TupleFactory.getInstance();
+    private BagFactory bf = BagFactory.getInstance();
+    private String modelPath;
+
+    public TokenizeME(String modelPath) {
+        this.modelPath = modelPath;
+    }
+
+    @Override
+    public List<String> getCacheFiles() {
+        List<String> list = new ArrayList<String>(1);
+        list.add(this.modelPath + "#" + MODEL_FILE);
+        return list;
+    }
+
+    // Enable multiple languages by specifying the model path. See http://text.sourceforge.net/models-1.5/
+    public DataBag exec(Tuple input) throws IOException
+    {
+        if(input.size() != 1) {
+            throw new IOException();
+        }
+
+        String inputString = input.get(0).toString();
+        if(inputString == null || inputString == "") {
+            return null;
+        }
+        DataBag outBag = bf.newDefaultBag();
+        if(this.tokenizer == null) {
+            String loadFile = CachedFile.getFileName(MODEL_FILE, this.modelPath);;
+            InputStream file = new FileInputStream(loadFile);
+            InputStream buffer = new BufferedInputStream(file);
+            TokenizerModel model = new TokenizerModel(buffer);
+            this.tokenizer = new TokenizerME(model);
+        }
+        String tokens[] = this.tokenizer.tokenize(inputString);
+        for(String token : tokens) {
+            Tuple outTuple = tf.newTuple(token);
+            outBag.add(outTuple);
+        }
+        return outBag;
+    }
+
+    @Override
+    public Schema outputSchema(Schema input)
+    {
+        try
+        {
+            Schema.FieldSchema inputFieldSchema = input.getField(0);
+
+            if (inputFieldSchema.type != DataType.CHARARRAY)
+            {
+                throw new RuntimeException("Expected a CHARARRAY as input, but got a " +
inputFieldSchema.toString());
+            }
+
+            Schema tupleSchema = new Schema();
+            tupleSchema.add(new Schema.FieldSchema("token",DataType.CHARARRAY));
+
+            return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
+                    .getName()
+                    .toLowerCase(), input),
+                    tupleSchema,
+                    DataType.BAG));
+        }
+        catch (FrontendException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/TokenizeSimple.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/TokenizeSimple.java b/src/java/datafu/pig/text/opennlp/TokenizeSimple.java
new file mode 100644
index 0000000..cea48b4
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/TokenizeSimple.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.tokenize.SimpleTokenizer;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * The OpenNLP Tokenizers segment an input character sequence into tokens. This one uses
the OpenNLP class SimpleTokenizer
+ * <p>
+ * Example:
+ * <pre>
+ * {@code
+ * define TokenizeSimple datafu.pig.text.opennlp.TokenizeSimple();
+ *
+ * -- input:
+ * -- ("I believe the Masons have infiltrated the Apache PMC.")
+ * infoo = LOAD 'input' AS (text:chararray);
+
+ * -- output:
+ * -- ({(I),(believe),(the),(Masons),(have),(infiltrated),(the),(Apache),(PMC),(.)})
+ * outfoo = FOREACH input GENERATE TokenizeSimple(text) as tokens;
+ * }
+ * </pre>
+ */
+public class TokenizeSimple extends EvalFunc<DataBag>
+{
+    private SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
+    private TupleFactory tf = TupleFactory.getInstance();
+    private BagFactory bf = BagFactory.getInstance();
+
+    public DataBag exec(Tuple input) throws IOException
+    {
+        if(input.size() != 1) {
+            throw new IOException();
+        }
+
+        String inputString = input.get(0).toString();
+        if(inputString == null || inputString == "") {
+            return null;
+        }
+
+        DataBag outBag = bf.newDefaultBag();
+        String tokens[] = tokenizer.tokenize(inputString);
+        for(String token : tokens) {
+            Tuple outTuple = tf.newTuple(token);
+            outBag.add(outTuple);
+        }
+        return outBag;
+    }
+
+    @Override
+    public Schema outputSchema(Schema input)
+    {
+        try
+        {
+            Schema.FieldSchema inputFieldSchema = input.getField(0);
+
+            if (inputFieldSchema.type != DataType.CHARARRAY)
+            {
+                throw new RuntimeException("Expected a CHARARRAY as input, but got a " +
inputFieldSchema.toString());
+            }
+
+            Schema tupleSchema = new Schema();
+            tupleSchema.add(new Schema.FieldSchema("token",DataType.CHARARRAY));
+
+            return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
+                    .getName()
+                    .toLowerCase(), input),
+                    tupleSchema,
+                    DataType.BAG));
+        }
+        catch (FrontendException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/src/java/datafu/pig/text/opennlp/TokenizeWhitespace.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/text/opennlp/TokenizeWhitespace.java b/src/java/datafu/pig/text/opennlp/TokenizeWhitespace.java
new file mode 100644
index 0000000..8efafb0
--- /dev/null
+++ b/src/java/datafu/pig/text/opennlp/TokenizeWhitespace.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package datafu.pig.text.opennlp;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.*;
+import org.apache.pig.impl.logicalLayer.FrontendException;
+import org.apache.pig.impl.logicalLayer.schema.Schema;
+
+/**
+ * The OpenNLP Tokenizers segment an input character sequence into tokens. This one uses
the OpenNLP class
+ * WhitespaceTokenizer.
+ * <p>
+ * Example:
+ * <pre>
+ * {@code
+ * define TokenizeWhitespace datafu.pig.text.opennlp.TokenizeWhitespace();
+ *
+ * -- input:
+ * -- ("I believe the Masons have infiltrated the Apache PMC.")
+ * infoo = LOAD 'input' AS (text:chararray);
+
+ * -- output:
+ * -- ({(I),(believe),(the),(Masons),(have),(infiltrated),(the),(Apache),(PMC),(.)})
+ * outfoo = FOREACH input GENERATE TokenizeWhitespace(text) as tokens;
+ * }
+ * </pre>
+ */
+public class TokenizeWhitespace extends EvalFunc<DataBag>
+{
+    private WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+    private TupleFactory tf = TupleFactory.getInstance();
+    private BagFactory bf = BagFactory.getInstance();
+
+    public DataBag exec(Tuple input) throws IOException
+    {
+        if(input.size() != 1) {
+            throw new IOException();
+        }
+
+        String inputString = input.get(0).toString();
+        if(inputString == null || inputString == "") {
+            return null;
+        }
+
+        DataBag outBag = bf.newDefaultBag();
+        String tokens[] = tokenizer.tokenize(inputString);
+        for(String token : tokens) {
+            Tuple outTuple = tf.newTuple(token);
+            outBag.add(outTuple);
+        }
+        return outBag;
+    }
+
+    @Override
+    public Schema outputSchema(Schema input)
+    {
+        try
+        {
+            Schema.FieldSchema inputFieldSchema = input.getField(0);
+
+            if (inputFieldSchema.type != DataType.CHARARRAY)
+            {
+                throw new RuntimeException("Expected a CHARARRAY as input, but got a " +
inputFieldSchema.toString());
+            }
+
+            Schema tupleSchema = new Schema();
+            tupleSchema.add(new Schema.FieldSchema("token",DataType.CHARARRAY));
+
+            return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass()
+                    .getName()
+                    .toLowerCase(), input),
+                    tupleSchema,
+                    DataType.BAG));
+        }
+        catch (FrontendException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/99e46e2c/test/pig/datafu/test/pig/text/NLPTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/text/NLPTests.java b/test/pig/datafu/test/pig/text/NLPTests.java
new file mode 100644
index 0000000..372b17d
--- /dev/null
+++ b/test/pig/datafu/test/pig/text/NLPTests.java
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.text;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+
+public class NLPTests extends PigTests
+{
+    /**
+     register $JAR_PATH
+
+     define SentenceDetect datafu.pig.text.opennlp.SentenceDetect('data/en-sent.bin');
+
+     data = LOAD 'input' AS (text: chararray);
+
+     dump data;
+
+     data2 = FOREACH data GENERATE SentenceDetect(text) AS sentences;
+
+     dump data2;
+
+     STORE data2 INTO 'output';
+     */
+    @Multiline
+    private String sentenceDetectTest;
+
+    @Test
+    public void sentenceDetectTest() throws Exception
+    {
+        PigTest test = createPigTestFromString(sentenceDetectTest);
+
+        writeLinesToFile("input",
+                "This is a sentence. This is another sentence.",
+                "Yet another sentence. One more just for luck.");
+
+        assertOutput(test, "data2",
+                "({(This is a sentence.),(This is another sentence.)})",
+                "({(Yet another sentence.),(One more just for luck.)})");
+    }
+
+    /**
+     register $JAR_PATH
+
+     define TokenizeME datafu.pig.text.opennlp.TokenizeME('data/en-token.bin');
+
+     data = LOAD 'input' AS (text: chararray);
+
+     dump data;
+
+     data2 = FOREACH data GENERATE TokenizeME(text) AS tokens;
+
+     dump data2;
+
+     STORE data2 INTO 'output';
+     */
+    @Multiline
+    private String tokenizeMETest;
+
+    @Test
+    public void tokenizeMETest() throws Exception
+    {
+        PigTest test = createPigTestFromString(tokenizeMETest);
+
+        writeLinesToFile("input",
+                "This is a sentence. This is another sentence.",
+                "Yet another sentence. One more just for luck.");
+
+        assertOutput(test, "data2",
+                "({(This),(is),(a),(sentence),(.),(This),(is),(another),(sentence),(.)})",
+                "({(Yet),(another),(sentence),(.),(One),(more),(just),(for),(luck),(.)})");
+    }
+
+    /**
+     register $JAR_PATH
+
+     define TokenizeSimple datafu.pig.text.opennlp.TokenizeSimple();
+
+     data = LOAD 'input' AS (text: chararray);
+
+     dump data;
+
+     data2 = FOREACH data GENERATE TokenizeSimple(text) AS tokens;
+
+     dump data2;
+
+     STORE data2 INTO 'output';
+     */
+    @Multiline
+    private String tokenizeSimpleTest;
+
+    @Test
+    public void tokenizeSimpleTest() throws Exception
+    {
+        PigTest test = createPigTestFromString(tokenizeSimpleTest);
+
+        writeLinesToFile("input",
+                "This is a sentence. This is another sentence.",
+                "Yet another sentence. One more just for luck.");
+
+        assertOutput(test, "data2",
+                "({(This),(is),(a),(sentence),(.),(This),(is),(another),(sentence),(.)})",
+                "({(Yet),(another),(sentence),(.),(One),(more),(just),(for),(luck),(.)})");
+    }
+
+    /**
+     register $JAR_PATH
+
+     define TokenizeWhitespace datafu.pig.text.opennlp.TokenizeWhitespace();
+
+     data = LOAD 'input' AS (text: chararray);
+
+     dump data;
+
+     data2 = FOREACH data GENERATE TokenizeWhitespace(text) AS tokens;
+
+     dump data2;
+
+     STORE data2 INTO 'output';
+     */
+    @Multiline
+    private String tokenizeWhitespaceTest;
+
+    @Test
+    public void tokenizeWhitespaceTest() throws Exception
+    {
+        PigTest test = createPigTestFromString(tokenizeWhitespaceTest);
+
+        writeLinesToFile("input",
+                "This is a sentence. This is another sentence.",
+                "Yet another sentence. One more just for luck.");
+
+        assertOutput(test, "data2",
+                "({(This),(is),(a),(sentence.),(This),(is),(another),(sentence.)})",
+                "({(Yet),(another),(sentence.),(One),(more),(just),(for),(luck.)})");
+    }
+
+    /**
+     register $JAR_PATH
+
+     define TokenizeME datafu.pig.text.opennlp.TokenizeME('data/en-token.bin');
+     define POSTag datafu.pig.text.opennlp.POSTag('data/en-pos-maxent.bin');
+
+     data = LOAD 'input' AS (text: chararray);
+
+     dump data;
+
+     data2 = FOREACH data GENERATE TokenizeME(text) AS tokens;
+
+     dump data2;
+
+     data3 = FOREACH data2 GENERATE POSTag(tokens) as tagged;
+
+     dump data3
+
+     STORE data3 INTO 'output';
+     */
+    @Multiline
+    private String POSTagTest;
+
+    @Test
+    public void POSTagTest() throws Exception
+    {
+        PigTest test = createPigTestFromString(POSTagTest);
+
+        writeLinesToFile("input",
+                "This is a sentence. This is another sentence.",
+                "Yet another sentence. One more just for luck.");
+
+        assertOutput(test, "data3",
+                "({(This,DT,0.9649410482478001),(is,VBZ,0.9982592902509803),(a,DT,0.9967282012835504),(sentence,NN,0.9772619256460584),(.,.,0.4391067883074289),(This,DT,0.8346710130761914),(is,VBZ,0.9928885242823617),(another,DT,0.9761159923140399),(sentence,NN,0.9964463493238542),(.,.,0.9856037689871404)})",
+                "({(Yet,RB,0.7638997090011364),(another,DT,0.9657669183153523),(sentence,NN,0.989193114719676),(.,.,0.20091718589945456),(One,CD,0.9229251494813668),(more,JJR,0.9360382000551335),(just,RB,0.8646324491545225),(for,IN,0.9851765355889605),(luck,NN,0.9883408827371651),(.,.,0.9746378518791978)})");
+    }
+}


Mime
View raw message