datafu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From wvaug...@apache.org
Subject [11/19] DATAFU-27 Migrate build system to Gradle
Date Tue, 04 Mar 2014 07:09:29 GMT
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/datafu-pig/src/test/java/datafu/test/pig/stats/VARTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/stats/VARTests.java b/datafu-pig/src/test/java/datafu/test/pig/stats/VARTests.java
new file mode 100644
index 0000000..cd59504
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/stats/VARTests.java
@@ -0,0 +1,521 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.BagFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.pigunit.PigTest;
+import org.junit.Assert;
+import org.testng.annotations.Test;
+
+import datafu.pig.stats.DoubleVAR;
+import datafu.pig.stats.FloatVAR;
+import datafu.pig.stats.IntVAR;
+import datafu.pig.stats.LongVAR;
+import datafu.pig.stats.VAR;
+import datafu.test.pig.PigTests;
+
+public class VARTests  extends PigTests
+{
+  /**
+  
+
+  define VAR datafu.pig.stats.VAR();
+  
+  data_in = LOAD 'input' as (val:$VAL_TYPE);
+  data_out = GROUP data_in ALL;
+  data_out = FOREACH data_out GENERATE VAR(data_in.val) AS variance; 
+  
+  --describe data_out;
+  STORE data_out into 'output';
+   */
+  @Multiline private String varTest;
+  
+  @Test
+  public void varTestByteArray() throws Exception
+  {
+    PigTest test = createPigTestFromString(varTest,
+                                           "VAL_TYPE=bytearray");
+
+    String[] input = {"1","2","3","4","10","5","6","7","8","9"};
+    writeLinesToFile("input", input);
+        
+    test.runScript();
+    
+    List<Tuple> output = getLinesForAlias(test, "data_out", true);
+    
+    assertEquals(output.size(),1);
+    assertEquals(output.get(0).toString(), "(8.25)");
+  }
+  
+  @Test
+  public void varTestDouble() throws Exception
+  {
+    PigTest test = createPigTestFromString(varTest,
+        "VAL_TYPE=double");
+
+    String[] input = {"1.0","2.0","3.0","4.0","10.0","5.0","6.0","7.0","8.0","9.0"};
+    writeLinesToFile("input", input);
+        
+    test.runScript();
+    
+    List<Tuple> output = getLinesForAlias(test, "data_out", true);
+    
+    assertEquals(output.size(),1);
+    assertEquals(output.get(0).toString(), "(8.25)");
+  }
+  
+  @Test
+  public void varTestFloat() throws Exception
+  {
+    PigTest test = createPigTestFromString(varTest,
+        "VAL_TYPE=float");
+
+    String[] input = {"1.0","2.0","3.0","4.0","10.0","5.0","6.0","7.0","8.0","9.0"};
+    writeLinesToFile("input", input);
+        
+    test.runScript();
+    
+    List<Tuple> output = getLinesForAlias(test, "data_out", true);
+    
+    assertEquals(output.size(),1);
+    assertEquals(output.get(0).toString(), "(8.25)");
+  }
+  
+  @Test
+  public void varTestInteger() throws Exception
+  {
+    PigTest test = createPigTestFromString(varTest,
+        "VAL_TYPE=int");
+
+    String[] input = {"1","2","3","4","10","5","6","7","8","9"};
+    writeLinesToFile("input", input);
+        
+    test.runScript();
+    
+    List<Tuple> output = getLinesForAlias(test, "data_out", true);
+    
+    assertEquals(output.size(),1);
+    assertEquals(output.get(0).toString(), "(8.25)");
+  }
+  
+  @Test
+  public void varTestLong() throws Exception
+  {
+    PigTest test = createPigTestFromString(varTest,
+        "VAL_TYPE=long");
+
+    String[] input = {"1","2","3","4","10","5","6","7","8","9"};
+    writeLinesToFile("input", input);
+        
+    test.runScript();
+    
+    List<Tuple> output = getLinesForAlias(test, "data_out", true);
+    
+    assertEquals(output.size(),1);
+    assertEquals(output.get(0).toString(), "(8.25)");
+  }
+
+  @Test
+  public void varTestOneData() throws Exception
+  {
+    PigTest test = createPigTestFromString(varTest,
+        "VAL_TYPE=double");
+
+    String[] input = {"5.0"};
+    writeLinesToFile("input", input);
+        
+    test.runScript();
+    
+    List<Tuple> output = getLinesForAlias(test, "data_out", true);
+    
+    assertEquals(output.size(),1);
+    assertEquals(output.get(0).toString(), "(0.0)");
+  }
+  
+  
+  @Test
+  public void varTestZeroData() throws Exception
+  {
+    PigTest test = createPigTestFromString(varTest,
+        "VAL_TYPE=long");
+
+    String[] input = {};
+    writeLinesToFile("input", input);
+        
+    test.runScript();
+    
+    List<Tuple> output = getLinesForAlias(test, "data_out", true);
+    
+    assertEquals(output.size(),0);
+  }
+  
+  @Test
+  public void varTestNullEntry() throws Exception
+  {
+    PigTest test = createPigTestFromString(varTest,
+        "VAL_TYPE=double");
+
+    String[] input = {"1","2","3","4","10","5","6","7","8","9","null"};
+    writeLinesToFile("input", input);
+        
+    test.runScript();
+    
+    List<Tuple> output = getLinesForAlias(test, "data_out", true);
+    
+    assertEquals(output.size(),1);
+    assertEquals(output.get(0).toString(), "(8.25)");
+  }
+  
+  @Test
+  public void varTestNullEntries() throws Exception
+  {
+    PigTest test = createPigTestFromString(varTest,
+        "VAL_TYPE=float");
+
+    String[] input = {"1","2","3","4","10","5","6","7","8","9","null","null"};
+    writeLinesToFile("input", input);
+        
+    test.runScript();
+    
+    List<Tuple> output = getLinesForAlias(test, "data_out", true);
+    
+    assertEquals(output.size(),1);
+    assertEquals(output.get(0).toString(), "(8.25)");
+  }
+  
+  @Test
+  public void varExecTest() throws Exception {
+    DoubleVAR var = new DoubleVAR();
+    
+    DataBag bag;
+    Tuple input;
+    Double result;
+    
+    bag = BagFactory.getInstance().newDefaultBag();
+    for (int i=1; i<=1000; i++)
+    {
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (double)i);
+      bag.add(t);
+    }
+    
+    input = TupleFactory.getInstance().newTuple(1);
+    input.set(0, bag);
+    
+    result = var.exec(input);
+    Assert.assertTrue("Expected about 83333.25 but found " + result,Math.abs(83333.25 - result) < 0.0001);
+    
+    // do it again to check cleanup
+        
+    bag = BagFactory.getInstance().newDefaultBag();
+    for (int i=1; i<=2000; i++)
+    {
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (double)i);
+      bag.add(t);
+    }
+    
+    input = TupleFactory.getInstance().newTuple(1);
+    input.set(0, bag);
+    
+    result = var.exec(input);
+    Assert.assertTrue("Expected about 333333.2 but found " + result,Math.abs(333333.2 - result) < 1);
+  }
+  
+  @Test
+  public void varAccumulateTest() throws Exception {
+    DoubleVAR var = new DoubleVAR();
+    
+    Double result;   
+    
+    for (int i=1; i<=1000; i++)
+    {
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (double)i);
+      DataBag bag = BagFactory.getInstance().newDefaultBag();
+      bag.add(t);
+      Tuple input = TupleFactory.getInstance().newTuple(1);
+      input.set(0, bag);
+      var.accumulate(input);
+    }
+    
+    result = var.getValue();
+    Assert.assertTrue("Expected about 83333.25 but found " + result,Math.abs(83333.25 - result) < 0.0001);
+    
+    // do it again to check cleanup
+    var.cleanup();
+        
+    for (int i=1; i<=2000; i++)
+    {
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (double)i);
+      DataBag bag = BagFactory.getInstance().newDefaultBag();
+      bag.add(t);
+      Tuple input = TupleFactory.getInstance().newTuple(1);
+      input.set(0, bag);
+      var.accumulate(input);
+    }
+    
+    result = var.getValue();
+    Assert.assertTrue("Expected about 333333.2 but found " + result,Math.abs(333333.2 - result) < 1);
+  }
+  
+  // make sure intermediate works, where initial just passes through a single tuple, and intermediate receives a large bag of the resulting tuples
+  @Test
+  public void varDoubleAlgebraicIntermediateTest() throws Exception {
+    DoubleVAR.Initial initialVar = new DoubleVAR.Initial();
+    DoubleVAR.Intermediate intermediateVar = new DoubleVAR.Intermediate();
+    DoubleVAR.Final finalVar = new DoubleVAR.Final();
+    
+    
+    DataBag intermediateBag = BagFactory.getInstance().newDefaultBag();
+    
+    for (int i=1; i<=1000; i++)
+    {
+      DataBag bag;
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (double)i);
+      bag = BagFactory.getInstance().newDefaultBag();
+      bag.add(t);
+      Tuple input = TupleFactory.getInstance().newTuple(bag);
+      Tuple intermediateTuple = initialVar.exec(input);
+      intermediateBag.add(intermediateTuple);
+    }
+           
+    Tuple intermediateTuple = intermediateVar.exec(TupleFactory.getInstance().newTuple(intermediateBag));  
+    intermediateBag = BagFactory.getInstance().newDefaultBag(Arrays.asList(intermediateTuple));
+    Double result = finalVar.exec(TupleFactory.getInstance().newTuple(intermediateBag));
+    
+    Assert.assertTrue("Expected about 83333.25 but found " + result,Math.abs(83333.25 - result) < 0.0001);
+  }
+  
+  // make sure final works, where initial just passes through a single tuple, intermediate does the same, and final receives the remainder
+  @Test
+  public void varDoubleAlgebraicFinalTest() throws Exception {
+    DoubleVAR.Initial initialVar = new DoubleVAR.Initial();
+    DoubleVAR.Intermediate intermediateVar = new DoubleVAR.Intermediate();
+    DoubleVAR.Final finalVar = new DoubleVAR.Final();
+    
+    DataBag finalBag = BagFactory.getInstance().newDefaultBag();
+    
+    for (int i=1; i<=1000; i++)
+    {
+      DataBag bag;
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (double)i);
+      bag = BagFactory.getInstance().newDefaultBag();
+      bag.add(t);
+      Tuple input = TupleFactory.getInstance().newTuple(bag);
+      Tuple intermediateTuple = initialVar.exec(input);
+      DataBag intermediateBag = BagFactory.getInstance().newDefaultBag();
+      intermediateBag.add(intermediateTuple);
+      intermediateTuple = intermediateVar.exec(TupleFactory.getInstance().newTuple(intermediateBag)); 
+      finalBag.add(intermediateTuple);
+    }
+     
+    Double result = finalVar.exec(TupleFactory.getInstance().newTuple(finalBag));
+    
+    Assert.assertTrue("Expected about 83333.25 but found " + result,Math.abs(83333.25 - result) < 0.0001);
+  }
+  
+  // make sure intermediate works, where initial just passes through a single tuple, and intermediate receives a large bag of the resulting tuples
+  @Test
+  public void varFloatAlgebraicIntermediateTest() throws Exception {
+    FloatVAR.Initial initialVar = new FloatVAR.Initial();
+    FloatVAR.Intermediate intermediateVar = new FloatVAR.Intermediate();
+    FloatVAR.Final finalVar = new FloatVAR.Final();
+    
+    
+    DataBag intermediateBag = BagFactory.getInstance().newDefaultBag();
+    
+    for (int i=1; i<=1000; i++)
+    {
+      DataBag bag;
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (float)i);
+      bag = BagFactory.getInstance().newDefaultBag();
+      bag.add(t);
+      Tuple input = TupleFactory.getInstance().newTuple(bag);
+      Tuple intermediateTuple = initialVar.exec(input);
+      intermediateBag.add(intermediateTuple);
+    }
+           
+    Tuple intermediateTuple = intermediateVar.exec(TupleFactory.getInstance().newTuple(intermediateBag));  
+    intermediateBag = BagFactory.getInstance().newDefaultBag(Arrays.asList(intermediateTuple));
+    Double result = finalVar.exec(TupleFactory.getInstance().newTuple(intermediateBag));
+    
+    Assert.assertTrue("Expected about 83333.25 but found " + result,Math.abs(83333.25 - result) < 0.0001);
+  }
+  
+  // make sure final works, where initial just passes through a single tuple, intermediate does the same, and final receives the remainder
+  @Test
+  public void varFloatAlgebraicFinalTest() throws Exception {
+    FloatVAR.Initial initialVar = new FloatVAR.Initial();
+    FloatVAR.Intermediate intermediateVar = new FloatVAR.Intermediate();
+    FloatVAR.Final finalVar = new FloatVAR.Final();
+    
+    DataBag finalBag = BagFactory.getInstance().newDefaultBag();
+    
+    for (int i=1; i<=1000; i++)
+    {
+      DataBag bag;
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (float)i);
+      bag = BagFactory.getInstance().newDefaultBag();
+      bag.add(t);
+      Tuple input = TupleFactory.getInstance().newTuple(bag);
+      Tuple intermediateTuple = initialVar.exec(input);
+      DataBag intermediateBag = BagFactory.getInstance().newDefaultBag();
+      intermediateBag.add(intermediateTuple);
+      intermediateTuple = intermediateVar.exec(TupleFactory.getInstance().newTuple(intermediateBag)); 
+      finalBag.add(intermediateTuple);
+    }
+     
+    Double result = finalVar.exec(TupleFactory.getInstance().newTuple(finalBag));
+    
+    Assert.assertTrue("Expected about 83333.25 but found " + result,Math.abs(83333.25 - result) < 0.0001);
+  }
+  
+  // make sure intermediate works, where initial just passes through a single tuple, and intermediate receives a large bag of the resulting tuples
+  @Test
+  public void varIntAlgebraicIntermediateTest() throws Exception {
+    IntVAR.Initial initialVar = new IntVAR.Initial();
+    IntVAR.Intermediate intermediateVar = new IntVAR.Intermediate();
+    IntVAR.Final finalVar = new IntVAR.Final();
+    
+    
+    DataBag intermediateBag = BagFactory.getInstance().newDefaultBag();
+    
+    for (int i=1; i<=1000; i++)
+    {
+      DataBag bag;
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (int)i);
+      bag = BagFactory.getInstance().newDefaultBag();
+      bag.add(t);
+      Tuple input = TupleFactory.getInstance().newTuple(bag);
+      Tuple intermediateTuple = initialVar.exec(input);
+      intermediateBag.add(intermediateTuple);
+    }
+           
+    Tuple intermediateTuple = intermediateVar.exec(TupleFactory.getInstance().newTuple(intermediateBag));  
+    intermediateBag = BagFactory.getInstance().newDefaultBag(Arrays.asList(intermediateTuple));
+    Double result = finalVar.exec(TupleFactory.getInstance().newTuple(intermediateBag));
+    
+    Assert.assertTrue("Expected about 83333.25 but found " + result,Math.abs(83333.25 - result) < 0.0001);
+  }
+  
+  // make sure final works, where initial just passes through a single tuple, intermediate does the same, and final receives the remainder
+  @Test
+  public void varIntAlgebraicFinalTest() throws Exception {
+    IntVAR.Initial initialVar = new IntVAR.Initial();
+    IntVAR.Intermediate intermediateVar = new IntVAR.Intermediate();
+    IntVAR.Final finalVar = new IntVAR.Final();
+    
+    DataBag finalBag = BagFactory.getInstance().newDefaultBag();
+    
+    for (int i=1; i<=1000; i++)
+    {
+      DataBag bag;
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (int)i);
+      bag = BagFactory.getInstance().newDefaultBag();
+      bag.add(t);
+      Tuple input = TupleFactory.getInstance().newTuple(bag);
+      Tuple intermediateTuple = initialVar.exec(input);
+      DataBag intermediateBag = BagFactory.getInstance().newDefaultBag();
+      intermediateBag.add(intermediateTuple);
+      intermediateTuple = intermediateVar.exec(TupleFactory.getInstance().newTuple(intermediateBag)); 
+      finalBag.add(intermediateTuple);
+    }
+     
+    Double result = finalVar.exec(TupleFactory.getInstance().newTuple(finalBag));
+    
+    Assert.assertTrue("Expected about 83333.25 but found " + result,Math.abs(83333.25 - result) < 0.0001);
+  }
+  
+  // make sure intermediate works, where initial just passes through a single tuple, and intermediate receives a large bag of the resulting tuples
+  @Test
+  public void varLongAlgebraicIntermediateTest() throws Exception {
+    LongVAR.Initial initialVar = new LongVAR.Initial();
+    LongVAR.Intermediate intermediateVar = new LongVAR.Intermediate();
+    LongVAR.Final finalVar = new LongVAR.Final();
+    
+    
+    DataBag intermediateBag = BagFactory.getInstance().newDefaultBag();
+    
+    for (int i=1; i<=1000; i++)
+    {
+      DataBag bag;
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (long)i);
+      bag = BagFactory.getInstance().newDefaultBag();
+      bag.add(t);
+      Tuple input = TupleFactory.getInstance().newTuple(bag);
+      Tuple intermediateTuple = initialVar.exec(input);
+      intermediateBag.add(intermediateTuple);
+    }
+           
+    Tuple intermediateTuple = intermediateVar.exec(TupleFactory.getInstance().newTuple(intermediateBag));  
+    intermediateBag = BagFactory.getInstance().newDefaultBag(Arrays.asList(intermediateTuple));
+    Double result = finalVar.exec(TupleFactory.getInstance().newTuple(intermediateBag));
+    
+    Assert.assertTrue("Expected about 83333.25 but found " + result,Math.abs(83333.25 - result) < 0.0001);
+  }
+  
+  // make sure final works, where initial just passes through a single tuple, intermediate does the same, and final receives the remainder
+  @Test
+  public void varLongAlgebraicFinalTest() throws Exception {
+    LongVAR.Initial initialVar = new LongVAR.Initial();
+    LongVAR.Intermediate intermediateVar = new LongVAR.Intermediate();
+    LongVAR.Final finalVar = new LongVAR.Final();
+    
+    DataBag finalBag = BagFactory.getInstance().newDefaultBag();
+    
+    for (int i=1; i<=1000; i++)
+    {
+      DataBag bag;
+      Tuple t = TupleFactory.getInstance().newTuple(1);
+      t.set(0, (long)i);
+      bag = BagFactory.getInstance().newDefaultBag();
+      bag.add(t);
+      Tuple input = TupleFactory.getInstance().newTuple(bag);
+      Tuple intermediateTuple = initialVar.exec(input);
+      DataBag intermediateBag = BagFactory.getInstance().newDefaultBag();
+      intermediateBag.add(intermediateTuple);
+      intermediateTuple = intermediateVar.exec(TupleFactory.getInstance().newTuple(intermediateBag)); 
+      finalBag.add(intermediateTuple);
+    }
+     
+    Double result = finalVar.exec(TupleFactory.getInstance().newTuple(finalBag));
+    
+    Assert.assertTrue("Expected about 83333.25 but found " + result,Math.abs(83333.25 - result) < 0.0001);
+  }
+ }

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/datafu-pig/src/test/java/datafu/test/pig/stats/WilsonBinConfTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/stats/WilsonBinConfTests.java b/datafu-pig/src/test/java/datafu/test/pig/stats/WilsonBinConfTests.java
new file mode 100644
index 0000000..b888595
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/stats/WilsonBinConfTests.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+/**
+ * Checks the WilsonBinConf UDF against confidence intervals computed with R's
+ * Hmisc {@code binconf} function.
+ */
+public class WilsonBinConfTests extends PigTests
+{
+  /**
+  
+
+  define WilsonBinConf datafu.pig.stats.WilsonBinConf('$alpha');
+  
+  data = load 'input' as (successes:long, totals:long);
+  --describe data;
+  
+  data_out = FOREACH data GENERATE WilsonBinConf(successes, totals) as interval;
+  data_out = FOREACH data_out GENERATE FLATTEN(interval);
+  
+  store data_out into 'output';
+   */
+  @Multiline private String wilsonBinConf;
+  
+  /**
+   * Feeds (successes, totals) pairs through the script and compares each
+   * (lower, upper) interval, rounded to five decimals, with the R reference value
+   * for the same row.
+   */
+  @Test
+  public void wilsonTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(wilsonBinConf,
+                                 "alpha=0.05"); // alpha is 0.05 for 95% confidence
+    
+    writeLinesToFile("input",
+                     "1\t1",
+                     "1\t2",
+                     "50\t100",
+                     "500\t1000",
+                     "999\t1000",
+                     "1000\t1000",
+                     "998\t1000");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * library(Hmisc)
+     * 
+     * binconf(50,100)
+     * binconf(500,1000)
+     * 
+     */
+    List<String> expectedOutput = new ArrayList<String>();
+    expectedOutput.add("0.05129,1.00000");
+    expectedOutput.add("0.02565,0.97435");
+    expectedOutput.add("0.40383,0.59617");
+    expectedOutput.add("0.46907,0.53093");
+    expectedOutput.add("0.99436,0.99995");
+    expectedOutput.add("0.99617,1.00000");
+    expectedOutput.add("0.99274,0.99945");
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+
+    // Without this check an empty or truncated output would pass, because the
+    // loop below only runs once per tuple that was actually produced.
+    assertEquals(output.size(), expectedOutput.size(), "number of confidence intervals");
+
+    Iterator<String> expectationIterator = expectedOutput.iterator();
+    for (Tuple t : output)
+    {
+      Double lower = (Double)t.get(0);
+      Double upper = (Double)t.get(1);
+      // Locale.ROOT keeps '.' as the decimal separator regardless of the JVM's
+      // default locale, matching the literals above.
+      assertEquals(String.format(java.util.Locale.ROOT, "%.5f,%.5f", lower, upper),
+                   expectationIterator.next());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/AbstractEntropyTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/AbstractEntropyTests.java b/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/AbstractEntropyTests.java
new file mode 100644
index 0000000..6e512d4
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/AbstractEntropyTests.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.pig.data.Tuple;
+
+import org.apache.pig.backend.executionengine.ExecException;
+
+import datafu.test.pig.PigTests;
+
+public abstract class AbstractEntropyTests extends PigTests
+{
+  protected void verifyEqualEntropyOutput(List<Double> expectedOutput, List<Tuple> output, int digits) throws ExecException {
+    assertEquals(expectedOutput.size(), output.size());
+    Iterator<Double> expectationIterator = expectedOutput.iterator();
+    String formatDigits = "%." + digits + "f";
+    for (Tuple t : output)
+    {
+      Double entropy = (Double)t.get(0);
+      assertEquals(String.format(formatDigits,entropy),String.format(formatDigits, expectationIterator.next()));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/ChaoShenEntropyTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/ChaoShenEntropyTests.java b/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/ChaoShenEntropyTests.java
new file mode 100644
index 0000000..c20eada
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/ChaoShenEntropyTests.java
@@ -0,0 +1,371 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+/*
+ * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
+ * used as our test benchmark
+ */
+public class ChaoShenEntropyTests extends AbstractEntropyTests
+{
+  /**
+
+  define Entropy datafu.pig.stats.entropy.Entropy('$type','$base');
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY val;
+                     GENERATE Entropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String entropy;
+
+  @Test  
+  public void uniqValChaoShenEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.83960",
+                     "32.05370",
+                     "96.10962",
+                     "28.72388",
+                     "96.65888",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791 
+     * 1        1        1        1        1        1        1        1        1        1 
+     * > count=c(1,1,1,1,1,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 4.816221
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(4.816221);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test  
+  public void singleValChaoShenEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791) 
+     * > table(v)
+     * v
+     * 98.94791 
+     * 10 
+     * > count=(10)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 0 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test  
+  public void dupValChaoShenEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy,"type=chaosh", "base=log");
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 2.57429 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(2.57429);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+
+  @Test  
+  public void emptyInputBagChaoShenEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
+    
+    writeLinesToFile("input"
+                     );
+
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c() 
+     * > table(v)
+     * < table of extent 0 > 
+     * > count=c()
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 0 
+     * Note: for this empty input the test leaves expectedOutput empty (no 0.0 entry is added).
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test  
+  public void singleElemInputBagChaoShenEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
+    
+    writeLinesToFile("input",
+                     "98.94791");
+
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > count=c(1)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 0
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  /**
+
+  define Entropy datafu.pig.stats.entropy.Entropy('$type','$base');
+
+  data = load 'input' as (x:chararray, y:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY x,y;
+                     GENERATE Entropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String pairEntropy;
+ 
+  @Test  
+  public void dupPairValChaoShenEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(pairEntropy, "type=chaosh", "base=log");
+    
+    writeLinesToFile("input",
+                     "hadoop	98.94791",
+                     "bigdata	38.61010",
+                     "hadoop	97.10575",
+                     "datafu	32.05370",
+                     "bigdata	38.61010",
+                     "datafu	32.05370",
+                     "datafu	32.05370",
+                     "hadoop	38.61010",
+                     "pig	96.10962",
+                     "pig	20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
+     * > t <- t[order(x,y)]
+     * > count<-c(2,3,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 2.57429 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(2.57429);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test  
+  public void dupValChaoShenEntropoyLog2Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy,"type=chaosh", "base=log2");
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > freqs=count/sum(count)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log2"))
+     * [1] 3.713915 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(3.713915);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test  
+  public void dupValChaoShenEntropoyLog10Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log10");
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log10"))
+     * [1] 1.118 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(1.118);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/CondEntropyTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/CondEntropyTests.java b/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/CondEntropyTests.java
new file mode 100644
index 0000000..0587ed1
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/CondEntropyTests.java
@@ -0,0 +1,407 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+
+/*
+ * Use R's condentropy function to compute conditional entropy as the test benchmark
+ * http://cran.r-project.org/web/packages/infotheo/infotheo.pdf
+ */
+public class CondEntropyTests extends AbstractEntropyTests
+{
+  /**
+
+  define CondEntropy datafu.pig.stats.entropy.CondEntropy();
+  
+  data = load 'input' as (valX:double, valY:chararray);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE CondEntropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String condEntropy;
+  
+  @Test
+  public void uniqValEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	view",
+                     "97.10575	view",
+                     "62.28313	click",
+                     "38.83960	click",
+                     "32.05370	view",
+                     "96.10962	view",
+                     "28.72388	click",
+                     "96.65888	view",
+                     "20.41135	click");
+        
+    test.runScript();
+   
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.83960","32.05370","96.10962","28.72388","96.65888","20.41135")
+     * Y=c("click","view","view","click","click","view","view","click","view","click")
+     * condentropy(Y,X)
+     * [1] 0
+     */ 
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void singleValEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy);
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click");
+        
+    test.runScript();
+
+    /*
+     * library(infotheo)
+     * X=c("98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791")
+     * Y=c("click","click","click","click","click","click","click","click","click","click")
+     * condentropy(Y,X)
+     * [1] 0
+     */ 
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void dupValEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	click",
+                     "97.10575	view",
+                     "62.28313	view",
+                     "38.61010	view",
+                     "32.05370	view",
+                     "96.10962	click",
+                     "38.61010	click",
+                     "96.10962	view",
+                     "20.41135	click");
+        
+    test.runScript();
+ 
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+     * Y=c("click","click","view","view","view","view","click","click","view","click")
+     * condentropy(Y,X)
+     * [1] 0.3295837 
+     */    
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.3295837);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void emptyInputBagEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy);
+    
+    writeLinesToFile("input"
+                     );
+
+    test.runScript();
+    
+    List<Double> expectedOutput = new ArrayList<Double>();
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void singleElemInputBagEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy);
+    
+    writeLinesToFile("input",
+                     "98.94791	view");
+
+    test.runScript();
+     /*
+     * library(infotheo)
+     * X = c("98.94791")
+     * Y = c("view")
+     * condentropy(Y,X)
+     * [1] 0
+     */      
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  /**
+
+  define Entropy datafu.pig.stats.entropy.CondEntropy();
+  
+  data = load 'input' as (valX1:chararray, valX2:chararray, valY:chararray);
+  data = foreach data generate (valX1, valX2) as X, valY as Y;
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE Entropy(data_ordered);
+             };
+
+  store data_out into 'output';
+   */
+  @Multiline private String pairCondEntropy;
+ 
+  @Test
+  public void dupPairValEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(pairCondEntropy);
+    
+    writeLinesToFile("input",
+                     "hadoop	bigdata	click",
+                     "hadoop	pig	view",
+                     "hadoop	datafu	click",
+                     "datafu	pig	click",
+                     "bigdata	pig	view",
+                     "datafu	pig	click",
+                     "datafu	pig	view",
+                     "hadoop	bigdata	view",
+                     "pig	datafu	view",
+                     "pig	datafu	view");
+        
+    test.runScript();
+
+    /*
+     * library(infotheo)
+     * X=c("hadoop bigdata","hadoop pig","hadoop datafu","datafu pig","bigdata pig","datafu pig","datafu pig","hadoop bigdata","pig datafu","pig datafu")
+     * Y=c("click","view","click","click","view","click","view","view","view","view")
+     * condentropy(Y,X)
+     * [1] 0.3295837
+     */   
+ 
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.3295837);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+
+  define CondEntropy datafu.pig.stats.entropy.CondEntropy('$type','$base');
+  
+  data = load 'input' as (valX:double, valY:chararray);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE CondEntropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String condLogEntropy;
+ 
+  @Test
+  public void dupValEmpiricalCondEntropoyLog2Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log2");
+ 
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	click",
+                     "97.10575	view",
+                     "62.28313	view",
+                     "38.61010	view",
+                     "32.05370	view",
+                     "96.10962	click",
+                     "38.61010	click",
+                     "96.10962	view",
+                     "20.41135	click");
+ 
+    test.runScript();
+ 
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+     * Y=c("click","click","view","view","view","view","click","click","view","click")
+     * condentropy(Y,X)/log(2)
+     * [1] 0.4754888 
+     */       
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.4754888);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void dupValEmpiricalCondEntropoyLog10Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log10");
+ 
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	click",
+                     "97.10575	view",
+                     "62.28313	view",
+                     "38.61010	view",
+                     "32.05370	view",
+                     "96.10962	click",
+                     "38.61010	click",
+                     "96.10962	view",
+                     "20.41135	click");
+    
+    test.runScript();
+ 
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+     * Y=c("click","click","view","view","view","view","click","click","view","click")
+     * condentropy(Y,X)/log(10)
+     * [1] 0.1431364 
+     */      
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.1431364);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+
+  define CondEntropy datafu.pig.stats.entropy.CondEntropy();
+  
+  data = load 'input' as (valX:double, valY:chararray);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     GENERATE CondEntropy(data);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String noOrderCondEntropy;
+  
+  @Test
+  public void noOrderEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(noOrderCondEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	view",
+                     "97.10575	view",
+                     "62.28313	click",
+                     "38.83960	click",
+                     "32.05370	view",
+                     "96.10962	view",
+                     "28.72388	click",
+                     "96.65888	view",
+                     "20.41135	click");
+
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");
+    } catch(Exception ex) {}
+  }
+
+  /**
+
+  define CondEntropy datafu.pig.stats.entropy.CondEntropy();
+  
+  data = load 'input' as (valX:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE CondEntropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String invalidInputCondEntropy;
+ 
+  @Test
+  public void invalidInputEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(invalidInputCondEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.83960",
+                     "32.05370",
+                     "96.10962",
+                     "28.72388",
+                     "96.65888",
+                     "20.41135");
+
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");
+    } catch(Exception ex) {
+         assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 2") >= 0);
+    }
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java b/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java
new file mode 100644
index 0000000..594b91b
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java
@@ -0,0 +1,578 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+/*
+ * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
+ * used as our test benchmark 
+ */
+public class EmpiricalCountEntropyTests extends AbstractEntropyTests
+{
+  /**
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data BY val;
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String entropy;
+  
+  @Test
+  public void uniqValEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.83960",
+                     "32.05370",
+                     "96.10962",
+                     "28.72388",
+                     "96.65888",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791 
+     * 1        1        1        1        1        1        1        1        1        1 
+     * > count=c(1,1,1,1,1,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 2.302585
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(2.302585);
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  @Test
+  public void singleValEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791) 
+     * > table(v)
+     * v
+     * 98.94791 
+     * 10 
+     * > count=(10)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  @Test
+  public void dupValEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.834372
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(1.834372);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  @Test
+  public void emptyInputBagEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input"
+                     );
+
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c() 
+     * > table(v)
+     * < table of extent 0 > 
+     * > count=c()
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0 
+     * Note: for this empty input the test leaves expectedOutput empty (no 0.0 entry is added).
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  @Test
+  public void singleElemInputBagEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791");
+
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > count=c(1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy('$base');
+
+  data = load 'input' as (x:chararray, y:double);
+  --describe data;
+  data_grouped = GROUP data BY (x, y);
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data);
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String pairLogEntropy;
+ 
+  @Test
+  public void dupPairValEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(pairLogEntropy, "base=log");
+    
+    writeLinesToFile("input",
+                     "hadoop	98.94791",
+                     "bigdata	38.61010",
+                     "hadoop	97.10575",
+                     "datafu	32.05370",
+                     "bigdata	38.61010",
+                     "datafu	32.05370",
+                     "datafu	32.05370",
+                     "hadoop	38.61010",
+                     "pig	96.10962",
+                     "pig	20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
+     * > t <- t[order(x,y)]
+     * > count<-c(2,3,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.834372 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(1.834372);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy('$base');
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data BY val;
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String logEntropy;
+ 
+  @Test
+  public void dupValEntropyLog2Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(logEntropy, "base=log2");
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count, count/sum(count), c("ML"),c("log2"))
+     * [1] 2.646439
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(2.646439);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  @Test
+  public void dupValEntropyLog10Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(logEntropy, "base=log10");
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count, count/sum(count), c("ML"),c("log10"))
+     * [1] 0.7966576
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.7966576);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+
+  /**
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+  
+  data_cnt = load 'input' as (val:int);
+  --describe data_cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String rawValidInputEntropy;
+ 
+  /**
+   * Feeds integer counts (zeros included) through the rawValidInputEntropy
+   * script and compares the reported entropy with the value computed in R.
+   */
+  @Test
+  public void rawValidInputEntropyTest() throws Exception
+  {
+    final PigTest pigTest = createPigTestFromString(rawValidInputEntropy);
+
+    writeLinesToFile("input",
+                     "0", "38", "0", "62", "38",
+                     "32", "96", "38", "96", "0");
+
+    pigTest.runScript();
+
+    /* Reference value, computed using R:
+     *
+     * > count=c(0, 38, 0, 62, 38, 32, 96, 38, 96, 0)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.846901
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(1.846901);
+
+    final List<Tuple> actual = this.getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+  
+  data_cnt = load 'input' as (val:double);
+  --describe data_cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String rawInvalidTypeInputEntropy;
+ 
+  @Test
+  public void rawInvalidTypeInputEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(rawInvalidTypeInputEntropy); 
+    
+    writeLinesToFile("input",
+                     "0.0",
+                     "38.0",
+                     "0.0",
+                     "62.0",
+                     "38.0",
+                     "32.001",
+                     "96.002",
+                     "38.01",
+                     "96.00001",
+                     "0.0");
+     try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");    
+     } catch (Exception ex) {
+         assertTrue(ex.getMessage().indexOf("Expect the type of the input tuple to be of ([int, long]), but instead found double") >= 0);
+     }
+  }
+
+  /**
+   * Runs the rawValidInputEntropy script on counts that include a negative
+   * value (-38). The expected entropy is the one R yields when every
+   * non-positive count is replaced by 0.
+   *
+   * @throws Exception if the script cannot be run or the output does not match
+   */
+  @Test
+  public void rawInValidInputValueEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(rawValidInputEntropy);
+
+    writeLinesToFile("input",
+                     "0",
+                     "-38",
+                     "0",
+                     "62",
+                     "38",
+                     "32",
+                     "96",
+                     "38",
+                     "96",
+                     "0");
+
+    // Run the script explicitly before reading the alias, as every other
+    // test in this class does.
+    test.runScript();
+
+    /* Add expected values, computed using R:
+     *
+     * e.g.
+     *
+     * > count=c(0, -38, 0, 62, 38, 32, 96, 38, 96, 0)
+     * > library(entropy)
+     * > entropy(ifelse(count>0,count,0))
+     * [1] 1.693862
+     *
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(1.693862);
+
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  /**
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+  
+  data_cnt = load 'input' as (f1:chararray, f2:chararray);
+  --describe data_cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String invalidInputSchemaEntropy;
+ 
+  @Test
+  public void invalidInputSchemaEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(invalidInputSchemaEntropy); 
+    
+    writeLinesToFile("input",
+                     "hadoop	98.94791",
+                     "bigdata	38.61010",
+                     "hadoop	97.10575",
+                     "datafu	32.05370",
+                     "bigdata	38.61010",
+                     "datafu	32.05370",
+                     "datafu	32.05370");
+        
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");    
+    } catch (Exception ex) {
+         assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 1") >= 0);
+    }
+  }
+
+  /**
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data BY val;
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped  {
+                          data_cnt_ordered = order data_cnt by *;
+                          GENERATE Entropy(data_cnt_ordered);
+                          }
+  store data_out into 'output';
+   */
+  @Multiline private String accumulatedEntropy;
+
+  /**
+   * Runs the accumulatedEntropy script, which counts the occurrences of each
+   * distinct value and hands the ordered counts to the Entropy UDF, then
+   * compares the result with the expected entropy.
+   */
+  @Test
+  public void accumulatedEntropyTest() throws Exception
+  {
+    // Same expected value as in the dupValEntropyTest case.
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(1.834372);
+
+    final PigTest pigTest = createPigTestFromString(accumulatedEntropy);
+
+    writeLinesToFile("input",
+                     "98.94791", "38.61010", "97.10575", "62.28313", "38.61010",
+                     "32.05370", "96.10962", "38.61010", "96.10962", "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = this.getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/EntropyTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/EntropyTests.java b/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/EntropyTests.java
new file mode 100644
index 0000000..59fc3c2
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/stats/entropy/EntropyTests.java
@@ -0,0 +1,430 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+/*
+ * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
+ * used as our test benchmark 
+ */
+public class EntropyTests extends AbstractEntropyTests
+{
+  /**
+  
+
+  define Entropy datafu.pig.stats.entropy.Entropy();
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY val;
+                     GENERATE Entropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String entropy;
+  
+  /**
+   * Runs the entropy script on ten distinct values and compares the result
+   * with the value computed in R.
+   */
+  @Test
+  public void uniqValEmpiricalEntropoyTest() throws Exception
+  {
+    /* Reference value, computed using R:
+     *
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135)
+     * > table(v)
+     * v
+     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791
+     * 1        1        1        1        1        1        1        1        1        1
+     * > count=c(1,1,1,1,1,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 2.302585
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(2.302585);
+
+    final PigTest pigTest = createPigTestFromString(entropy);
+
+    writeLinesToFile("input",
+                     "98.94791", "38.61010", "97.10575", "62.28313", "38.83960",
+                     "32.05370", "96.10962", "28.72388", "96.65888", "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = this.getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+   * Runs the entropy script on ten copies of the same value and expects an
+   * entropy of 0.0, matching the value computed in R.
+   */
+  @Test
+  public void singleValEmpiricalEntropoyTest() throws Exception
+  {
+    /* Reference value, computed using R:
+     *
+     * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791)
+     * > table(v)
+     * v
+     * 98.94791
+     * 10
+     * > count=(10)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(0.0);
+
+    final PigTest pigTest = createPigTestFromString(entropy);
+
+    writeLinesToFile("input",
+                     "98.94791", "98.94791", "98.94791", "98.94791", "98.94791",
+                     "98.94791", "98.94791", "98.94791", "98.94791", "98.94791");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = this.getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+   * Runs the entropy script on an input that contains repeated values and
+   * compares the result with the value computed in R.
+   */
+  @Test
+  public void dupValEmpiricalEntropoyTest() throws Exception
+  {
+    /* Reference value, computed using R:
+     *
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791
+     * 1        1        3        1        2        1        1
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.834372
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(1.834372);
+
+    final PigTest pigTest = createPigTestFromString(entropy);
+
+    writeLinesToFile("input",
+                     "98.94791", "38.61010", "97.10575", "62.28313", "38.61010",
+                     "32.05370", "96.10962", "38.61010", "96.10962", "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = this.getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+   * Runs the entropy script on an input file with no lines and verifies the
+   * output against an empty list of expected values.
+   */
+  @Test
+  public void emptyInputBagEmpiricalEntropoyTest() throws Exception
+  {
+    final PigTest pigTest = createPigTestFromString(entropy);
+
+    // No lines are passed, so the input file is written without content.
+    writeLinesToFile("input");
+
+    pigTest.runScript();
+
+    /* For reference, R on an empty count vector:
+     *
+     * > v=c()
+     * > table(v)
+     * < table of extent 0 >
+     * > count=c()
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     *
+     * R prints 0 here, but the expected list below is intentionally left
+     * empty: the comparison is made against zero expected values.
+     */
+    final List<Double> expected = new ArrayList<Double>();
+
+    final List<Tuple> actual = this.getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+   * Runs the entropy script on an input holding a single value and expects
+   * an entropy of 0.0, matching the value computed in R.
+   */
+  @Test
+  public void singleElemInputBagEmpiricalEntropoyTest() throws Exception
+  {
+    /* Reference value, computed using R:
+     *
+     * > count=c(1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(0.0);
+
+    final PigTest pigTest = createPigTestFromString(entropy);
+
+    writeLinesToFile("input", "98.94791");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = this.getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+  
+
+  define Entropy datafu.pig.stats.entropy.Entropy('$type', '$base');
+  
+  data = load 'input' as (x:chararray, y:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY x,y;
+                     GENERATE Entropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String pairEntropy;
+ 
+  /**
+   * Runs the pairEntropy script (type=empirical, base=log) on two-field rows
+   * that contain repeated (x, y) pairs and compares the result with the value
+   * computed in R.
+   */
+  @Test
+  public void dupPairValEmpiricalEntropoyTest() throws Exception
+  {
+    /* Reference value, computed using R:
+     *
+     * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
+     * > t <- t[order(x,y)]
+     * > count<-c(2,3,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.834372
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(1.834372);
+
+    final PigTest pigTest = createPigTestFromString(pairEntropy, "type=empirical", "base=log");
+
+    // The two fields of every row are separated by a literal tab character.
+    writeLinesToFile("input",
+                     "hadoop	98.94791",
+                     "bigdata	38.61010",
+                     "hadoop	97.10575",
+                     "datafu	32.05370",
+                     "bigdata	38.61010",
+                     "datafu	32.05370",
+                     "datafu	32.05370",
+                     "hadoop	38.61010",
+                     "pig	96.10962",
+                     "pig	20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = this.getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+  
+
+  define Entropy datafu.pig.stats.entropy.Entropy('$type', '$base');
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY val;
+                     GENERATE Entropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String logEntropy;
+ 
+  /**
+   * Runs the logEntropy script (type=empirical, base=log2) on an input that
+   * contains repeated values and compares the result with the value computed
+   * in R.
+   */
+  @Test
+  public void dupValEmpiricalEntropoyLog2Test() throws Exception
+  {
+    /* Reference value, computed using R:
+     *
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791
+     * 1        1        3        1        2        1        1
+     * > count=c(1,1,3,1,2,1,1)
+     * > freqs=count/sum(count)
+     * > library(entropy)
+     * > entropy(count, freqs, c("ML"), c("log2"))
+     * [1] 2.646439
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(2.646439);
+
+    final PigTest pigTest = createPigTestFromString(logEntropy, "type=empirical", "base=log2");
+
+    writeLinesToFile("input",
+                     "98.94791", "38.61010", "97.10575", "62.28313", "38.61010",
+                     "32.05370", "96.10962", "38.61010", "96.10962", "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = this.getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+   * Runs the logEntropy script (type=empirical, base=log10) on an input that
+   * contains repeated values and compares the result with the value computed
+   * in R.
+   */
+  @Test
+  public void dupValEmpiricalEntropoyLog10Test() throws Exception
+  {
+    /* Reference value, computed using R:
+     *
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791
+     * 1        1        3        1        2        1        1
+     * > count=c(1,1,3,1,2,1,1)
+     * > freqs=count/sum(count)
+     * > library(entropy)
+     * > entropy(count, freqs, c("ML"), c("log10"))
+     * [1] 0.7966576
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(0.7966576);
+
+    final PigTest pigTest = createPigTestFromString(logEntropy, "type=empirical", "base=log10");
+
+    writeLinesToFile("input",
+                     "98.94791", "38.61010", "97.10575", "62.28313", "38.61010",
+                     "32.05370", "96.10962", "38.61010", "96.10962", "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = this.getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+  
+
+  define Entropy datafu.pig.stats.entropy.Entropy();
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     GENERATE Entropy(data);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String noOrderEntropy;
+ 
+  @Test
+  public void noOrderEmpiricalEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(noOrderEntropy);
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "38.61010",
+                     "37.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");
+    } catch(Exception ex) {
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/datafu-pig/src/test/java/datafu/test/pig/text/NLPTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/text/NLPTests.java b/datafu-pig/src/test/java/datafu/test/pig/text/NLPTests.java
new file mode 100644
index 0000000..0504b52
--- /dev/null
+++ b/datafu-pig/src/test/java/datafu/test/pig/text/NLPTests.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.text;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+
+public class NLPTests extends PigTests
+{
+    /**
+
+     define SentenceDetect datafu.pig.text.opennlp.SentenceDetect('$DATA_DIR/en-sent.bin');
+
+     data = LOAD 'input' AS (text: chararray);
+
+     dump data;
+
+     data2 = FOREACH data GENERATE SentenceDetect(text) AS sentences;
+
+     dump data2;
+
+     STORE data2 INTO 'output';
+     */
+    @Multiline
+    private String sentenceDetectTest;
+
+    /**
+     * Runs the sentenceDetectTest script on two lines of text and asserts that
+     * alias data2 holds, for each line, a bag with its two sentences.
+     */
+    @Test
+    public void sentenceDetectTest() throws Exception
+    {
+        final PigTest pigTest = createPigTestFromString(sentenceDetectTest);
+
+        final String firstLine = "This is a sentence. This is another sentence.";
+        final String secondLine = "Yet another sentence. One more just for luck.";
+        writeLinesToFile("input", firstLine, secondLine);
+
+        assertOutput(pigTest, "data2",
+                "({(This is a sentence.),(This is another sentence.)})",
+                "({(Yet another sentence.),(One more just for luck.)})");
+    }
+
+    /**
+
+     define TokenizeME datafu.pig.text.opennlp.TokenizeME('$DATA_DIR/en-token.bin');
+
+     data = LOAD 'input' AS (text: chararray);
+
+     dump data;
+
+     data2 = FOREACH data GENERATE TokenizeME(text) AS tokens;
+
+     dump data2;
+
+     STORE data2 INTO 'output';
+     */
+    @Multiline
+    private String tokenizeMETest;
+
+    /**
+     * Runs the tokenizeMETest script on two lines of text and asserts that
+     * alias data2 holds, for each line, a bag of tokens in which every
+     * sentence-ending period is a token of its own.
+     */
+    @Test
+    public void tokenizeMETest() throws Exception
+    {
+        final PigTest pigTest = createPigTestFromString(tokenizeMETest);
+
+        final String firstLine = "This is a sentence. This is another sentence.";
+        final String secondLine = "Yet another sentence. One more just for luck.";
+        writeLinesToFile("input", firstLine, secondLine);
+
+        assertOutput(pigTest, "data2",
+                "({(This),(is),(a),(sentence),(.),(This),(is),(another),(sentence),(.)})",
+                "({(Yet),(another),(sentence),(.),(One),(more),(just),(for),(luck),(.)})");
+    }
+
+    /**
+
+     define TokenizeSimple datafu.pig.text.opennlp.TokenizeSimple();
+
+     data = LOAD 'input' AS (text: chararray);
+
+     dump data;
+
+     data2 = FOREACH data GENERATE TokenizeSimple(text) AS tokens;
+
+     dump data2;
+
+     STORE data2 INTO 'output';
+     */
+    @Multiline
+    private String tokenizeSimpleTest;
+
+    /**
+     * Runs the tokenizeSimpleTest script on two lines of text and asserts that
+     * alias data2 holds, for each line, a bag of tokens in which every
+     * sentence-ending period is a token of its own.
+     */
+    @Test
+    public void tokenizeSimpleTest() throws Exception
+    {
+        final PigTest pigTest = createPigTestFromString(tokenizeSimpleTest);
+
+        final String firstLine = "This is a sentence. This is another sentence.";
+        final String secondLine = "Yet another sentence. One more just for luck.";
+        writeLinesToFile("input", firstLine, secondLine);
+
+        assertOutput(pigTest, "data2",
+                "({(This),(is),(a),(sentence),(.),(This),(is),(another),(sentence),(.)})",
+                "({(Yet),(another),(sentence),(.),(One),(more),(just),(for),(luck),(.)})");
+    }
+
+    /**
+
+     define TokenizeWhitespace datafu.pig.text.opennlp.TokenizeWhitespace();
+
+     data = LOAD 'input' AS (text: chararray);
+
+     dump data;
+
+     data2 = FOREACH data GENERATE TokenizeWhitespace(text) AS tokens;
+
+     dump data2;
+
+     STORE data2 INTO 'output';
+     */
+    @Multiline
+    private String tokenizeWhitespaceTest;
+
+    /**
+     * Runs the tokenizeWhitespaceTest script on two lines of text and asserts
+     * that alias data2 holds, for each line, a bag of tokens in which the
+     * period stays attached to the preceding word.
+     */
+    @Test
+    public void tokenizeWhitespaceTest() throws Exception
+    {
+        final PigTest pigTest = createPigTestFromString(tokenizeWhitespaceTest);
+
+        final String firstLine = "This is a sentence. This is another sentence.";
+        final String secondLine = "Yet another sentence. One more just for luck.";
+        writeLinesToFile("input", firstLine, secondLine);
+
+        assertOutput(pigTest, "data2",
+                "({(This),(is),(a),(sentence.),(This),(is),(another),(sentence.)})",
+                "({(Yet),(another),(sentence.),(One),(more),(just),(for),(luck.)})");
+    }
+
+    /**
+
+     define TokenizeME datafu.pig.text.opennlp.TokenizeME('$DATA_DIR/en-token.bin');
+     define POSTag datafu.pig.text.opennlp.POSTag('$DATA_DIR/en-pos-maxent.bin');
+
+     data = LOAD 'input' AS (text: chararray);
+
+     dump data;
+
+     data2 = FOREACH data GENERATE TokenizeME(text) AS tokens;
+
+     dump data2;
+
+     data3 = FOREACH data2 GENERATE POSTag(tokens) as tagged;
+
+     dump data3
+
+     STORE data3 INTO 'output';
+     */
+    @Multiline
+    private String POSTagTest;
+
+    /**
+     * Runs the POSTagTest script on two lines of text and asserts that alias
+     * data3 holds, for each line, a bag of (token, tag, number) tuples.
+     */
+    @Test
+    public void POSTagTest() throws Exception
+    {
+        final PigTest pigTest = createPigTestFromString(POSTagTest);
+
+        final String firstLine = "This is a sentence. This is another sentence.";
+        final String secondLine = "Yet another sentence. One more just for luck.";
+        writeLinesToFile("input", firstLine, secondLine);
+
+        // NOTE(review): the third element of each tuple is presumably the tagger's
+        // confidence for the tag - confirm against the POSTag UDF.
+        assertOutput(pigTest, "data3",
+                "({(This,DT,0.9649410482478001),(is,VBZ,0.9982592902509803),(a,DT,0.9967282012835504),(sentence,NN,0.9772619256460584),(.,.,0.4391067883074289),(This,DT,0.8346710130761914),(is,VBZ,0.9928885242823617),(another,DT,0.9761159923140399),(sentence,NN,0.9964463493238542),(.,.,0.9856037689871404)})",
+                "({(Yet,RB,0.7638997090011364),(another,DT,0.9657669183153523),(sentence,NN,0.989193114719676),(.,.,0.20091718589945456),(One,CD,0.9229251494813668),(more,JJR,0.9360382000551335),(just,RB,0.8646324491545225),(for,IN,0.9851765355889605),(luck,NN,0.9883408827371651),(.,.,0.9746378518791978)})");
+    }
+}


Mime
View raw message