datafu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mha...@apache.org
Subject [1/2] DATAFU-26 update entropy naming conventions
Date Mon, 03 Mar 2014 18:32:26 GMT
Repository: incubator-datafu
Updated Branches:
  refs/heads/master b2134e660 -> 4aa2ef2a4


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/CondEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/CondEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/CondEntropyTests.java
new file mode 100644
index 0000000..c6d62df
--- /dev/null
+++ b/test/pig/datafu/test/pig/stats/entropy/CondEntropyTests.java
@@ -0,0 +1,412 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+
+/*
+ * Use the condentropy function of R's infotheo package to compute conditional entropy as the test benchmark
+ * http://cran.r-project.org/web/packages/infotheo/infotheo.pdf
+ */
+public class CondEntropyTests extends AbstractEntropyTests
+{
+  /**
+  register $JAR_PATH
+
+  define CondEntropy datafu.pig.stats.entropy.CondEntropy();
+  
+  data = load 'input' as (valX:double, valY:chararray);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE CondEntropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String condEntropy;
+  
+  @Test
+  public void uniqValEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	view",
+                     "97.10575	view",
+                     "62.28313	click",
+                     "38.83960	click",
+                     "32.05370	view",
+                     "96.10962	view",
+                     "28.72388	click",
+                     "96.65888	view",
+                     "20.41135	click");
+        
+    test.runScript();
+   
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.83960","32.05370","96.10962","28.72388","96.65888","20.41135")
+     * Y=c("click","view","view","click","click","view","view","click","view","click")
+     * condentropy(Y,X)
+     * [1] 0
+     */ 
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void singleValEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy);
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click");
+        
+    test.runScript();
+
+    /*
+     * library(infotheo)
+     * X=c("98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791")
+     * Y=c("click","click","click","click","click","click","click","click","click","click")
+     * condentropy(Y,X)
+     * [1] 0
+     */ 
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void dupValEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	click",
+                     "97.10575	view",
+                     "62.28313	view",
+                     "38.61010	view",
+                     "32.05370	view",
+                     "96.10962	click",
+                     "38.61010	click",
+                     "96.10962	view",
+                     "20.41135	click");
+        
+    test.runScript();
+ 
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+     * Y=c("click","click","view","view","view","view","click","click","view","click")
+     * condentropy(Y,X)
+     * [1] 0.3295837 
+     */    
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.3295837);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void emptyInputBagEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy);
+    
+    writeLinesToFile("input"
+                     );
+
+    test.runScript();
+    
+    List<Double> expectedOutput = new ArrayList<Double>();
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void singleElemInputBagEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy);
+    
+    writeLinesToFile("input",
+                     "98.94791	view");
+
+    test.runScript();
+     /*
+     * library(infotheo)
+     * X = c("98.94791")
+     * Y = c("view")
+     * condentropy(Y,X)
+     * [1] 0
+     */      
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.CondEntropy();
+  
+  data = load 'input' as (valX1:chararray, valX2:chararray, valY:chararray);
+  data = foreach data generate (valX1, valX2) as X, valY as Y;
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE Entropy(data_ordered);
+             };
+
+  store data_out into 'output';
+   */
+  @Multiline private String pairCondEntropy;
+ 
+  @Test
+  public void dupPairValEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(pairCondEntropy);
+    
+    writeLinesToFile("input",
+                     "hadoop	bigdata	click",
+                     "hadoop	pig	view",
+                     "hadoop	datafu	click",
+                     "datafu	pig	click",
+                     "bigdata	pig	view",
+                     "datafu	pig	click",
+                     "datafu	pig	view",
+                     "hadoop	bigdata	view",
+                     "pig	datafu	view",
+                     "pig	datafu	view");
+        
+    test.runScript();
+
+    /*
+     * library(infotheo)
+     * X=c("hadoop bigdata","hadoop pig","hadoop datafu","datafu pig","bigdata pig","datafu pig","datafu pig","hadoop bigdata","pig datafu","pig datafu")
+     * Y=c("click","view","click","click","view","click","view","view","view","view")
+     * condentropy(Y,X)
+     * [1] 0.3295837
+     */   
+ 
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.3295837);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+  register $JAR_PATH
+
+  define CondEntropy datafu.pig.stats.entropy.CondEntropy('$type','$base');
+  
+  data = load 'input' as (valX:double, valY:chararray);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE CondEntropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String condLogEntropy;
+ 
+  @Test
+  public void dupValEmpiricalCondEntropoyLog2Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log2");
+ 
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	click",
+                     "97.10575	view",
+                     "62.28313	view",
+                     "38.61010	view",
+                     "32.05370	view",
+                     "96.10962	click",
+                     "38.61010	click",
+                     "96.10962	view",
+                     "20.41135	click");
+ 
+    test.runScript();
+ 
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+     * Y=c("click","click","view","view","view","view","click","click","view","click")
+     * condentropy(Y,X)/log(2)
+     * [1] 0.4754888 
+     */       
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.4754888);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void dupValEmpiricalCondEntropoyLog10Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log10");
+ 
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	click",
+                     "97.10575	view",
+                     "62.28313	view",
+                     "38.61010	view",
+                     "32.05370	view",
+                     "96.10962	click",
+                     "38.61010	click",
+                     "96.10962	view",
+                     "20.41135	click");
+    
+    test.runScript();
+ 
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+     * Y=c("click","click","view","view","view","view","click","click","view","click")
+     * condentropy(Y,X)/log(10)
+     * [1] 0.1431364 
+     */      
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.1431364);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+  register $JAR_PATH
+
+  define CondEntropy datafu.pig.stats.entropy.CondEntropy();
+  
+  data = load 'input' as (valX:double, valY:chararray);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     GENERATE CondEntropy(data);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String noOrderCondEntropy;
+  
+  @Test
+  public void noOrderEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(noOrderCondEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	view",
+                     "97.10575	view",
+                     "62.28313	click",
+                     "38.83960	click",
+                     "32.05370	view",
+                     "96.10962	view",
+                     "28.72388	click",
+                     "96.65888	view",
+                     "20.41135	click");
+
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");
+    } catch(Exception ex) {}
+  }
+
+  /**
+  register $JAR_PATH
+
+  define CondEntropy datafu.pig.stats.entropy.CondEntropy();
+  
+  data = load 'input' as (valX:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE CondEntropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String invalidInputCondEntropy;
+ 
+  @Test
+  public void invalidInputEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(invalidInputCondEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.83960",
+                     "32.05370",
+                     "96.10962",
+                     "28.72388",
+                     "96.65888",
+                     "20.41135");
+
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");
+    } catch(Exception ex) {
+         assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 2") >= 0);
+    }
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java
new file mode 100644
index 0000000..52fdcce
--- /dev/null
+++ b/test/pig/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java
@@ -0,0 +1,585 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+/*
+ * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
+ * used as our test benchmark 
+ */
+public class EmpiricalCountEntropyTests extends AbstractEntropyTests
+{
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data BY val;
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String entropy;
+  
+  @Test
+  public void uniqValEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.83960",
+                     "32.05370",
+                     "96.10962",
+                     "28.72388",
+                     "96.65888",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791 
+     * 1        1        1        1        1        1        1        1        1        1 
+     * > count=c(1,1,1,1,1,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 2.302585
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(2.302585);
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  @Test
+  public void singleValEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791) 
+     * > table(v)
+     * v
+     * 98.94791 
+     * 10 
+     * > count=(10)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  @Test
+  public void dupValEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.834372
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(1.834372);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  @Test
+  public void emptyInputBagEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input"
+                     );
+
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c() 
+     * > table(v)
+     * < table of extent 0 > 
+     * > count=c()
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0 
+     * Note: unlike R, this test expects no output tuple at all for an empty input.
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  @Test
+  public void singleElemInputBagEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791");
+
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > count=c(1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy('$base');
+
+  data = load 'input' as (x:chararray, y:double);
+  --describe data;
+  data_grouped = GROUP data BY (x, y);
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data);
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String pairLogEntropy;
+ 
+  @Test
+  public void dupPairValEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(pairLogEntropy, "base=log");
+    
+    writeLinesToFile("input",
+                     "hadoop	98.94791",
+                     "bigdata	38.61010",
+                     "hadoop	97.10575",
+                     "datafu	32.05370",
+                     "bigdata	38.61010",
+                     "datafu	32.05370",
+                     "datafu	32.05370",
+                     "hadoop	38.61010",
+                     "pig	96.10962",
+                     "pig	20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
+     * > t <- t[order(x,y)]
+     * > count<-c(2,3,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.834372 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(1.834372);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy('$base');
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data BY val;
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String logEntropy;
+ 
+  @Test
+  public void dupValEntropyLog2Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(logEntropy, "base=log2");
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count, count/sum(count), c("ML"),c("log2"))
+     * [1] 2.646439
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(2.646439);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  @Test
+  public void dupValEntropyLog10Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(logEntropy, "base=log10");
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count, count/sum(count), c("ML"),c("log10"))
+     * [1] 0.7966576
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.7966576);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+  
+  data_cnt = load 'input' as (val:int);
+  --describe data_cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String rawValidInputEntropy;
+ 
+  @Test
+  public void rawValidInputEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(rawValidInputEntropy); 
+    
+    writeLinesToFile("input",
+                     "0",
+                     "38",
+                     "0",
+                     "62",
+                     "38",
+                     "32",
+                     "96",
+                     "38",
+                     "96",
+                     "0");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > count=c(0, 38, 0, 62, 38, 32, 96, 38, 96, 0)
+     * > library(entropy)
+     * > entropy(count) 
+     * [1] 1.846901 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(1.846901);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+  
+  data_cnt = load 'input' as (val:double);
+  --describe data_cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String rawInvalidTypeInputEntropy;
+ 
+  @Test
+  public void rawInvalidTypeInputEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(rawInvalidTypeInputEntropy); 
+    
+    writeLinesToFile("input",
+                     "0.0",
+                     "38.0",
+                     "0.0",
+                     "62.0",
+                     "38.0",
+                     "32.001",
+                     "96.002",
+                     "38.01",
+                     "96.00001",
+                     "0.0");
+     try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");    
+     } catch (Exception ex) {
+         assertTrue(ex.getMessage().indexOf("Expect the type of the input tuple to be of ([int, long]), but instead found double") >= 0);
+     }
+  }
+
+  @Test
+  public void rawInValidInputValueEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(rawValidInputEntropy); 
+    
+    writeLinesToFile("input",
+                     "0",
+                     "-38",
+                     "0",
+                     "62",
+                     "38",
+                     "32",
+                     "96",
+                     "38",
+                     "96",
+                     "0");
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > count=c(0, -38, 0, 62, 38, 32, 96, 38, 96, 0)
+     * > library(entropy)
+     * > entropy(ifelse(count>0,count,0))
+     * [1] 1.693862 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(1.693862);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+  
+  data_cnt = load 'input' as (f1:chararray, f2:chararray);
+  --describe data_cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String invalidInputSchemaEntropy;
+ 
+  @Test
+  public void invalidInputSchemaEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(invalidInputSchemaEntropy); 
+    
+    writeLinesToFile("input",
+                     "hadoop	98.94791",
+                     "bigdata	38.61010",
+                     "hadoop	97.10575",
+                     "datafu	32.05370",
+                     "bigdata	38.61010",
+                     "datafu	32.05370",
+                     "datafu	32.05370");
+        
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");    
+    } catch (Exception ex) {
+         assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 1") >= 0);
+    }
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data BY val;
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped  {
+                          data_cnt_ordered = order data_cnt by *;
+                          GENERATE Entropy(data_cnt_ordered);
+                          }
+  store data_out into 'output';
+   */
+  @Multiline private String accumulatedEntropy;
+
+  @Test
+  public void accumulatedEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(accumulatedEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    List<Double> expectedOutput = new ArrayList<Double>();
+    // Same expected value as dupValEntropyTest, which uses the same input values.
+    expectedOutput.add(1.834372);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java
index 8ef94c3..e1611a7 100644
--- a/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java
+++ b/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java
@@ -45,16 +45,17 @@ public class EntropyTests extends AbstractEntropyTests
   
   data = load 'input' as (val:double);
   --describe data;
-  data_grouped = GROUP data BY val;
-  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
-  data_cnt_grouped = GROUP data_cnt ALL;
-  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY val;
+                     GENERATE Entropy(data_ordered);
+             };
   store data_out into 'output';
    */
   @Multiline private String entropy;
   
   @Test
-  public void uniqValEntropyTest() throws Exception
+  public void uniqValEmpiricalEntropoyTest() throws Exception
   {
     PigTest test = createPigTestFromString(entropy);
     
@@ -89,12 +90,13 @@ public class EntropyTests extends AbstractEntropyTests
      */
     List<Double> expectedOutput = new ArrayList<Double>();
     expectedOutput.add(2.302585);
+    
     List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
   }
 
   @Test
-  public void singleValEntropyTest() throws Exception
+  public void singleValEmpiricalEntropoyTest() throws Exception
   {
     PigTest test = createPigTestFromString(entropy);
     
@@ -131,11 +133,11 @@ public class EntropyTests extends AbstractEntropyTests
     expectedOutput.add(0.0);
     
     List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
   }
 
   @Test
-  public void dupValEntropyTest() throws Exception
+  public void dupValEmpiricalEntropoyTest() throws Exception
   {
     PigTest test = createPigTestFromString(entropy);
     
@@ -172,11 +174,11 @@ public class EntropyTests extends AbstractEntropyTests
     expectedOutput.add(1.834372);
     
     List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
   }
 
   @Test
-  public void emptyInputBagEntropyTest() throws Exception
+  public void emptyInputBagEmpiricalEntropoyTest() throws Exception
   {
     PigTest test = createPigTestFromString(entropy);
     
@@ -201,11 +203,11 @@ public class EntropyTests extends AbstractEntropyTests
     List<Double> expectedOutput = new ArrayList<Double>();
     
     List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
   }
 
   @Test
-  public void singleElemInputBagEntropyTest() throws Exception
+  public void singleElemInputBagEmpiricalEntropoyTest() throws Exception
   {
     PigTest test = createPigTestFromString(entropy);
     
@@ -228,28 +230,29 @@ public class EntropyTests extends AbstractEntropyTests
     expectedOutput.add(0.0);
     
     List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
   }
 
   /**
   register $JAR_PATH
 
-  define Entropy datafu.pig.stats.entropy.Entropy('$base');
-
+  define Entropy datafu.pig.stats.entropy.Entropy('$type', '$base');
+  
   data = load 'input' as (x:chararray, y:double);
   --describe data;
-  data_grouped = GROUP data BY (x, y);
-  data_cnt = FOREACH data_grouped GENERATE COUNT(data);
-  data_cnt_grouped = GROUP data_cnt ALL;
-  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY x,y;
+                     GENERATE Entropy(data_ordered);
+             };
   store data_out into 'output';
    */
-  @Multiline private String pairLogEntropy;
+  @Multiline private String pairEntropy;
  
   @Test
-  public void dupPairValEntropyTest() throws Exception
+  public void dupPairValEmpiricalEntropoyTest() throws Exception
   {
-    PigTest test = createPigTestFromString(pairLogEntropy, "base=log");
+    PigTest test = createPigTestFromString(pairEntropy, "type=empirical", "base=log");
     
     writeLinesToFile("input",
                      "hadoop	98.94791",
@@ -280,28 +283,29 @@ public class EntropyTests extends AbstractEntropyTests
     expectedOutput.add(1.834372);
     
     List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
   }
 
   /**
   register $JAR_PATH
 
-  define Entropy datafu.pig.stats.entropy.Entropy('$base');
+  define Entropy datafu.pig.stats.entropy.Entropy('$type', '$base');
   
   data = load 'input' as (val:double);
   --describe data;
-  data_grouped = GROUP data BY val;
-  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
-  data_cnt_grouped = GROUP data_cnt ALL;
-  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY val;
+                     GENERATE Entropy(data_ordered);
+             };
   store data_out into 'output';
    */
   @Multiline private String logEntropy;
  
   @Test
-  public void dupValEntropyLog2Test() throws Exception
+  public void dupValEmpiricalEntropoyLog2Test() throws Exception
   {
-    PigTest test = createPigTestFromString(logEntropy, "base=log2");
+    PigTest test = createPigTestFromString(logEntropy, "type=empirical", "base=log2");
     
     writeLinesToFile("input",
                      "98.94791",
@@ -324,25 +328,26 @@ public class EntropyTests extends AbstractEntropyTests
      * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
      * > table(v)
      * v
-     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791
      * 1        1        3        1        2        1        1 
      * > count=c(1,1,3,1,2,1,1)
+     * > freqs=count/sum(count)
      * > library(entropy)
-     * > entropy(count, count/sum(count), c("ML"),c("log2"))
-     * [1] 2.646439
+     * > entropy(count, freqs, c("ML"), c("log2")) 
+     * [1] 2.646439 
      * 
      */
     List<Double> expectedOutput = new ArrayList<Double>();
     expectedOutput.add(2.646439);
     
     List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
   }
 
   @Test
-  public void dupValEntropyLog10Test() throws Exception
+  public void dupValEmpiricalEntropoyLog10Test() throws Exception
   {
-    PigTest test = createPigTestFromString(logEntropy, "base=log10");
+    PigTest test = createPigTestFromString(logEntropy, "type=empirical", "base=log10");
     
     writeLinesToFile("input",
                      "98.94791",
@@ -365,177 +370,20 @@ public class EntropyTests extends AbstractEntropyTests
      * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
      * > table(v)
      * v
-     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791
      * 1        1        3        1        2        1        1 
      * > count=c(1,1,3,1,2,1,1)
+     * > freqs=count/sum(count)
      * > library(entropy)
-     * > entropy(count, count/sum(count), c("ML"),c("log10"))
-     * [1] 0.7966576
+     * > entropy(count, freqs, c("ML"), c("log10")) 
+     * [1] 0.7966576 
      * 
      */
     List<Double> expectedOutput = new ArrayList<Double>();
     expectedOutput.add(0.7966576);
     
     List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
-  }
-
-
-  /**
-  register $JAR_PATH
-
-  define Entropy datafu.pig.stats.entropy.Entropy();
-  
-  data_cnt = load 'input' as (val:int);
-  --describe data_cnt;
-  data_cnt_grouped = GROUP data_cnt ALL;
-  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
-  store data_out into 'output';
-   */
-  @Multiline private String rawValidInputEntropy;
- 
-  @Test
-  public void rawValidInputEntropyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(rawValidInputEntropy); 
-    
-    writeLinesToFile("input",
-                     "0",
-                     "38",
-                     "0",
-                     "62",
-                     "38",
-                     "32",
-                     "96",
-                     "38",
-                     "96",
-                     "0");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > count=c(0, 38, 0, 62, 38, 32, 96, 38, 96, 0)
-     * > library(entropy)
-     * > entropy(count) 
-     * [1] 1.846901 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(1.846901);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
-  }
-
-  /**
-  register $JAR_PATH
-
-  define Entropy datafu.pig.stats.entropy.Entropy();
-  
-  data_cnt = load 'input' as (val:double);
-  --describe data_cnt;
-  data_cnt_grouped = GROUP data_cnt ALL;
-  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
-  store data_out into 'output';
-   */
-  @Multiline private String rawInvalidTypeInputEntropy;
- 
-  @Test
-  public void rawInvalidTypeInputEntropyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(rawInvalidTypeInputEntropy); 
-    
-    writeLinesToFile("input",
-                     "0.0",
-                     "38.0",
-                     "0.0",
-                     "62.0",
-                     "38.0",
-                     "32.001",
-                     "96.002",
-                     "38.01",
-                     "96.00001",
-                     "0.0");
-     try {
-         test.runScript();
-         List<Tuple> output = this.getLinesForAlias(test, "data_out");
-         fail( "Testcase should fail");    
-     } catch (Exception ex) {
-         assertTrue(ex.getMessage().indexOf("Expect the type of the input tuple to be of ([int, long]), but instead found double") >= 0);
-     }
-  }
-
-  @Test
-  public void rawInValidInputValueEntropyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(rawValidInputEntropy); 
-    
-    writeLinesToFile("input",
-                     "0",
-                     "-38",
-                     "0",
-                     "62",
-                     "38",
-                     "32",
-                     "96",
-                     "38",
-                     "96",
-                     "0");
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > count=c(0, -38, 0, 62, 38, 32, 96, 38, 96, 0)
-     * > library(entropy)
-     * > entropy(ifelse(count>0,count,0))
-     * [1] 1.693862 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(1.693862);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
-
-  }
-
-  /**
-  register $JAR_PATH
-
-  define Entropy datafu.pig.stats.entropy.Entropy();
-  
-  data_cnt = load 'input' as (f1:chararray, f2:chararray);
-  --describe data_cnt;
-  data_cnt_grouped = GROUP data_cnt ALL;
-  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
-  store data_out into 'output';
-   */
-  @Multiline private String invalidInputSchemaEntropy;
- 
-  @Test
-  public void invalidInputSchemaEntropyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(invalidInputSchemaEntropy); 
-    
-    writeLinesToFile("input",
-                     "hadoop	98.94791",
-                     "bigdata	38.61010",
-                     "hadoop	97.10575",
-                     "datafu	32.05370",
-                     "bigdata	38.61010",
-                     "datafu	32.05370",
-                     "datafu	32.05370");
-        
-    try {
-         test.runScript();
-         List<Tuple> output = this.getLinesForAlias(test, "data_out");
-         fail( "Testcase should fail");    
-    } catch (Exception ex) {
-         assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 1") >= 0);
-    }
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
   }
 
   /**
@@ -545,26 +393,24 @@ public class EntropyTests extends AbstractEntropyTests
   
   data = load 'input' as (val:double);
   --describe data;
-  data_grouped = GROUP data BY val;
-  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
-  data_cnt_grouped = GROUP data_cnt ALL;
-  data_out = FOREACH data_cnt_grouped  {
-                          data_cnt_ordered = order data_cnt by *;
-                          GENERATE Entropy(data_cnt_ordered);
-                          }
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     GENERATE Entropy(data);
+             };
   store data_out into 'output';
    */
-  @Multiline private String accumulatedEntropy;
-
+  @Multiline private String noOrderEntropy;
+ 
   @Test
-  public void accumulatedEntropyTest() throws Exception
+  public void noOrderEmpiricalEntropoyTest() throws Exception
   {
-    PigTest test = createPigTestFromString(accumulatedEntropy); 
+    PigTest test = createPigTestFromString(noOrderEntropy);
     
     writeLinesToFile("input",
                      "98.94791",
                      "38.61010",
-                     "97.10575",
+                     "38.61010",
+                     "37.10575",
                      "62.28313",
                      "38.61010",
                      "32.05370",
@@ -572,14 +418,12 @@ public class EntropyTests extends AbstractEntropyTests
                      "38.61010",
                      "96.10962",
                      "20.41135");
-        
-    test.runScript();
-    
-    List<Double> expectedOutput = new ArrayList<Double>();
-    //the same output as @test dupValEntropyTest
-    expectedOutput.add(1.834372);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");
+    } catch(Exception ex) {
+    }
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java
deleted file mode 100644
index 77a8e8b..0000000
--- a/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java
+++ /dev/null
@@ -1,373 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.stats.entropy;
-
-import static org.testng.Assert.*;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import datafu.test.pig.PigTests;
-
-/*
- * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
- * used as our test benchmark
- */
-public class StreamingChaoShenEntropyTests extends AbstractEntropyTests
-{
-  /**
-  register $JAR_PATH
-
-  define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type','$base');
-  
-  data = load 'input' as (val:double);
-  --describe data;
-  data_grouped = GROUP data ALL;
-  data_out = FOREACH data_grouped {
-                     data_ordered = ORDER data BY val;
-                     GENERATE Entropy(data_ordered);
-             };
-  store data_out into 'output';
-   */
-  @Multiline private String entropy;
-
-  @Test  
-  public void uniqValStreamingChaoShenEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "38.61010",
-                     "97.10575",
-                     "62.28313",
-                     "38.83960",
-                     "32.05370",
-                     "96.10962",
-                     "28.72388",
-                     "96.65888",
-                     "20.41135");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135) 
-     * > table(v)
-     * v
-     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791 
-     * 1        1        1        1        1        1        1        1        1        1 
-     * > count=c(1,1,1,1,1,1,1,1,1,1)
-     * > library(entropy)
-     * > entropy(count,count/sum(count),c("CS"),c("log"))
-     * [1] 4.816221
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(4.816221);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test  
-  public void singleValStreamingChaoShenEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791) 
-     * > table(v)
-     * v
-     * 98.94791 
-     * 10 
-     * > count=(10)
-     * > entropy(count,count/sum(count),c("CS"),c("log"))
-     * [1] 0 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.0);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test  
-  public void dupValStreamingChaoShenEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy,"type=chaosh", "base=log");
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "38.61010",
-                     "97.10575",
-                     "62.28313",
-                     "38.61010",
-                     "32.05370",
-                     "96.10962",
-                     "38.61010",
-                     "96.10962",
-                     "20.41135");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
-     * > table(v)
-     * v
-     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791 
-     * 1        1        3        1        2        1        1 
-     * > count=c(1,1,3,1,2,1,1)
-     * > library(entropy)
-     * > entropy(count,count/sum(count),c("CS"),c("log"))
-     * [1] 2.57429 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(2.57429);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-
-  @Test  
-  public void emptyInputBagStreamingChaoShenEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
-    
-    writeLinesToFile("input"
-                     );
-
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c() 
-     * > table(v)
-     * < table of extent 0 > 
-     * > count=c()
-     * > library(entropy)
-     * > entropy(count,count/sum(count),c("CS"),c("log"))
-     * [1] 0 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test  
-  public void singleElemInputBagStreamingChaoShenEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
-    
-    writeLinesToFile("input",
-                     "98.94791");
-
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > count=c(1)
-     * > library(entropy)
-     * > entropy(count,count/sum(count),c("CS"),c("log"))
-     * [1] 0
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.0);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  /**
-  register $JAR_PATH
-
-  define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type','$base');
-
-  data = load 'input' as (x:chararray, y:double);
-  --describe data;
-  data_grouped = GROUP data ALL;
-  data_out = FOREACH data_grouped {
-                     data_ordered = ORDER data BY x,y;
-                     GENERATE Entropy(data_ordered);
-             };
-  store data_out into 'output';
-   */
-  @Multiline private String pairEntropy;
- 
-  @Test  
-  public void dupPairValStreamingChaoShenEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(pairEntropy, "type=chaosh", "base=log");
-    
-    writeLinesToFile("input",
-                     "hadoop	98.94791",
-                     "bigdata	38.61010",
-                     "hadoop	97.10575",
-                     "datafu	32.05370",
-                     "bigdata	38.61010",
-                     "datafu	32.05370",
-                     "datafu	32.05370",
-                     "hadoop	38.61010",
-                     "pig	96.10962",
-                     "pig	20.41135");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
-     * > t <- t[order(x,y)]
-     * > count<-c(2,3,1,1,1,1,1)
-     * > library(entropy)
-     * > entropy(count,count/sum(count),c("CS"),c("log"))
-     * [1] 2.57429 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(2.57429);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test  
-  public void dupValStreamingChaoShenEntropoyLog2Test() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy,"type=chaosh", "base=log2");
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "38.61010",
-                     "97.10575",
-                     "62.28313",
-                     "38.61010",
-                     "32.05370",
-                     "96.10962",
-                     "38.61010",
-                     "96.10962",
-                     "20.41135");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
-     * > table(v)
-     * v
-     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791 
-     * 1        1        3        1        2        1        1 
-     * > count=c(1,1,3,1,2,1,1)
-     * > freqs=count/sum(count)
-     * > library(entropy)
-     * > entropy(count,count/sum(count),c("CS"),c("log2"))
-     * [1] 3.713915 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(3.713915);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test  
-  public void dupValStreamingChaoShenEntropoyLog10Test() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log10");
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "38.61010",
-                     "97.10575",
-                     "62.28313",
-                     "38.61010",
-                     "32.05370",
-                     "96.10962",
-                     "38.61010",
-                     "96.10962",
-                     "20.41135");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
-     * > table(v)
-     * v
-     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791 
-     * 1        1        3        1        2        1        1 
-     * > count=c(1,1,3,1,2,1,1)
-     * > library(entropy)
-     * > entropy(count,count/sum(count),c("CS"),c("log10"))
-     * [1] 1.118 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(1.118);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java
deleted file mode 100644
index 151bf8b..0000000
--- a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.stats.entropy;
-
-import static org.testng.Assert.*;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import datafu.test.pig.PigTests;
-
-
-/*
- * Uses the R function condentropy (infotheo package) to compute conditional entropy as the test benchmark:
- * http://cran.r-project.org/web/packages/infotheo/infotheo.pdf
- */
-public class StreamingEmpiricalCondEntropyTests extends AbstractEntropyTests
-{
-  /**
-  register $JAR_PATH
-
-  define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy();
-  
-  data = load 'input' as (valX:double, valY:chararray);
-  --describe data;
-  data_grouped = GROUP data ALL;
-  data_out = FOREACH data_grouped {
-                     data_ordered = ORDER data BY *;
-                     GENERATE CondEntropy(data_ordered);
-             };
-  store data_out into 'output';
-   */
-  @Multiline private String condEntropy;
-  
-  @Test
-  public void uniqValStreamingEmpiricalCondEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(condEntropy); 
-    
-    writeLinesToFile("input",
-                     "98.94791	click",
-                     "38.61010	view",
-                     "97.10575	view",
-                     "62.28313	click",
-                     "38.83960	click",
-                     "32.05370	view",
-                     "96.10962	view",
-                     "28.72388	click",
-                     "96.65888	view",
-                     "20.41135	click");
-        
-    test.runScript();
-   
-    /*
-     * library(infotheo)
-     * X=c("98.94791","38.61010","97.10575","62.28313","38.83960","32.05370","96.10962","28.72388","96.65888","20.41135")
-     * Y=c("click","view","view","click","click","view","view","click","view","click")
-     * condentropy(Y,X)
-     * [1] 0
-     */ 
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.0);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test
-  public void singleValStreamingEmpiricalCondEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(condEntropy);
-    
-    writeLinesToFile("input",
-                     "98.94791	click",
-                     "98.94791	click",
-                     "98.94791	click",
-                     "98.94791	click",
-                     "98.94791	click",
-                     "98.94791	click",
-                     "98.94791	click",
-                     "98.94791	click",
-                     "98.94791	click",
-                     "98.94791	click");
-        
-    test.runScript();
-
-    /*
-     * library(infotheo)
-     * X=c("98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791")
-     * Y=c("click","click","click","click","click","click","click","click","click","click")
-     * condentropy(Y,X)
-     * [1] 0
-     */ 
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.0);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test
-  public void dupValStreamingEmpiricalCondEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(condEntropy); 
-    
-    writeLinesToFile("input",
-                     "98.94791	click",
-                     "38.61010	click",
-                     "97.10575	view",
-                     "62.28313	view",
-                     "38.61010	view",
-                     "32.05370	view",
-                     "96.10962	click",
-                     "38.61010	click",
-                     "96.10962	view",
-                     "20.41135	click");
-        
-    test.runScript();
- 
-    /*
-     * library(infotheo)
-     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
-     * Y=c("click","click","view","view","view","view","click","click","view","click")
-     * condentropy(Y,X)
-     * [1] 0.3295837 
-     */    
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.3295837);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test
-  public void emptyInputBagStreamingEmpiricalCondEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(condEntropy);
-    
-    writeLinesToFile("input"
-                     );
-
-    test.runScript();
-    
-    List<Double> expectedOutput = new ArrayList<Double>();
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test
-  public void singleElemInputBagStreamingEmpiricalCondEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(condEntropy);
-    
-    writeLinesToFile("input",
-                     "98.94791	view");
-
-    test.runScript();
-     /*
-     * library(infotheo)
-     * X = c("98.94791")
-     * Y = c("view")
-     * condentropy(Y,X)
-     * [1] 0
-     */      
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.0);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  /**
-  register $JAR_PATH
-
-  define Entropy datafu.pig.stats.entropy.StreamingCondEntropy();
-  
-  data = load 'input' as (valX1:chararray, valX2:chararray, valY:chararray);
-  data = foreach data generate (valX1, valX2) as X, valY as Y;
-  --describe data;
-  data_grouped = GROUP data ALL;
-  data_out = FOREACH data_grouped {
-                     data_ordered = ORDER data BY *;
-                     GENERATE Entropy(data_ordered);
-             };
-
-  store data_out into 'output';
-   */
-  @Multiline private String pairCondEntropy;
- 
-  @Test
-  public void dupPairValStreamingEmpiricalCondEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(pairCondEntropy);
-    
-    writeLinesToFile("input",
-                     "hadoop	bigdata	click",
-                     "hadoop	pig	view",
-                     "hadoop	datafu	click",
-                     "datafu	pig	click",
-                     "bigdata	pig	view",
-                     "datafu	pig	click",
-                     "datafu	pig	view",
-                     "hadoop	bigdata	view",
-                     "pig	datafu	view",
-                     "pig	datafu	view");
-        
-    test.runScript();
-
-    /*
-     * library(infotheo)
-     * X=c("hadoop bigdata","hadoop pig","hadoop datafu","datafu pig","bigdata pig","datafu pig","datafu pig","hadoop bigdata","pig datafu","pig datafu")
-     * Y=c("click","view","click","click","view","click","view","view","view","view")
-     * condentropy(Y,X)
-     * [1] 0.3295837
-     */   
- 
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.3295837);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
-  }
-
-  /**
-  register $JAR_PATH
-
-  define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy('$type','$base');
-  
-  data = load 'input' as (valX:double, valY:chararray);
-  --describe data;
-  data_grouped = GROUP data ALL;
-  data_out = FOREACH data_grouped {
-                     data_ordered = ORDER data BY *;
-                     GENERATE CondEntropy(data_ordered);
-             };
-  store data_out into 'output';
-   */
-  @Multiline private String condLogEntropy;
- 
-  @Test
-  public void dupValStreamingEmpiricalCondEntropoyLog2Test() throws Exception
-  {
-    PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log2");
- 
-    writeLinesToFile("input",
-                     "98.94791	click",
-                     "38.61010	click",
-                     "97.10575	view",
-                     "62.28313	view",
-                     "38.61010	view",
-                     "32.05370	view",
-                     "96.10962	click",
-                     "38.61010	click",
-                     "96.10962	view",
-                     "20.41135	click");
- 
-    test.runScript();
- 
-    /*
-     * library(infotheo)
-     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
-     * Y=c("click","click","view","view","view","view","click","click","view","click")
-     * condentropy(Y,X)/log(2)
-     * [1] 0.4754888 
-     */       
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.4754888);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test
-  public void dupValStreamingEmpiricalCondEntropoyLog10Test() throws Exception
-  {
-    PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log10");
- 
-    writeLinesToFile("input",
-                     "98.94791	click",
-                     "38.61010	click",
-                     "97.10575	view",
-                     "62.28313	view",
-                     "38.61010	view",
-                     "32.05370	view",
-                     "96.10962	click",
-                     "38.61010	click",
-                     "96.10962	view",
-                     "20.41135	click");
-    
-    test.runScript();
- 
-    /*
-     * library(infotheo)
-     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
-     * Y=c("click","click","view","view","view","view","click","click","view","click")
-     * condentropy(Y,X)/log(10)
-     * [1] 0.1431364 
-     */      
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.1431364);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5); 
-  }
-
-  /**
-  register $JAR_PATH
-
-  define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy();
-  
-  data = load 'input' as (valX:double, valY:chararray);
-  --describe data;
-  data_grouped = GROUP data ALL;
-  data_out = FOREACH data_grouped {
-                     GENERATE CondEntropy(data);
-             };
-  store data_out into 'output';
-   */
-  @Multiline private String noOrderCondEntropy;
-  
-  @Test
-  public void noOrderStreamingEmpiricalCondEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(noOrderCondEntropy); 
-    
-    writeLinesToFile("input",
-                     "98.94791	click",
-                     "38.61010	view",
-                     "97.10575	view",
-                     "62.28313	click",
-                     "38.83960	click",
-                     "32.05370	view",
-                     "96.10962	view",
-                     "28.72388	click",
-                     "96.65888	view",
-                     "20.41135	click");
-
-    try {
-         test.runScript();
-         List<Tuple> output = this.getLinesForAlias(test, "data_out");
-         fail( "Testcase should fail");
-    } catch(Exception ex) {}
-  }
-
-  /**
-  register $JAR_PATH
-
-  define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy();
-  
-  data = load 'input' as (valX:double);
-  --describe data;
-  data_grouped = GROUP data ALL;
-  data_out = FOREACH data_grouped {
-                     data_ordered = ORDER data BY *;
-                     GENERATE CondEntropy(data_ordered);
-             };
-  store data_out into 'output';
-   */
-  @Multiline private String invalidInputCondEntropy;
- 
-  @Test
-  public void invalidInputStreamingEmpiricalCondEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(invalidInputCondEntropy); 
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "38.61010",
-                     "97.10575",
-                     "62.28313",
-                     "38.83960",
-                     "32.05370",
-                     "96.10962",
-                     "28.72388",
-                     "96.65888",
-                     "20.41135");
-
-    try {
-         test.runScript();
-         List<Tuple> output = this.getLinesForAlias(test, "data_out");
-         fail( "Testcase should fail");
-    } catch(Exception ex) {
-         assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 2") >= 0);
-    }
-  }
-
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java
deleted file mode 100644
index a98d79f..0000000
--- a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java
+++ /dev/null
@@ -1,429 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.stats.entropy;
-
-import static org.testng.Assert.*;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import datafu.test.pig.PigTests;
-
-/*
- * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
- * used as our test benchmark 
- */
-public class StreamingEmpiricalEntropyTests extends AbstractEntropyTests
-{
-  /**
-  register $JAR_PATH
-
-  define Entropy datafu.pig.stats.entropy.StreamingEntropy();
-  
-  data = load 'input' as (val:double);
-  --describe data;
-  data_grouped = GROUP data ALL;
-  data_out = FOREACH data_grouped {
-                     data_ordered = ORDER data BY val;
-                     GENERATE Entropy(data_ordered);
-             };
-  store data_out into 'output';
-   */
-  @Multiline private String entropy;
-  
-  @Test
-  public void uniqValStreamingEmpiricalEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy);
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "38.61010",
-                     "97.10575",
-                     "62.28313",
-                     "38.83960",
-                     "32.05370",
-                     "96.10962",
-                     "28.72388",
-                     "96.65888",
-                     "20.41135");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135) 
-     * > table(v)
-     * v
-     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791 
-     * 1        1        1        1        1        1        1        1        1        1 
-     * > count=c(1,1,1,1,1,1,1,1,1,1)
-     * > library(entropy)
-     * > entropy(count)
-     * [1] 2.302585
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(2.302585);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test
-  public void singleValStreamingEmpiricalEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy);
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791",
-                     "98.94791");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791) 
-     * > table(v)
-     * v
-     * 98.94791 
-     * 10 
-     * > count=(10)
-     * > library(entropy)
-     * > entropy(count)
-     * [1] 0
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.0);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test
-  public void dupValStreamingEmpiricalEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy);
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "38.61010",
-                     "97.10575",
-                     "62.28313",
-                     "38.61010",
-                     "32.05370",
-                     "96.10962",
-                     "38.61010",
-                     "96.10962",
-                     "20.41135");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
-     * > table(v)
-     * v
-     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
-     * 1        1        3        1        2        1        1 
-     * > count=c(1,1,3,1,2,1,1)
-     * > library(entropy)
-     * > entropy(count)
-     * [1] 1.834372
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(1.834372);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test
-  public void emptyInputBagStreamingEmpiricalEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy);
-    
-    writeLinesToFile("input"
-                     );
-
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c() 
-     * > table(v)
-     * < table of extent 0 > 
-     * > count=c()
-     * > library(entropy)
-     * > entropy(count)
-     * [1] 0 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test
-  public void singleElemInputBagStreamingEmpiricalEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(entropy);
-    
-    writeLinesToFile("input",
-                     "98.94791");
-
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > count=c(1)
-     * > library(entropy)
-     * > entropy(count)
-     * [1] 0
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.0);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  /**
-  register $JAR_PATH
-
-  define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type', '$base');
-  
-  data = load 'input' as (x:chararray, y:double);
-  --describe data;
-  data_grouped = GROUP data ALL;
-  data_out = FOREACH data_grouped {
-                     data_ordered = ORDER data BY x,y;
-                     GENERATE Entropy(data_ordered);
-             };
-  store data_out into 'output';
-   */
-  @Multiline private String pairEntropy;
- 
-  @Test
-  public void dupPairValStreamingEmpiricalEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(pairEntropy, "type=empirical", "base=log");
-    
-    writeLinesToFile("input",
-                     "hadoop	98.94791",
-                     "bigdata	38.61010",
-                     "hadoop	97.10575",
-                     "datafu	32.05370",
-                     "bigdata	38.61010",
-                     "datafu	32.05370",
-                     "datafu	32.05370",
-                     "hadoop	38.61010",
-                     "pig	96.10962",
-                     "pig	20.41135");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
-     * > t <- t[order(x,y)]
-     * > count<-c(2,3,1,1,1,1,1)
-     * > library(entropy)
-     * > entropy(count)
-     * [1] 1.834372 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(1.834372);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  /**
-  register $JAR_PATH
-
-  define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type', '$base');
-  
-  data = load 'input' as (val:double);
-  --describe data;
-  data_grouped = GROUP data ALL;
-  data_out = FOREACH data_grouped {
-                     data_ordered = ORDER data BY val;
-                     GENERATE Entropy(data_ordered);
-             };
-  store data_out into 'output';
-   */
-  @Multiline private String logEntropy;
- 
-  @Test
-  public void dupValStreamingEmpiricalEntropoyLog2Test() throws Exception
-  {
-    PigTest test = createPigTestFromString(logEntropy, "type=empirical", "base=log2");
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "38.61010",
-                     "97.10575",
-                     "62.28313",
-                     "38.61010",
-                     "32.05370",
-                     "96.10962",
-                     "38.61010",
-                     "96.10962",
-                     "20.41135");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
-     * > table(v)
-     * v
-     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
-     * 1        1        3        1        2        1        1 
-     * > count=c(1,1,3,1,2,1,1)
-     * > freqs=count/sum(count)
-     * > library(entropy)
-     * > entropy(count, freqs, c("ML"), c("log2")) 
-     * [1] 2.646439 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(2.646439);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  @Test
-  public void dupValStreamingEmpiricalEntropoyLog10Test() throws Exception
-  {
-    PigTest test = createPigTestFromString(logEntropy, "type=empirical", "base=log10");
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "38.61010",
-                     "97.10575",
-                     "62.28313",
-                     "38.61010",
-                     "32.05370",
-                     "96.10962",
-                     "38.61010",
-                     "96.10962",
-                     "20.41135");
-        
-    test.runScript();
-    
-    /* Add expected values, computed using R:
-     * 
-     * e.g.
-     * 
-     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
-     * > table(v)
-     * v
-     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
-     * 1        1        3        1        2        1        1 
-     * > count=c(1,1,3,1,2,1,1)
-     * > freqs=count/sum(count)
-     * > library(entropy)
-     * > entropy(count, freqs, c("ML"), c("log10")) 
-     * [1] 0.7966576 
-     * 
-     */
-    List<Double> expectedOutput = new ArrayList<Double>();
-    expectedOutput.add(0.7966576);
-    
-    List<Tuple> output = this.getLinesForAlias(test, "data_out");
-    verifyEqualEntropyOutput(expectedOutput, output, 5);
-  }
-
-  /**
-  register $JAR_PATH
-
-  define Entropy datafu.pig.stats.entropy.StreamingEntropy();
-  
-  data = load 'input' as (val:double);
-  --describe data;
-  data_grouped = GROUP data ALL;
-  data_out = FOREACH data_grouped {
-                     GENERATE Entropy(data);
-             };
-  store data_out into 'output';
-   */
-  @Multiline private String noOrderEntropy;
- 
-  @Test
-  public void noOrderStreamingEmpiricalEntropoyTest() throws Exception
-  {
-    PigTest test = createPigTestFromString(noOrderEntropy);
-    
-    writeLinesToFile("input",
-                     "98.94791",
-                     "38.61010",
-                     "38.61010",
-                     "37.10575",
-                     "62.28313",
-                     "38.61010",
-                     "32.05370",
-                     "96.10962",
-                     "38.61010",
-                     "96.10962",
-                     "20.41135");
-
-    try {
-         test.runScript();
-         List<Tuple> output = this.getLinesForAlias(test, "data_out");
-         fail( "Testcase should fail");
-    } catch(Exception ex) {
-    }
-  }
-}


Mime
View raw message