datafu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mha...@apache.org
Subject [1/2] DATAFU-2 UDFs for entropy and weighted sampling algorithms
Date Wed, 22 Jan 2014 18:45:08 GMT
Updated Branches:
  refs/heads/master 9d7dffd65 -> e80841468


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/e8084146/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java
new file mode 100644
index 0000000..8ef94c3
--- /dev/null
+++ b/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java
@@ -0,0 +1,585 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+/*
+ * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
+ * used as our test benchmark 
+ */
+public class EntropyTests extends AbstractEntropyTests
+{
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.Entropy();
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data BY val;
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String entropy;
+  
+  @Test
+  public void uniqValEntropyTest() throws Exception
+  {
+    /* Reference entropy for ten distinct values (every count is 1),
+     * computed using R:
+     *
+     * e.g.
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135)
+     * > table(v)
+     * v
+     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791
+     * 1        1        1        1        1        1        1        1        1        1
+     * > count=c(1,1,1,1,1,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 2.302585
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(2.302585);
+
+    final PigTest pigTest = createPigTestFromString(entropy);
+
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.83960",
+                     "32.05370",
+                     "96.10962",
+                     "28.72388",
+                     "96.65888",
+                     "20.41135");
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  @Test
+  public void singleValEntropyTest() throws Exception
+  {
+    /* Reference entropy for one value repeated ten times (a single
+     * count of 10), computed using R:
+     *
+     * e.g.
+     * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791)
+     * > table(v)
+     * v
+     * 98.94791
+     * 10
+     * > count=(10)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(0.0);
+
+    final PigTest pigTest = createPigTestFromString(entropy);
+
+    writeLinesToFile("input",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  @Test
+  public void dupValEntropyTest() throws Exception
+  {
+    /* Reference entropy for input with repeated values (counts
+     * 1,1,3,1,2,1,1), computed using R:
+     *
+     * e.g.
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791
+     * 1        1        3        1        2        1        1
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.834372
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(1.834372);
+
+    final PigTest pigTest = createPigTestFromString(entropy);
+
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  @Test
+  public void emptyInputBagEntropyTest() throws Exception
+  {
+    /* R reports 0 for an empty count vector (transcript below), but this
+     * test expects the script to produce no output rows for empty input.
+     *
+     * e.g.
+     * > v=c()
+     * > table(v)
+     * < table of extent 0 >
+     * > count=c()
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+
+    final PigTest pigTest = createPigTestFromString(entropy);
+
+    // deliberately no lines are passed for the input
+    writeLinesToFile("input");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  @Test
+  public void singleElemInputBagEntropyTest() throws Exception
+  {
+    /* Reference entropy for a single input row (one count of 1),
+     * computed using R:
+     *
+     * e.g.
+     * > count=c(1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(0.0);
+
+    final PigTest pigTest = createPigTestFromString(entropy);
+
+    writeLinesToFile("input",
+                     "98.94791");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.Entropy('$base');
+
+  data = load 'input' as (x:chararray, y:double);
+  --describe data;
+  data_grouped = GROUP data BY (x, y);
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data);
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String pairLogEntropy;
+ 
+  @Test
+  public void dupPairValEntropyTest() throws Exception
+  {
+    /* Reference entropy over the counts of distinct (x, y) pairs, with
+     * the UDF configured as base=log; computed using R:
+     * e.g.
+     * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
+     * > t <- t[order(x,y)]
+     * > count<-c(2,3,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.834372
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(1.834372);
+
+    final PigTest pigTest = createPigTestFromString(pairLogEntropy, "base=log");
+
+    writeLinesToFile("input",
+                     "hadoop	98.94791",
+                     "bigdata	38.61010",
+                     "hadoop	97.10575",
+                     "datafu	32.05370",
+                     "bigdata	38.61010",
+                     "datafu	32.05370",
+                     "datafu	32.05370",
+                     "hadoop	38.61010",
+                     "pig	96.10962",
+                     "pig	20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.Entropy('$base');
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data BY val;
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String logEntropy;
+ 
+  @Test
+  public void dupValEntropyLog2Test() throws Exception
+  {
+    /* Reference entropy in log2 units for input with repeated values,
+     * computed using R:
+     *
+     * e.g.
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791
+     * 1        1        3        1        2        1        1
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count, count/sum(count), c("ML"),c("log2"))
+     * [1] 2.646439
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(2.646439);
+
+    final PigTest pigTest = createPigTestFromString(logEntropy, "base=log2");
+
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  @Test
+  public void dupValEntropyLog10Test() throws Exception
+  {
+    /* Reference entropy in log10 units for input with repeated values,
+     * computed using R:
+     *
+     * e.g.
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791
+     * 1        1        3        1        2        1        1
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count, count/sum(count), c("ML"),c("log10"))
+     * [1] 0.7966576
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(0.7966576);
+
+    final PigTest pigTest = createPigTestFromString(logEntropy, "base=log10");
+
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.Entropy();
+  
+  data_cnt = load 'input' as (val:int);
+  --describe data_cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String rawValidInputEntropy;
+ 
+  @Test
+  public void rawValidInputEntropyTest() throws Exception
+  {
+    /* The script loads the counts directly as int values (zeros
+     * included); reference entropy computed using R:
+     *
+     * e.g.
+     * > count=c(0, 38, 0, 62, 38, 32, 96, 38, 96, 0)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.846901
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(1.846901);
+
+    final PigTest pigTest = createPigTestFromString(rawValidInputEntropy);
+
+    writeLinesToFile("input",
+                     "0",
+                     "38",
+                     "0",
+                     "62",
+                     "38",
+                     "32",
+                     "96",
+                     "38",
+                     "96",
+                     "0");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.Entropy();
+  
+  data_cnt = load 'input' as (val:double);
+  --describe data_cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String rawInvalidTypeInputEntropy;
+ 
+  @Test
+  public void rawInvalidTypeInputEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(rawInvalidTypeInputEntropy); 
+    
+    writeLinesToFile("input",
+                     "0.0",
+                     "38.0",
+                     "0.0",
+                     "62.0",
+                     "38.0",
+                     "32.001",
+                     "96.002",
+                     "38.01",
+                     "96.00001",
+                     "0.0");
+     try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");    
+     } catch (Exception ex) {
+         assertTrue(ex.getMessage().indexOf("Expect the type of the input tuple to be of ([int, long]), but instead found double") >= 0);
+     }
+  }
+
+  @Test
+  public void rawInValidInputValueEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(rawValidInputEntropy); 
+    
+    writeLinesToFile("input",
+                     "0",
+                     "-38",
+                     "0",
+                     "62",
+                     "38",
+                     "32",
+                     "96",
+                     "38",
+                     "96",
+                     "0");
+
+    test.runScript(); // run explicitly, as every other test in this class does
+    /* Expected value computed using R. The reference clamps negative
+     * counts to 0 first, i.e. the -38 is expected not to contribute.
+     * 
+     * e.g.
+     * > count=c(0, -38, 0, 62, 38, 32, 96, 38, 96, 0)
+     * > library(entropy)
+     * > entropy(ifelse(count>0,count,0))
+     * [1] 1.693862 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(1.693862);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.Entropy();
+  
+  data_cnt = load 'input' as (f1:chararray, f2:chararray);
+  --describe data_cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+  store data_out into 'output';
+   */
+  @Multiline private String invalidInputSchemaEntropy;
+ 
+  @Test
+  public void invalidInputSchemaEntropyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(invalidInputSchemaEntropy); 
+    
+    writeLinesToFile("input",
+                     "hadoop	98.94791",
+                     "bigdata	38.61010",
+                     "hadoop	97.10575",
+                     "datafu	32.05370",
+                     "bigdata	38.61010",
+                     "datafu	32.05370",
+                     "datafu	32.05370");
+        
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");    
+    } catch (Exception ex) {
+         assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 1") >= 0);
+    }
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.Entropy();
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data BY val;
+  data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+  data_cnt_grouped = GROUP data_cnt ALL;
+  data_out = FOREACH data_cnt_grouped  {
+                          data_cnt_ordered = order data_cnt by *;
+                          GENERATE Entropy(data_cnt_ordered);
+                          }
+  store data_out into 'output';
+   */
+  @Multiline private String accumulatedEntropy;
+
+  @Test
+  public void accumulatedEntropyTest() throws Exception
+  {
+    // same input, and therefore the same expected value, as dupValEntropyTest
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(1.834372);
+
+    final PigTest pigTest = createPigTestFromString(accumulatedEntropy);
+
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/e8084146/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java
new file mode 100644
index 0000000..77a8e8b
--- /dev/null
+++ b/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java
@@ -0,0 +1,373 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+/*
+ * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
+ * used as our test benchmark
+ */
+public class StreamingChaoShenEntropyTests extends AbstractEntropyTests
+{
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type','$base');
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY val;
+                     GENERATE Entropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String entropy;
+
+  @Test
+  public void uniqValStreamingChaoShenEntropoyTest() throws Exception
+  {
+    /* Reference value from R's "CS" estimator, unit "log", for ten
+     * distinct values (every count is 1):
+     *
+     * e.g.
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135)
+     * > table(v)
+     * v
+     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791
+     * 1        1        1        1        1        1        1        1        1        1
+     * > count=c(1,1,1,1,1,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 4.816221
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(4.816221);
+
+    final PigTest pigTest = createPigTestFromString(entropy, "type=chaosh", "base=log");
+
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.83960",
+                     "32.05370",
+                     "96.10962",
+                     "28.72388",
+                     "96.65888",
+                     "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  @Test
+  public void singleValStreamingChaoShenEntropoyTest() throws Exception
+  {
+    /* Reference value from R's "CS" estimator, unit "log", for one
+     * value repeated ten times (a single count of 10):
+     *
+     * e.g.
+     * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791)
+     * > table(v)
+     * v
+     * 98.94791
+     * 10
+     * > count=(10)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 0
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(0.0);
+
+    final PigTest pigTest = createPigTestFromString(entropy, "type=chaosh", "base=log");
+
+    writeLinesToFile("input",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  @Test
+  public void dupValStreamingChaoShenEntropoyTest() throws Exception
+  {
+    /* Reference value from R's "CS" estimator, unit "log", for input
+     * with repeated values (counts 1,1,3,1,2,1,1):
+     *
+     * e.g.
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791
+     * 1        1        3        1        2        1        1
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 2.57429
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(2.57429);
+
+    final PigTest pigTest = createPigTestFromString(entropy,"type=chaosh", "base=log");
+
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+
+  @Test
+  public void emptyInputBagStreamingChaoShenEntropoyTest() throws Exception
+  {
+    /* R reports 0 for an empty count vector (transcript below), but this
+     * test expects the script to produce no output rows for empty input.
+     *
+     * e.g.
+     * > v=c()
+     * > table(v)
+     * < table of extent 0 >
+     * > count=c()
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 0
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+
+    final PigTest pigTest = createPigTestFromString(entropy, "type=chaosh", "base=log");
+
+    // deliberately no lines are passed for the input
+    writeLinesToFile("input");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  @Test
+  public void singleElemInputBagStreamingChaoShenEntropoyTest() throws Exception
+  {
+    /* Reference value from R's "CS" estimator, unit "log", for a
+     * single input row (one count of 1):
+     *
+     * e.g.
+     * > count=c(1)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 0
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(0.0);
+
+    final PigTest pigTest = createPigTestFromString(entropy, "type=chaosh", "base=log");
+
+    writeLinesToFile("input",
+                     "98.94791");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type','$base');
+
+  data = load 'input' as (x:chararray, y:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY x,y;
+                     GENERATE Entropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String pairEntropy;
+ 
+  @Test
+  public void dupPairValStreamingChaoShenEntropoyTest() throws Exception
+  {
+    /* Reference value from R's "CS" estimator, unit "log", over the
+     * counts of distinct (x, y) pairs:
+     * e.g.
+     * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
+     * > t <- t[order(x,y)]
+     * > count<-c(2,3,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log"))
+     * [1] 2.57429
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(2.57429);
+
+    final PigTest pigTest = createPigTestFromString(pairEntropy, "type=chaosh", "base=log");
+
+    writeLinesToFile("input",
+                     "hadoop	98.94791",
+                     "bigdata	38.61010",
+                     "hadoop	97.10575",
+                     "datafu	32.05370",
+                     "bigdata	38.61010",
+                     "datafu	32.05370",
+                     "datafu	32.05370",
+                     "hadoop	38.61010",
+                     "pig	96.10962",
+                     "pig	20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  @Test
+  public void dupValStreamingChaoShenEntropoyLog2Test() throws Exception
+  {
+    /* Reference value from R's "CS" estimator, unit "log2", for input
+     * with repeated values (counts 1,1,3,1,2,1,1):
+     *
+     * e.g.
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791
+     * 1        1        3        1        2        1        1
+     * > count=c(1,1,3,1,2,1,1)
+     * > freqs=count/sum(count)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log2"))
+     * [1] 3.713915
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(3.713915);
+
+    final PigTest pigTest = createPigTestFromString(entropy,"type=chaosh", "base=log2");
+
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+  @Test
+  public void dupValStreamingChaoShenEntropoyLog10Test() throws Exception
+  {
+    /* Reference value from R's "CS" estimator, unit "log10", for input
+     * with repeated values (counts 1,1,3,1,2,1,1):
+     *
+     * e.g.
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791
+     * 1        1        3        1        2        1        1
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count,count/sum(count),c("CS"),c("log10"))
+     * [1] 1.118
+     *
+     */
+    final List<Double> expected = new ArrayList<Double>();
+    expected.add(1.118);
+
+    final PigTest pigTest = createPigTestFromString(entropy, "type=chaosh", "base=log10");
+
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+
+    pigTest.runScript();
+
+    final List<Tuple> actual = getLinesForAlias(pigTest, "data_out");
+    verifyEqualEntropyOutput(expected, actual, 5);
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/e8084146/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java
new file mode 100644
index 0000000..151bf8b
--- /dev/null
+++ b/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java
@@ -0,0 +1,412 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+
+/*
+ * Use R's condentropy function to compute conditional entropy as the test benchmark
+ * http://cran.r-project.org/web/packages/infotheo/infotheo.pdf
+ */
+public class StreamingEmpiricalCondEntropyTests extends AbstractEntropyTests
+{
+  /**
+  register $JAR_PATH
+
+  define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy();
+  
+  data = load 'input' as (valX:double, valY:chararray);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE CondEntropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String condEntropy;
+  
+  @Test
+  public void uniqValStreamingEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	view",
+                     "97.10575	view",
+                     "62.28313	click",
+                     "38.83960	click",
+                     "32.05370	view",
+                     "96.10962	view",
+                     "28.72388	click",
+                     "96.65888	view",
+                     "20.41135	click");
+        
+    test.runScript();
+   
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.83960","32.05370","96.10962","28.72388","96.65888","20.41135")
+     * Y=c("click","view","view","click","click","view","view","click","view","click")
+     * condentropy(Y,X)
+     * [1] 0
+     */ 
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void singleValStreamingEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy);
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click",
+                     "98.94791	click");
+        
+    test.runScript();
+
+    /*
+     * library(infotheo)
+     * X=c("98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791")
+     * Y=c("click","click","click","click","click","click","click","click","click","click")
+     * condentropy(Y,X)
+     * [1] 0
+     */ 
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void dupValStreamingEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	click",
+                     "97.10575	view",
+                     "62.28313	view",
+                     "38.61010	view",
+                     "32.05370	view",
+                     "96.10962	click",
+                     "38.61010	click",
+                     "96.10962	view",
+                     "20.41135	click");
+        
+    test.runScript();
+ 
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+     * Y=c("click","click","view","view","view","view","click","click","view","click")
+     * condentropy(Y,X)
+     * [1] 0.3295837 
+     */    
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.3295837);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void emptyInputBagStreamingEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy);
+    
+    writeLinesToFile("input"
+                     );
+
+    test.runScript();
+    
+    List<Double> expectedOutput = new ArrayList<Double>();
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void singleElemInputBagStreamingEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(condEntropy);
+    
+    writeLinesToFile("input",
+                     "98.94791	view");
+
+    test.runScript();
+     /*
+     * library(infotheo)
+     * X = c("98.94791")
+     * Y = c("view")
+     * condentropy(Y,X)
+     * [1] 0
+     */      
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.StreamingCondEntropy();
+  
+  data = load 'input' as (valX1:chararray, valX2:chararray, valY:chararray);
+  data = foreach data generate (valX1, valX2) as X, valY as Y;
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE Entropy(data_ordered);
+             };
+
+  store data_out into 'output';
+   */
+  @Multiline private String pairCondEntropy;
+ 
+  @Test
+  public void dupPairValStreamingEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(pairCondEntropy);
+    
+    writeLinesToFile("input",
+                     "hadoop	bigdata	click",
+                     "hadoop	pig	view",
+                     "hadoop	datafu	click",
+                     "datafu	pig	click",
+                     "bigdata	pig	view",
+                     "datafu	pig	click",
+                     "datafu	pig	view",
+                     "hadoop	bigdata	view",
+                     "pig	datafu	view",
+                     "pig	datafu	view");
+        
+    test.runScript();
+
+    /*
+     * library(infotheo)
+     * X=c("hadoop bigdata","hadoop pig","hadoop datafu","datafu pig","bigdata pig","datafu pig","datafu pig","hadoop bigdata","pig datafu","pig datafu")
+     * Y=c("click","view","click","click","view","click","view","view","view","view")
+     * condentropy(Y,X)
+     * [1] 0.3295837
+     */   
+ 
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.3295837);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+  register $JAR_PATH
+
+  define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy('$type','$base');
+  
+  data = load 'input' as (valX:double, valY:chararray);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE CondEntropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String condLogEntropy;
+ 
+  @Test
+  public void dupValStreamingEmpiricalCondEntropoyLog2Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log2");
+ 
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	click",
+                     "97.10575	view",
+                     "62.28313	view",
+                     "38.61010	view",
+                     "32.05370	view",
+                     "96.10962	click",
+                     "38.61010	click",
+                     "96.10962	view",
+                     "20.41135	click");
+ 
+    test.runScript();
+ 
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+     * Y=c("click","click","view","view","view","view","click","click","view","click")
+     * condentropy(Y,X)/log(2)
+     * [1] 0.4754888 
+     */       
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.4754888);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void dupValStreamingEmpiricalCondEntropoyLog10Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log10");
+ 
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	click",
+                     "97.10575	view",
+                     "62.28313	view",
+                     "38.61010	view",
+                     "32.05370	view",
+                     "96.10962	click",
+                     "38.61010	click",
+                     "96.10962	view",
+                     "20.41135	click");
+    
+    test.runScript();
+ 
+    /*
+     * library(infotheo)
+     * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+     * Y=c("click","click","view","view","view","view","click","click","view","click")
+     * condentropy(Y,X)/log(10)
+     * [1] 0.1431364 
+     */      
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.1431364);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5); 
+  }
+
+  /**
+  register $JAR_PATH
+
+  define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy();
+  
+  data = load 'input' as (valX:double, valY:chararray);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     GENERATE CondEntropy(data);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String noOrderCondEntropy;
+  
+  @Test
+  public void noOrderStreamingEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(noOrderCondEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791	click",
+                     "38.61010	view",
+                     "97.10575	view",
+                     "62.28313	click",
+                     "38.83960	click",
+                     "32.05370	view",
+                     "96.10962	view",
+                     "28.72388	click",
+                     "96.65888	view",
+                     "20.41135	click");
+
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");
+    } catch(Exception ex) {}
+  }
+
+  /**
+  register $JAR_PATH
+
+  define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy();
+  
+  data = load 'input' as (valX:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY *;
+                     GENERATE CondEntropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String invalidInputCondEntropy;
+ 
+  @Test
+  public void invalidInputStreamingEmpiricalCondEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(invalidInputCondEntropy); 
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.83960",
+                     "32.05370",
+                     "96.10962",
+                     "28.72388",
+                     "96.65888",
+                     "20.41135");
+
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");
+    } catch(Exception ex) {
+         assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 2") >= 0);
+    }
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/e8084146/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java
new file mode 100644
index 0000000..a98d79f
--- /dev/null
+++ b/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java
@@ -0,0 +1,429 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+/*
+ * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
+ * used as our test benchmark 
+ */
+public class StreamingEmpiricalEntropyTests extends AbstractEntropyTests
+{
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.StreamingEntropy();
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY val;
+                     GENERATE Entropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String entropy;
+  
+  @Test
+  public void uniqValStreamingEmpiricalEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.83960",
+                     "32.05370",
+                     "96.10962",
+                     "28.72388",
+                     "96.65888",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135 28.72388  32.0537  38.6101  38.8396 62.28313 96.10962 96.65888 97.10575 98.94791 
+     * 1        1        1        1        1        1        1        1        1        1 
+     * > count=c(1,1,1,1,1,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 2.302585
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(2.302585);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void singleValStreamingEmpiricalEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791",
+                     "98.94791");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791) 
+     * > table(v)
+     * v
+     * 98.94791 
+     * 10 
+     * > count=(10)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void dupValStreamingEmpiricalEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.834372
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(1.834372);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void emptyInputBagStreamingEmpiricalEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input"
+                     );
+
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c() 
+     * > table(v)
+     * < table of extent 0 > 
+     * > count=c()
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0 
+     * Note: unlike R, this test expects an empty result for an empty input.
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void singleElemInputBagStreamingEmpiricalEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(entropy);
+    
+    writeLinesToFile("input",
+                     "98.94791");
+
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > count=c(1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 0
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.0);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type', '$base');
+  
+  data = load 'input' as (x:chararray, y:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY x,y;
+                     GENERATE Entropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String pairEntropy;
+ 
+  @Test
+  public void dupPairValStreamingEmpiricalEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(pairEntropy, "type=empirical", "base=log");
+    
+    writeLinesToFile("input",
+                     "hadoop	98.94791",
+                     "bigdata	38.61010",
+                     "hadoop	97.10575",
+                     "datafu	32.05370",
+                     "bigdata	38.61010",
+                     "datafu	32.05370",
+                     "datafu	32.05370",
+                     "hadoop	38.61010",
+                     "pig	96.10962",
+                     "pig	20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
+     * > t <- t[order(x,y)]
+     * > count<-c(2,3,1,1,1,1,1)
+     * > library(entropy)
+     * > entropy(count)
+     * [1] 1.834372 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(1.834372);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type', '$base');
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     data_ordered = ORDER data BY val;
+                     GENERATE Entropy(data_ordered);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String logEntropy;
+ 
+  @Test
+  public void dupValStreamingEmpiricalEntropoyLog2Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(logEntropy, "type=empirical", "base=log2");
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > freqs=count/sum(count)
+     * > library(entropy)
+     * > entropy(count, freqs, c("ML"), c("log2")) 
+     * [1] 2.646439 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(2.646439);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  @Test
+  public void dupValStreamingEmpiricalEntropoyLog10Test() throws Exception
+  {
+    PigTest test = createPigTestFromString(logEntropy, "type=empirical", "base=log10");
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "97.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+        
+    test.runScript();
+    
+    /* Add expected values, computed using R:
+     * 
+     * e.g.
+     * 
+     * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135) 
+     * > table(v)
+     * v
+     * 20.41135  32.0537  38.6101 62.28313 96.10962 97.10575 98.94791 
+     * 1        1        3        1        2        1        1 
+     * > count=c(1,1,3,1,2,1,1)
+     * > freqs=count/sum(count)
+     * > library(entropy)
+     * > entropy(count, freqs, c("ML"), c("log10")) 
+     * [1] 0.7966576 
+     * 
+     */
+    List<Double> expectedOutput = new ArrayList<Double>();
+    expectedOutput.add(0.7966576);
+    
+    List<Tuple> output = this.getLinesForAlias(test, "data_out");
+    verifyEqualEntropyOutput(expectedOutput, output, 5);
+  }
+
+  /**
+  register $JAR_PATH
+
+  define Entropy datafu.pig.stats.entropy.StreamingEntropy();
+  
+  data = load 'input' as (val:double);
+  --describe data;
+  data_grouped = GROUP data ALL;
+  data_out = FOREACH data_grouped {
+                     GENERATE Entropy(data);
+             };
+  store data_out into 'output';
+   */
+  @Multiline private String noOrderEntropy;
+ 
+  @Test
+  public void noOrderStreamingEmpiricalEntropoyTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(noOrderEntropy);
+    
+    writeLinesToFile("input",
+                     "98.94791",
+                     "38.61010",
+                     "38.61010",
+                     "37.10575",
+                     "62.28313",
+                     "38.61010",
+                     "32.05370",
+                     "96.10962",
+                     "38.61010",
+                     "96.10962",
+                     "20.41135");
+
+    try {
+         test.runScript();
+         List<Tuple> output = this.getLinesForAlias(test, "data_out");
+         fail( "Testcase should fail");
+    } catch(Exception ex) {
+    }
+  }
+}


Mime
View raw message