Repository: incubator-datafu
Updated Branches:
refs/heads/master b2134e660 -> 4aa2ef2a4
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/CondEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/CondEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/CondEntropyTests.java
new file mode 100644
index 0000000..c6d62df
--- /dev/null
+++ b/test/pig/datafu/test/pig/stats/entropy/CondEntropyTests.java
@@ -0,0 +1,412 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+
+/*
+ * Use the R function condentropy (infotheo package) to compute conditional entropy as the test benchmark
+ * http://cran.r-project.org/web/packages/infotheo/infotheo.pdf
+ */
+public class CondEntropyTests extends AbstractEntropyTests
+{
+ /**
+ register $JAR_PATH
+
+ define CondEntropy datafu.pig.stats.entropy.CondEntropy();
+
+ data = load 'input' as (valX:double, valY:chararray);
+ --describe data;
+ data_grouped = GROUP data ALL;
+ data_out = FOREACH data_grouped {
+ data_ordered = ORDER data BY *;
+ GENERATE CondEntropy(data_ordered);
+ };
+ store data_out into 'output';
+ */
+ @Multiline private String condEntropy;
+
+ @Test
+ public void uniqValEmpiricalCondEntropoyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(condEntropy);
+
+ writeLinesToFile("input",
+ "98.94791 click",
+ "38.61010 view",
+ "97.10575 view",
+ "62.28313 click",
+ "38.83960 click",
+ "32.05370 view",
+ "96.10962 view",
+ "28.72388 click",
+ "96.65888 view",
+ "20.41135 click");
+
+ test.runScript();
+
+ /*
+ * library(infotheo)
+ * X=c("98.94791","38.61010","97.10575","62.28313","38.83960","32.05370","96.10962","28.72388","96.65888","20.41135")
+ * Y=c("click","view","view","click","click","view","view","click","view","click")
+ * condentropy(Y,X)
+ * [1] 0
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(0.0);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ @Test
+ public void singleValEmpiricalCondEntropoyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(condEntropy);
+
+ writeLinesToFile("input",
+ "98.94791 click",
+ "98.94791 click",
+ "98.94791 click",
+ "98.94791 click",
+ "98.94791 click",
+ "98.94791 click",
+ "98.94791 click",
+ "98.94791 click",
+ "98.94791 click",
+ "98.94791 click");
+
+ test.runScript();
+
+ /*
+ * library(infotheo)
+ * X=c("98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791")
+ * Y=c("click","click","click","click","click","click","click","click","click","click")
+ * condentropy(Y,X)
+ * [1] 0
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(0.0);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ @Test
+ public void dupValEmpiricalCondEntropoyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(condEntropy);
+
+ writeLinesToFile("input",
+ "98.94791 click",
+ "38.61010 click",
+ "97.10575 view",
+ "62.28313 view",
+ "38.61010 view",
+ "32.05370 view",
+ "96.10962 click",
+ "38.61010 click",
+ "96.10962 view",
+ "20.41135 click");
+
+ test.runScript();
+
+ /*
+ * library(infotheo)
+ * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+ * Y=c("click","click","view","view","view","view","click","click","view","click")
+ * condentropy(Y,X)
+ * [1] 0.3295837
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(0.3295837);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ @Test
+ public void emptyInputBagEmpiricalCondEntropoyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(condEntropy);
+
+ writeLinesToFile("input"
+ );
+
+ test.runScript();
+
+ List<Double> expectedOutput = new ArrayList<Double>();
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ @Test
+ public void singleElemInputBagEmpiricalCondEntropoyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(condEntropy);
+
+ writeLinesToFile("input",
+ "98.94791 view");
+
+ test.runScript();
+ /*
+ * library(infotheo)
+ * X = c("98.94791")
+ * Y = c("view")
+ * condentropy(Y,X)
+ * [1] 0
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(0.0);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ /**
+ register $JAR_PATH
+
+ define Entropy datafu.pig.stats.entropy.CondEntropy();
+
+ data = load 'input' as (valX1:chararray, valX2:chararray, valY:chararray);
+ data = foreach data generate (valX1, valX2) as X, valY as Y;
+ --describe data;
+ data_grouped = GROUP data ALL;
+ data_out = FOREACH data_grouped {
+ data_ordered = ORDER data BY *;
+ GENERATE Entropy(data_ordered);
+ };
+
+ store data_out into 'output';
+ */
+ @Multiline private String pairCondEntropy;
+
+ @Test
+ public void dupPairValEmpiricalCondEntropoyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(pairCondEntropy);
+
+ writeLinesToFile("input",
+ "hadoop bigdata click",
+ "hadoop pig view",
+ "hadoop datafu click",
+ "datafu pig click",
+ "bigdata pig view",
+ "datafu pig click",
+ "datafu pig view",
+ "hadoop bigdata view",
+ "pig datafu view",
+ "pig datafu view");
+
+ test.runScript();
+
+ /*
+ * library(infotheo)
+ * X=c("hadoop bigdata","hadoop pig","hadoop datafu","datafu pig","bigdata pig","datafu pig","datafu pig","hadoop bigdata","pig datafu","pig datafu")
+ * Y=c("click","view","click","click","view","click","view","view","view","view")
+ * condentropy(Y,X)
+ * [1] 0.3295837
+ */
+
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(0.3295837);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ /**
+ register $JAR_PATH
+
+ define CondEntropy datafu.pig.stats.entropy.CondEntropy('$type','$base');
+
+ data = load 'input' as (valX:double, valY:chararray);
+ --describe data;
+ data_grouped = GROUP data ALL;
+ data_out = FOREACH data_grouped {
+ data_ordered = ORDER data BY *;
+ GENERATE CondEntropy(data_ordered);
+ };
+ store data_out into 'output';
+ */
+ @Multiline private String condLogEntropy;
+
+ @Test
+ public void dupValEmpiricalCondEntropoyLog2Test() throws Exception
+ {
+ PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log2");
+
+ writeLinesToFile("input",
+ "98.94791 click",
+ "38.61010 click",
+ "97.10575 view",
+ "62.28313 view",
+ "38.61010 view",
+ "32.05370 view",
+ "96.10962 click",
+ "38.61010 click",
+ "96.10962 view",
+ "20.41135 click");
+
+ test.runScript();
+
+ /*
+ * library(infotheo)
+ * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+ * Y=c("click","click","view","view","view","view","click","click","view","click")
+ * condentropy(Y,X)/log(2)
+ * [1] 0.4754888
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(0.4754888);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ @Test
+ public void dupValEmpiricalCondEntropoyLog10Test() throws Exception
+ {
+ PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log10");
+
+ writeLinesToFile("input",
+ "98.94791 click",
+ "38.61010 click",
+ "97.10575 view",
+ "62.28313 view",
+ "38.61010 view",
+ "32.05370 view",
+ "96.10962 click",
+ "38.61010 click",
+ "96.10962 view",
+ "20.41135 click");
+
+ test.runScript();
+
+ /*
+ * library(infotheo)
+ * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
+ * Y=c("click","click","view","view","view","view","click","click","view","click")
+ * condentropy(Y,X)/log(10)
+ * [1] 0.1431364
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(0.1431364);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ /**
+ register $JAR_PATH
+
+ define CondEntropy datafu.pig.stats.entropy.CondEntropy();
+
+ data = load 'input' as (valX:double, valY:chararray);
+ --describe data;
+ data_grouped = GROUP data ALL;
+ data_out = FOREACH data_grouped {
+ GENERATE CondEntropy(data);
+ };
+ store data_out into 'output';
+ */
+ @Multiline private String noOrderCondEntropy;
+
+ @Test
+ public void noOrderEmpiricalCondEntropoyTest() throws Exception
+ {
+ // noOrderCondEntropy hands the bag to CondEntropy without an ORDER step; running it is expected to throw.
+ PigTest test = createPigTestFromString(noOrderCondEntropy);
+ writeLinesToFile("input",
+ "98.94791 click",
+ "38.61010 view",
+ "97.10575 view",
+ "62.28313 click",
+ "38.83960 click",
+ "32.05370 view",
+ "96.10962 view",
+ "28.72388 click",
+ "96.65888 view",
+ "20.41135 click");
+
+ try {
+ test.runScript();
+ this.getLinesForAlias(test, "data_out"); // result unused; evaluated only so that it can throw
+ fail( "Testcase should fail");
+ } catch (Exception expected) { /* any Exception is the pass condition; fail() raises an AssertionError, which is not caught here */ }
+ }
+
+ /**
+ register $JAR_PATH
+
+ define CondEntropy datafu.pig.stats.entropy.CondEntropy();
+
+ data = load 'input' as (valX:double);
+ --describe data;
+ data_grouped = GROUP data ALL;
+ data_out = FOREACH data_grouped {
+ data_ordered = ORDER data BY *;
+ GENERATE CondEntropy(data_ordered);
+ };
+ store data_out into 'output';
+ */
+ @Multiline private String invalidInputCondEntropy;
+
+ @Test
+ public void invalidInputEmpiricalCondEntropoyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(invalidInputCondEntropy);
+
+ writeLinesToFile("input",
+ "98.94791",
+ "38.61010",
+ "97.10575",
+ "62.28313",
+ "38.83960",
+ "32.05370",
+ "96.10962",
+ "28.72388",
+ "96.65888",
+ "20.41135");
+
+ try {
+ test.runScript();
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ fail( "Testcase should fail");
+ } catch(Exception ex) {
+ assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 2") >= 0);
+ }
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java
new file mode 100644
index 0000000..52fdcce
--- /dev/null
+++ b/test/pig/datafu/test/pig/stats/entropy/EmpiricalCountEntropyTests.java
@@ -0,0 +1,585 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package datafu.test.pig.stats.entropy;
+
+import static org.testng.Assert.*;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.adrianwalker.multilinestring.Multiline;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.testng.annotations.Test;
+
+import datafu.test.pig.PigTests;
+
+/*
+ * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
+ * used as our test benchmark
+ */
+public class EmpiricalCountEntropyTests extends AbstractEntropyTests
+{
+ /**
+ register $JAR_PATH
+
+ define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+
+ data = load 'input' as (val:double);
+ --describe data;
+ data_grouped = GROUP data BY val;
+ data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+ data_cnt_grouped = GROUP data_cnt ALL;
+ data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+ store data_out into 'output';
+ */
+ @Multiline private String entropy;
+
+ @Test
+ public void uniqValEntropyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(entropy);
+
+ writeLinesToFile("input",
+ "98.94791",
+ "38.61010",
+ "97.10575",
+ "62.28313",
+ "38.83960",
+ "32.05370",
+ "96.10962",
+ "28.72388",
+ "96.65888",
+ "20.41135");
+
+ test.runScript();
+
+ /* Add expected values, computed using R:
+ *
+ * e.g.
+ *
+ * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135)
+ * > table(v)
+ * v
+ * 20.41135 28.72388 32.0537 38.6101 38.8396 62.28313 96.10962 96.65888 97.10575 98.94791
+ * 1 1 1 1 1 1 1 1 1 1
+ * > count=c(1,1,1,1,1,1,1,1,1,1)
+ * > library(entropy)
+ * > entropy(count)
+ * [1] 2.302585
+ *
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(2.302585);
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ @Test
+ public void singleValEntropyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(entropy);
+
+ writeLinesToFile("input",
+ "98.94791",
+ "98.94791",
+ "98.94791",
+ "98.94791",
+ "98.94791",
+ "98.94791",
+ "98.94791",
+ "98.94791",
+ "98.94791",
+ "98.94791");
+
+ test.runScript();
+
+ /* Add expected values, computed using R:
+ *
+ * e.g.
+ *
+ * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791)
+ * > table(v)
+ * v
+ * 98.94791
+ * 10
+ * > count=(10)
+ * > library(entropy)
+ * > entropy(count)
+ * [1] 0
+ *
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(0.0);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ @Test
+ public void dupValEntropyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(entropy);
+
+ writeLinesToFile("input",
+ "98.94791",
+ "38.61010",
+ "97.10575",
+ "62.28313",
+ "38.61010",
+ "32.05370",
+ "96.10962",
+ "38.61010",
+ "96.10962",
+ "20.41135");
+
+ test.runScript();
+
+ /* Add expected values, computed using R:
+ *
+ * e.g.
+ *
+ * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+ * > table(v)
+ * v
+ * 20.41135 32.0537 38.6101 62.28313 96.10962 97.10575 98.94791
+ * 1 1 3 1 2 1 1
+ * > count=c(1,1,3,1,2,1,1)
+ * > library(entropy)
+ * > entropy(count)
+ * [1] 1.834372
+ *
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(1.834372);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ @Test
+ public void emptyInputBagEntropyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(entropy);
+
+ writeLinesToFile("input"
+ );
+
+ test.runScript();
+
+ /* Add expected values, computed using R:
+ *
+ * e.g.
+ *
+ * > v=c()
+ * > table(v)
+ * < table of extent 0 >
+ * > count=c()
+ * > library(entropy)
+ * > entropy(count)
+ * [1] 0
+ *
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ @Test
+ public void singleElemInputBagEntropyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(entropy);
+
+ writeLinesToFile("input",
+ "98.94791");
+
+ test.runScript();
+
+ /* Add expected values, computed using R:
+ *
+ * e.g.
+ *
+ * > count=c(1)
+ * > library(entropy)
+ * > entropy(count)
+ * [1] 0
+ *
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(0.0);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ /**
+ register $JAR_PATH
+
+ define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy('$base');
+
+ data = load 'input' as (x:chararray, y:double);
+ --describe data;
+ data_grouped = GROUP data BY (x, y);
+ data_cnt = FOREACH data_grouped GENERATE COUNT(data);
+ data_cnt_grouped = GROUP data_cnt ALL;
+ data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+ store data_out into 'output';
+ */
+ @Multiline private String pairLogEntropy;
+
+ @Test
+ public void dupPairValEntropyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(pairLogEntropy, "base=log");
+
+ writeLinesToFile("input",
+ "hadoop 98.94791",
+ "bigdata 38.61010",
+ "hadoop 97.10575",
+ "datafu 32.05370",
+ "bigdata 38.61010",
+ "datafu 32.05370",
+ "datafu 32.05370",
+ "hadoop 38.61010",
+ "pig 96.10962",
+ "pig 20.41135");
+
+ test.runScript();
+
+ /* Add expected values, computed using R:
+ *
+ * e.g.
+ * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
+ * > t <- t[order(x,y)]
+ * > count<-c(2,3,1,1,1,1,1)
+ * > library(entropy)
+ * > entropy(count)
+ * [1] 1.834372
+ *
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(1.834372);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ /**
+ register $JAR_PATH
+
+ define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy('$base');
+
+ data = load 'input' as (val:double);
+ --describe data;
+ data_grouped = GROUP data BY val;
+ data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+ data_cnt_grouped = GROUP data_cnt ALL;
+ data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+ store data_out into 'output';
+ */
+ @Multiline private String logEntropy;
+
+ @Test
+ public void dupValEntropyLog2Test() throws Exception
+ {
+ PigTest test = createPigTestFromString(logEntropy, "base=log2");
+
+ writeLinesToFile("input",
+ "98.94791",
+ "38.61010",
+ "97.10575",
+ "62.28313",
+ "38.61010",
+ "32.05370",
+ "96.10962",
+ "38.61010",
+ "96.10962",
+ "20.41135");
+
+ test.runScript();
+
+ /* Add expected values, computed using R:
+ *
+ * e.g.
+ *
+ * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+ * > table(v)
+ * v
+ * 20.41135 32.0537 38.6101 62.28313 96.10962 97.10575 98.94791
+ * 1 1 3 1 2 1 1
+ * > count=c(1,1,3,1,2,1,1)
+ * > library(entropy)
+ * > entropy(count, count/sum(count), c("ML"),c("log2"))
+ * [1] 2.646439
+ *
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(2.646439);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ @Test
+ public void dupValEntropyLog10Test() throws Exception
+ {
+ PigTest test = createPigTestFromString(logEntropy, "base=log10");
+
+ writeLinesToFile("input",
+ "98.94791",
+ "38.61010",
+ "97.10575",
+ "62.28313",
+ "38.61010",
+ "32.05370",
+ "96.10962",
+ "38.61010",
+ "96.10962",
+ "20.41135");
+
+ test.runScript();
+
+ /* Add expected values, computed using R:
+ *
+ * e.g.
+ *
+ * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
+ * > table(v)
+ * v
+ * 20.41135 32.0537 38.6101 62.28313 96.10962 97.10575 98.94791
+ * 1 1 3 1 2 1 1
+ * > count=c(1,1,3,1,2,1,1)
+ * > library(entropy)
+ * > entropy(count, count/sum(count), c("ML"),c("log10"))
+ * [1] 0.7966576
+ *
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(0.7966576);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+
+ /**
+ register $JAR_PATH
+
+ define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+
+ data_cnt = load 'input' as (val:int);
+ --describe data_cnt;
+ data_cnt_grouped = GROUP data_cnt ALL;
+ data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+ store data_out into 'output';
+ */
+ @Multiline private String rawValidInputEntropy;
+
+ @Test
+ public void rawValidInputEntropyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(rawValidInputEntropy);
+
+ writeLinesToFile("input",
+ "0",
+ "38",
+ "0",
+ "62",
+ "38",
+ "32",
+ "96",
+ "38",
+ "96",
+ "0");
+
+ test.runScript();
+
+ /* Add expected values, computed using R:
+ *
+ * e.g.
+ *
+ * > count=c(0, 38, 0, 62, 38, 32, 96, 38, 96, 0)
+ * > library(entropy)
+ * > entropy(count)
+ * [1] 1.846901
+ *
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(1.846901);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ /**
+ register $JAR_PATH
+
+ define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+
+ data_cnt = load 'input' as (val:double);
+ --describe data_cnt;
+ data_cnt_grouped = GROUP data_cnt ALL;
+ data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+ store data_out into 'output';
+ */
+ @Multiline private String rawInvalidTypeInputEntropy;
+
+ @Test
+ public void rawInvalidTypeInputEntropyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(rawInvalidTypeInputEntropy);
+
+ writeLinesToFile("input",
+ "0.0",
+ "38.0",
+ "0.0",
+ "62.0",
+ "38.0",
+ "32.001",
+ "96.002",
+ "38.01",
+ "96.00001",
+ "0.0");
+ try {
+ test.runScript();
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ fail( "Testcase should fail");
+ } catch (Exception ex) {
+ assertTrue(ex.getMessage().indexOf("Expect the type of the input tuple to be of ([int, long]), but instead found double") >= 0);
+ }
+ }
+
+ @Test
+ public void rawInValidInputValueEntropyTest() throws Exception
+ {
+ // Same script as rawValidInputEntropyTest, but one of the input counts is negative (-38).
+ PigTest test = createPigTestFromString(rawValidInputEntropy);
+ writeLinesToFile("input",
+ "0",
+ "-38",
+ "0",
+ "62",
+ "38",
+ "32",
+ "96",
+ "38",
+ "96",
+ "0");
+
+ // Run the script explicitly, as every other test in this class does, before reading data_out.
+ test.runScript();
+
+ /* Expected value computed using R, with non-positive counts replaced by 0:
+ *
+ * > count=c(0, -38, 0, 62, 38, 32, 96, 38, 96, 0)
+ * > library(entropy)
+ * > entropy(ifelse(count>0,count,0))
+ * [1] 1.693862
+ */
+ List<Double> expectedOutput = new ArrayList<Double>();
+ expectedOutput.add(1.693862);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+
+ /**
+ register $JAR_PATH
+
+ define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+
+ data_cnt = load 'input' as (f1:chararray, f2:chararray);
+ --describe data_cnt;
+ data_cnt_grouped = GROUP data_cnt ALL;
+ data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+ store data_out into 'output';
+ */
+ @Multiline private String invalidInputSchemaEntropy;
+
+ @Test
+ public void invalidInputSchemaEntropyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(invalidInputSchemaEntropy);
+
+ writeLinesToFile("input",
+ "hadoop 98.94791",
+ "bigdata 38.61010",
+ "hadoop 97.10575",
+ "datafu 32.05370",
+ "bigdata 38.61010",
+ "datafu 32.05370",
+ "datafu 32.05370");
+
+ try {
+ test.runScript();
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ fail( "Testcase should fail");
+ } catch (Exception ex) {
+ assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 1") >= 0);
+ }
+ }
+
+ /**
+ register $JAR_PATH
+
+ define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
+
+ data = load 'input' as (val:double);
+ --describe data;
+ data_grouped = GROUP data BY val;
+ data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
+ data_cnt_grouped = GROUP data_cnt ALL;
+ data_out = FOREACH data_cnt_grouped {
+ data_cnt_ordered = order data_cnt by *;
+ GENERATE Entropy(data_cnt_ordered);
+ }
+ store data_out into 'output';
+ */
+ @Multiline private String accumulatedEntropy;
+
+ @Test
+ public void accumulatedEntropyTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(accumulatedEntropy);
+
+ writeLinesToFile("input",
+ "98.94791",
+ "38.61010",
+ "97.10575",
+ "62.28313",
+ "38.61010",
+ "32.05370",
+ "96.10962",
+ "38.61010",
+ "96.10962",
+ "20.41135");
+
+ test.runScript();
+
+ List<Double> expectedOutput = new ArrayList<Double>();
+ //the same output as @test dupValEntropyTest
+ expectedOutput.add(1.834372);
+
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java
index 8ef94c3..e1611a7 100644
--- a/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java
+++ b/test/pig/datafu/test/pig/stats/entropy/EntropyTests.java
@@ -45,16 +45,17 @@ public class EntropyTests extends AbstractEntropyTests
data = load 'input' as (val:double);
--describe data;
- data_grouped = GROUP data BY val;
- data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
- data_cnt_grouped = GROUP data_cnt ALL;
- data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+ data_grouped = GROUP data ALL;
+ data_out = FOREACH data_grouped {
+ data_ordered = ORDER data BY val;
+ GENERATE Entropy(data_ordered);
+ };
store data_out into 'output';
*/
@Multiline private String entropy;
@Test
- public void uniqValEntropyTest() throws Exception
+ public void uniqValEmpiricalEntropoyTest() throws Exception
{
PigTest test = createPigTestFromString(entropy);
@@ -89,12 +90,13 @@ public class EntropyTests extends AbstractEntropyTests
*/
List<Double> expectedOutput = new ArrayList<Double>();
expectedOutput.add(2.302585);
+
List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
}
@Test
- public void singleValEntropyTest() throws Exception
+ public void singleValEmpiricalEntropoyTest() throws Exception
{
PigTest test = createPigTestFromString(entropy);
@@ -131,11 +133,11 @@ public class EntropyTests extends AbstractEntropyTests
expectedOutput.add(0.0);
List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
}
@Test
- public void dupValEntropyTest() throws Exception
+ public void dupValEmpiricalEntropoyTest() throws Exception
{
PigTest test = createPigTestFromString(entropy);
@@ -172,11 +174,11 @@ public class EntropyTests extends AbstractEntropyTests
expectedOutput.add(1.834372);
List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
}
@Test
- public void emptyInputBagEntropyTest() throws Exception
+ public void emptyInputBagEmpiricalEntropoyTest() throws Exception
{
PigTest test = createPigTestFromString(entropy);
@@ -201,11 +203,11 @@ public class EntropyTests extends AbstractEntropyTests
List<Double> expectedOutput = new ArrayList<Double>();
List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
}
@Test
- public void singleElemInputBagEntropyTest() throws Exception
+ public void singleElemInputBagEmpiricalEntropoyTest() throws Exception
{
PigTest test = createPigTestFromString(entropy);
@@ -228,28 +230,29 @@ public class EntropyTests extends AbstractEntropyTests
expectedOutput.add(0.0);
List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
}
/**
register $JAR_PATH
- define Entropy datafu.pig.stats.entropy.Entropy('$base');
-
+ define Entropy datafu.pig.stats.entropy.Entropy('$type', '$base');
+
data = load 'input' as (x:chararray, y:double);
--describe data;
- data_grouped = GROUP data BY (x, y);
- data_cnt = FOREACH data_grouped GENERATE COUNT(data);
- data_cnt_grouped = GROUP data_cnt ALL;
- data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+ data_grouped = GROUP data ALL;
+ data_out = FOREACH data_grouped {
+ data_ordered = ORDER data BY x,y;
+ GENERATE Entropy(data_ordered);
+ };
store data_out into 'output';
*/
- @Multiline private String pairLogEntropy;
+ @Multiline private String pairEntropy;
@Test
- public void dupPairValEntropyTest() throws Exception
+ public void dupPairValEmpiricalEntropoyTest() throws Exception
{
- PigTest test = createPigTestFromString(pairLogEntropy, "base=log");
+ PigTest test = createPigTestFromString(pairEntropy, "type=empirical", "base=log");
writeLinesToFile("input",
"hadoop 98.94791",
@@ -280,28 +283,29 @@ public class EntropyTests extends AbstractEntropyTests
expectedOutput.add(1.834372);
List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
}
/**
register $JAR_PATH
- define Entropy datafu.pig.stats.entropy.Entropy('$base');
+ define Entropy datafu.pig.stats.entropy.Entropy('$type', '$base');
data = load 'input' as (val:double);
--describe data;
- data_grouped = GROUP data BY val;
- data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
- data_cnt_grouped = GROUP data_cnt ALL;
- data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
+ data_grouped = GROUP data ALL;
+ data_out = FOREACH data_grouped {
+ data_ordered = ORDER data BY val;
+ GENERATE Entropy(data_ordered);
+ };
store data_out into 'output';
*/
@Multiline private String logEntropy;
@Test
- public void dupValEntropyLog2Test() throws Exception
+ public void dupValEmpiricalEntropoyLog2Test() throws Exception
{
- PigTest test = createPigTestFromString(logEntropy, "base=log2");
+ PigTest test = createPigTestFromString(logEntropy, "type=empirical", "base=log2");
writeLinesToFile("input",
"98.94791",
@@ -324,25 +328,26 @@ public class EntropyTests extends AbstractEntropyTests
* > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
* > table(v)
* v
- * 20.41135 32.0537 38.6101 62.28313 96.10962 97.10575 98.94791
+ * 20.41135 32.0537 38.6101 62.28313 96.10962 97.10575 98.94791
* 1 1 3 1 2 1 1
* > count=c(1,1,3,1,2,1,1)
+ * > freqs=count/sum(count)
* > library(entropy)
- * > entropy(count, count/sum(count), c("ML"),c("log2"))
- * [1] 2.646439
+ * > entropy(count, freqs, c("ML"), c("log2"))
+ * [1] 2.646439
*
*/
List<Double> expectedOutput = new ArrayList<Double>();
expectedOutput.add(2.646439);
List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
}
@Test
- public void dupValEntropyLog10Test() throws Exception
+ public void dupValEmpiricalEntropoyLog10Test() throws Exception
{
- PigTest test = createPigTestFromString(logEntropy, "base=log10");
+ PigTest test = createPigTestFromString(logEntropy, "type=empirical", "base=log10");
writeLinesToFile("input",
"98.94791",
@@ -365,177 +370,20 @@ public class EntropyTests extends AbstractEntropyTests
* > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
* > table(v)
* v
- * 20.41135 32.0537 38.6101 62.28313 96.10962 97.10575 98.94791
+ * 20.41135 32.0537 38.6101 62.28313 96.10962 97.10575 98.94791
* 1 1 3 1 2 1 1
* > count=c(1,1,3,1,2,1,1)
+ * > freqs=count/sum(count)
* > library(entropy)
- * > entropy(count, count/sum(count), c("ML"),c("log10"))
- * [1] 0.7966576
+ * > entropy(count, freqs, c("ML"), c("log10"))
+ * [1] 0.7966576
*
*/
List<Double> expectedOutput = new ArrayList<Double>();
expectedOutput.add(0.7966576);
List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
-
- /**
- register $JAR_PATH
-
- define Entropy datafu.pig.stats.entropy.Entropy();
-
- data_cnt = load 'input' as (val:int);
- --describe data_cnt;
- data_cnt_grouped = GROUP data_cnt ALL;
- data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
- store data_out into 'output';
- */
- @Multiline private String rawValidInputEntropy;
-
- @Test
- public void rawValidInputEntropyTest() throws Exception
- {
- PigTest test = createPigTestFromString(rawValidInputEntropy);
-
- writeLinesToFile("input",
- "0",
- "38",
- "0",
- "62",
- "38",
- "32",
- "96",
- "38",
- "96",
- "0");
-
- test.runScript();
-
- /* Add expected values, computed using R:
- *
- * e.g.
- *
- * > count=c(0, 38, 0, 62, 38, 32, 96, 38, 96, 0)
- * > library(entropy)
- * > entropy(count)
- * [1] 1.846901
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(1.846901);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- /**
- register $JAR_PATH
-
- define Entropy datafu.pig.stats.entropy.Entropy();
-
- data_cnt = load 'input' as (val:double);
- --describe data_cnt;
- data_cnt_grouped = GROUP data_cnt ALL;
- data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
- store data_out into 'output';
- */
- @Multiline private String rawInvalidTypeInputEntropy;
-
- @Test
- public void rawInvalidTypeInputEntropyTest() throws Exception
- {
- PigTest test = createPigTestFromString(rawInvalidTypeInputEntropy);
-
- writeLinesToFile("input",
- "0.0",
- "38.0",
- "0.0",
- "62.0",
- "38.0",
- "32.001",
- "96.002",
- "38.01",
- "96.00001",
- "0.0");
- try {
- test.runScript();
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- fail( "Testcase should fail");
- } catch (Exception ex) {
- assertTrue(ex.getMessage().indexOf("Expect the type of the input tuple to be of ([int, long]), but instead found double") >= 0);
- }
- }
-
- @Test
- public void rawInValidInputValueEntropyTest() throws Exception
- {
- PigTest test = createPigTestFromString(rawValidInputEntropy);
-
- writeLinesToFile("input",
- "0",
- "-38",
- "0",
- "62",
- "38",
- "32",
- "96",
- "38",
- "96",
- "0");
- /* Add expected values, computed using R:
- *
- * e.g.
- *
- * > count=c(0, -38, 0, 62, 38, 32, 96, 38, 96, 0)
- * > library(entropy)
- * > entropy(ifelse(count>0,count,0))
- * [1] 1.693862
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(1.693862);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
-
- }
-
- /**
- register $JAR_PATH
-
- define Entropy datafu.pig.stats.entropy.Entropy();
-
- data_cnt = load 'input' as (f1:chararray, f2:chararray);
- --describe data_cnt;
- data_cnt_grouped = GROUP data_cnt ALL;
- data_out = FOREACH data_cnt_grouped GENERATE Entropy(data_cnt);
- store data_out into 'output';
- */
- @Multiline private String invalidInputSchemaEntropy;
-
- @Test
- public void invalidInputSchemaEntropyTest() throws Exception
- {
- PigTest test = createPigTestFromString(invalidInputSchemaEntropy);
-
- writeLinesToFile("input",
- "hadoop 98.94791",
- "bigdata 38.61010",
- "hadoop 97.10575",
- "datafu 32.05370",
- "bigdata 38.61010",
- "datafu 32.05370",
- "datafu 32.05370");
-
- try {
- test.runScript();
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- fail( "Testcase should fail");
- } catch (Exception ex) {
- assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 1") >= 0);
- }
+ verifyEqualEntropyOutput(expectedOutput, output, 5);
}
/**
@@ -545,26 +393,24 @@ public class EntropyTests extends AbstractEntropyTests
data = load 'input' as (val:double);
--describe data;
- data_grouped = GROUP data BY val;
- data_cnt = FOREACH data_grouped GENERATE COUNT(data) AS cnt;
- data_cnt_grouped = GROUP data_cnt ALL;
- data_out = FOREACH data_cnt_grouped {
- data_cnt_ordered = order data_cnt by *;
- GENERATE Entropy(data_cnt_ordered);
- }
+ data_grouped = GROUP data ALL;
+ data_out = FOREACH data_grouped {
+ GENERATE Entropy(data);
+ };
store data_out into 'output';
*/
- @Multiline private String accumulatedEntropy;
-
+ @Multiline private String noOrderEntropy;
+
@Test
- public void accumulatedEntropyTest() throws Exception
+ public void noOrderEmpiricalEntropoyTest() throws Exception
{
- PigTest test = createPigTestFromString(accumulatedEntropy);
+ PigTest test = createPigTestFromString(noOrderEntropy);
writeLinesToFile("input",
"98.94791",
"38.61010",
- "97.10575",
+ "38.61010",
+ "37.10575",
"62.28313",
"38.61010",
"32.05370",
@@ -572,14 +418,12 @@ public class EntropyTests extends AbstractEntropyTests
"38.61010",
"96.10962",
"20.41135");
-
- test.runScript();
-
- List<Double> expectedOutput = new ArrayList<Double>();
- //the same output as @test dupValEntropyTest
- expectedOutput.add(1.834372);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
+
+ try {
+ test.runScript();
+ List<Tuple> output = this.getLinesForAlias(test, "data_out");
+ fail( "Testcase should fail");
+ } catch(Exception ex) {
+ }
}
}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java
deleted file mode 100644
index 77a8e8b..0000000
--- a/test/pig/datafu/test/pig/stats/entropy/StreamingChaoShenEntropyTests.java
+++ /dev/null
@@ -1,373 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.stats.entropy;
-
-import static org.testng.Assert.*;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import datafu.test.pig.PigTests;
-
-/*
- * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
- * used as our test benchmark
- */
-public class StreamingChaoShenEntropyTests extends AbstractEntropyTests
-{
- /**
- register $JAR_PATH
-
- define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type','$base');
-
- data = load 'input' as (val:double);
- --describe data;
- data_grouped = GROUP data ALL;
- data_out = FOREACH data_grouped {
- data_ordered = ORDER data BY val;
- GENERATE Entropy(data_ordered);
- };
- store data_out into 'output';
- */
- @Multiline private String entropy;
-
- @Test
- public void uniqValStreamingChaoShenEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
-
- writeLinesToFile("input",
- "98.94791",
- "38.61010",
- "97.10575",
- "62.28313",
- "38.83960",
- "32.05370",
- "96.10962",
- "28.72388",
- "96.65888",
- "20.41135");
-
- test.runScript();
-
- /* Add expected values, computed using R:
- *
- * e.g.
- *
- * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135)
- * > table(v)
- * v
- * 20.41135 28.72388 32.0537 38.6101 38.8396 62.28313 96.10962 96.65888 97.10575 98.94791
- * 1 1 1 1 1 1 1 1 1 1
- * > count=c(1,1,1,1,1,1,1,1,1,1)
- * > library(entropy)
- * > entropy(count,count/sum(count),c("CS"),c("log"))
- * [1] 4.816221
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(4.816221);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- @Test
- public void singleValStreamingChaoShenEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
-
- writeLinesToFile("input",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791");
-
- test.runScript();
-
- /* Add expected values, computed using R:
- *
- * e.g.
- *
- * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791)
- * > table(v)
- * v
- * 98.94791
- * 10
- * > count=(10)
- * > entropy(count,count/sum(count),c("CS"),c("log"))
- * [1] 0
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.0);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- @Test
- public void dupValStreamingChaoShenEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(entropy,"type=chaosh", "base=log");
-
- writeLinesToFile("input",
- "98.94791",
- "38.61010",
- "97.10575",
- "62.28313",
- "38.61010",
- "32.05370",
- "96.10962",
- "38.61010",
- "96.10962",
- "20.41135");
-
- test.runScript();
-
- /* Add expected values, computed using R:
- *
- * e.g.
- *
- * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
- * > table(v)
- * v
- * 20.41135 28.72388 32.0537 38.6101 38.8396 62.28313 96.10962 96.65888 97.10575 98.94791
- * 1 1 3 1 2 1 1
- * > count=c(1,1,3,1,2,1,1)
- * > library(entropy)
- * > entropy(count,count/sum(count),c("CS"),c("log"))
- * [1] 2.57429
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(2.57429);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
-
- @Test
- public void emptyInputBagStreamingChaoShenEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
-
- writeLinesToFile("input"
- );
-
- test.runScript();
-
- /* Add expected values, computed using R:
- *
- * e.g.
- *
- * > v=c()
- * > table(v)
- * < table of extent 0 >
- * > count=c()
- * > library(entropy)
- * > entropy(count,count/sum(count),c("CS"),c("log"))
- * [1] 0
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- @Test
- public void singleElemInputBagStreamingChaoShenEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log");
-
- writeLinesToFile("input",
- "98.94791");
-
- test.runScript();
-
- /* Add expected values, computed using R:
- *
- * e.g.
- *
- * > count=c(1)
- * > library(entropy)
- * > entropy(count,count/sum(count),c("CS"),c("log"))
- * [1] 0
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.0);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- /**
- register $JAR_PATH
-
- define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type','$base');
-
- data = load 'input' as (x:chararray, y:double);
- --describe data;
- data_grouped = GROUP data ALL;
- data_out = FOREACH data_grouped {
- data_ordered = ORDER data BY x,y;
- GENERATE Entropy(data_ordered);
- };
- store data_out into 'output';
- */
- @Multiline private String pairEntropy;
-
- @Test
- public void dupPairValStreamingChaoShenEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(pairEntropy, "type=chaosh", "base=log");
-
- writeLinesToFile("input",
- "hadoop 98.94791",
- "bigdata 38.61010",
- "hadoop 97.10575",
- "datafu 32.05370",
- "bigdata 38.61010",
- "datafu 32.05370",
- "datafu 32.05370",
- "hadoop 38.61010",
- "pig 96.10962",
- "pig 20.41135");
-
- test.runScript();
-
- /* Add expected values, computed using R:
- *
- * e.g.
- * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
- * > t <- t[order(x,y)]
- * > count<-c(2,3,1,1,1,1,1)
- * > library(entropy)
- * > entropy(count,count/sum(count),c("CS"),c("log"))
- * [1] 2.57429
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(2.57429);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- @Test
- public void dupValStreamingChaoShenEntropoyLog2Test() throws Exception
- {
- PigTest test = createPigTestFromString(entropy,"type=chaosh", "base=log2");
-
- writeLinesToFile("input",
- "98.94791",
- "38.61010",
- "97.10575",
- "62.28313",
- "38.61010",
- "32.05370",
- "96.10962",
- "38.61010",
- "96.10962",
- "20.41135");
-
- test.runScript();
-
- /* Add expected values, computed using R:
- *
- * e.g.
- *
- * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
- * > table(v)
- * v
- * 20.41135 28.72388 32.0537 38.6101 38.8396 62.28313 96.10962 96.65888 97.10575 98.94791
- * 1 1 3 1 2 1 1
- * > count=c(1,1,3,1,2,1,1)
- * > freqs=count/sum(count)
- * > library(entropy)
- * > entropy(count,count/sum(count),c("CS"),c("log2"))
- * [1] 3.713915
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(3.713915);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- @Test
- public void dupValStreamingChaoShenEntropoyLog10Test() throws Exception
- {
- PigTest test = createPigTestFromString(entropy, "type=chaosh", "base=log10");
-
- writeLinesToFile("input",
- "98.94791",
- "38.61010",
- "97.10575",
- "62.28313",
- "38.61010",
- "32.05370",
- "96.10962",
- "38.61010",
- "96.10962",
- "20.41135");
-
- test.runScript();
-
- /* Add expected values, computed using R:
- *
- * e.g.
- *
- * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
- * > table(v)
- * v
- * 20.41135 28.72388 32.0537 38.6101 38.8396 62.28313 96.10962 96.65888 97.10575 98.94791
- * 1 1 3 1 2 1 1
- * > count=c(1,1,3,1,2,1,1)
- * > library(entropy)
- * > entropy(count,count/sum(count),c("CS"),c("log10"))
- * [1] 1.118
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(1.118);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java
deleted file mode 100644
index 151bf8b..0000000
--- a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalCondEntropyTests.java
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.stats.entropy;
-
-import static org.testng.Assert.*;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import datafu.test.pig.PigTests;
-
-
-/*
- * Use R function to compute condition entropy as the test benchmark
- * http://cran.r-project.org/web/packages/infotheo/infotheo.pdf
- */
-public class StreamingEmpiricalCondEntropyTests extends AbstractEntropyTests
-{
- /**
- register $JAR_PATH
-
- define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy();
-
- data = load 'input' as (valX:double, valY:chararray);
- --describe data;
- data_grouped = GROUP data ALL;
- data_out = FOREACH data_grouped {
- data_ordered = ORDER data BY *;
- GENERATE CondEntropy(data_ordered);
- };
- store data_out into 'output';
- */
- @Multiline private String condEntropy;
-
- @Test
- public void uniqValStreamingEmpiricalCondEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(condEntropy);
-
- writeLinesToFile("input",
- "98.94791 click",
- "38.61010 view",
- "97.10575 view",
- "62.28313 click",
- "38.83960 click",
- "32.05370 view",
- "96.10962 view",
- "28.72388 click",
- "96.65888 view",
- "20.41135 click");
-
- test.runScript();
-
- /*
- * library(infotheo)
- * X=c("98.94791","38.61010","97.10575","62.28313","38.83960","32.05370","96.10962","28.72388","96.65888","20.41135")
- * Y=c("click","view","view","click","click","view","view","click","view","click")
- * condentropy(Y,X)
- * [1] 0
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.0);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- @Test
- public void singleValStreamingEmpiricalCondEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(condEntropy);
-
- writeLinesToFile("input",
- "98.94791 click",
- "98.94791 click",
- "98.94791 click",
- "98.94791 click",
- "98.94791 click",
- "98.94791 click",
- "98.94791 click",
- "98.94791 click",
- "98.94791 click",
- "98.94791 click");
-
- test.runScript();
-
- /*
- * library(infotheo)
- * X=c("98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791","98.94791")
- * Y=c("click","click","click","click","click","click","click","click","click","click")
- * condentropy(Y,X)
- * [1] 0
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.0);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- @Test
- public void dupValStreamingEmpiricalCondEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(condEntropy);
-
- writeLinesToFile("input",
- "98.94791 click",
- "38.61010 click",
- "97.10575 view",
- "62.28313 view",
- "38.61010 view",
- "32.05370 view",
- "96.10962 click",
- "38.61010 click",
- "96.10962 view",
- "20.41135 click");
-
- test.runScript();
-
- /*
- * library(infotheo)
- * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
- * Y=c("click","click","view","view","view","view","click","click","view","click")
- * condentropy(Y,X)
- * [1] 0.3295837
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.3295837);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- @Test
- public void emptyInputBagStreamingEmpiricalCondEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(condEntropy);
-
- writeLinesToFile("input"
- );
-
- test.runScript();
-
- List<Double> expectedOutput = new ArrayList<Double>();
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- @Test
- public void singleElemInputBagStreamingEmpiricalCondEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(condEntropy);
-
- writeLinesToFile("input",
- "98.94791 view");
-
- test.runScript();
- /*
- * library(infotheo)
- * X = c("98.94791")
- * Y = c("view")
- * condentropy(Y,X)
- * [1] 0
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.0);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- /**
- register $JAR_PATH
-
- define Entropy datafu.pig.stats.entropy.StreamingCondEntropy();
-
- data = load 'input' as (valX1:chararray, valX2:chararray, valY:chararray);
- data = foreach data generate (valX1, valX2) as X, valY as Y;
- --describe data;
- data_grouped = GROUP data ALL;
- data_out = FOREACH data_grouped {
- data_ordered = ORDER data BY *;
- GENERATE Entropy(data_ordered);
- };
-
- store data_out into 'output';
- */
- @Multiline private String pairCondEntropy;
-
- @Test
- public void dupPairValStreamingEmpiricalCondEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(pairCondEntropy);
-
- writeLinesToFile("input",
- "hadoop bigdata click",
- "hadoop pig view",
- "hadoop datafu click",
- "datafu pig click",
- "bigdata pig view",
- "datafu pig click",
- "datafu pig view",
- "hadoop bigdata view",
- "pig datafu view",
- "pig datafu view");
-
- test.runScript();
-
- /*
- * library(infotheo)
- * X=c("hadoop bigdata","hadoop pig","hadoop datafu","datafu pig","bigdata pig","datafu pig","datafu pig","hadoop bigdata","pig datafu","pig datafu")
- * Y=c("click","view","click","click","view","click","view","view","view","view")
- * condentropy(X,Y)
- * [1] 0.3295837
- */
-
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.3295837);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- /**
- register $JAR_PATH
-
- define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy('$type','$base');
-
- data = load 'input' as (valX:double, valY:chararray);
- --describe data;
- data_grouped = GROUP data ALL;
- data_out = FOREACH data_grouped {
- data_ordered = ORDER data BY *;
- GENERATE CondEntropy(data_ordered);
- };
- store data_out into 'output';
- */
- @Multiline private String condLogEntropy;
-
- @Test
- public void dupValStreamingEmpiricalCondEntropoyLog2Test() throws Exception
- {
- PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log2");
-
- writeLinesToFile("input",
- "98.94791 click",
- "38.61010 click",
- "97.10575 view",
- "62.28313 view",
- "38.61010 view",
- "32.05370 view",
- "96.10962 click",
- "38.61010 click",
- "96.10962 view",
- "20.41135 click");
-
- test.runScript();
-
- /*
- * library(infotheo)
- * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
- * Y=c("click","click","view","view","view","view","click","click","view","click")
- * condentropy(Y,X)/log(2)
- * [1] 0.4754888
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.4754888);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- @Test
- public void dupValStreamingEmpiricalCondEntropoyLog10Test() throws Exception
- {
- PigTest test = createPigTestFromString(condLogEntropy, "type=empirical", "base=log10");
-
- writeLinesToFile("input",
- "98.94791 click",
- "38.61010 click",
- "97.10575 view",
- "62.28313 view",
- "38.61010 view",
- "32.05370 view",
- "96.10962 click",
- "38.61010 click",
- "96.10962 view",
- "20.41135 click");
-
- test.runScript();
-
- /*
- * library(infotheo)
- * X=c("98.94791","38.61010","97.10575","62.28313","38.61010","32.05370","96.10962","38.61010","96.10962","20.41135")
- * Y=c("click","click","view","view","view","view","click","click","view","click")
- * condentropy(Y,X)/log(10)
- * [1] 0.1431364
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.1431364);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5);
- }
-
- /**
- register $JAR_PATH
-
- define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy();
-
- data = load 'input' as (valX:double, valY:chararray);
- --describe data;
- data_grouped = GROUP data ALL;
- data_out = FOREACH data_grouped {
- GENERATE CondEntropy(data);
- };
- store data_out into 'output';
- */
- @Multiline private String noOrderCondEntropy;
-
- @Test
- public void noOrderStreamingEmpiricalCondEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(noOrderCondEntropy);
-
- writeLinesToFile("input",
- "98.94791 click",
- "38.61010 view",
- "97.10575 view",
- "62.28313 click",
- "38.83960 click",
- "32.05370 view",
- "96.10962 view",
- "28.72388 click",
- "96.65888 view",
- "20.41135 click");
-
- try {
- test.runScript();
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- fail( "Testcase should fail");
- } catch(Exception ex) {}
- }
-
- /**
- register $JAR_PATH
-
- define CondEntropy datafu.pig.stats.entropy.StreamingCondEntropy();
-
- data = load 'input' as (valX:double);
- --describe data;
- data_grouped = GROUP data ALL;
- data_out = FOREACH data_grouped {
- data_ordered = ORDER data BY *;
- GENERATE CondEntropy(data_ordered);
- };
- store data_out into 'output';
- */
- @Multiline private String invalidInputCondEntropy;
-
- @Test
- public void invalidInputStreamingEmpiricalCondEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(invalidInputCondEntropy);
-
- writeLinesToFile("input",
- "98.94791",
- "38.61010",
- "97.10575",
- "62.28313",
- "38.83960",
- "32.05370",
- "96.10962",
- "28.72388",
- "96.65888",
- "20.41135");
-
- try {
- test.runScript();
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- fail( "Testcase should fail");
- } catch(Exception ex) {
- assertTrue(ex.getMessage().indexOf("The field schema of the input tuple is null or its size is not 2") >= 0);
- }
- }
-
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/4aa2ef2a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java b/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java
deleted file mode 100644
index a98d79f..0000000
--- a/test/pig/datafu/test/pig/stats/entropy/StreamingEmpiricalEntropyTests.java
+++ /dev/null
@@ -1,429 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.stats.entropy;
-
-import static org.testng.Assert.*;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import datafu.test.pig.PigTests;
-
-/*
- * R's entropy library: http://cran.r-project.org/web/packages/entropy/entropy.pdf
- * used as our test benchmark
- */
-public class StreamingEmpiricalEntropyTests extends AbstractEntropyTests
-{
- /**
- register $JAR_PATH
-
- define Entropy datafu.pig.stats.entropy.StreamingEntropy();
-
- data = load 'input' as (val:double);
- --describe data;
- data_grouped = GROUP data ALL;
- data_out = FOREACH data_grouped {
- data_ordered = ORDER data BY val;
- GENERATE Entropy(data_ordered);
- };
- store data_out into 'output';
- */
- @Multiline private String entropy;
-
- @Test
- public void uniqValStreamingEmpiricalEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(entropy);
- // Ten distinct values, so each value occurs exactly once.
- writeLinesToFile("input",
- "98.94791",
- "38.61010",
- "97.10575",
- "62.28313",
- "38.83960",
- "32.05370",
- "96.10962",
- "28.72388",
- "96.65888",
- "20.41135");
-
- test.runScript();
-
- /* Expected value, computed in R with the 'entropy' package:
- *
- * R session:
- *
- * > v=c(98.94791,38.61010,97.10575,62.28313,38.83960,32.05370,96.10962,28.72388,96.65888,20.41135)
- * > table(v)
- * v
- * 20.41135 28.72388 32.0537 38.6101 38.8396 62.28313 96.10962 96.65888 97.10575 98.94791
- * 1 1 1 1 1 1 1 1 1 1
- * > count=c(1,1,1,1,1,1,1,1,1,1)
- * > library(entropy)
- * > entropy(count)
- * [1] 2.302585
- * (2.302585 is ln(10): ten equally frequent values, natural-log units)
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(2.302585);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5); // 5: presumably the number of decimal places compared -- confirm in AbstractEntropyTests
- }
-
- @Test
- public void singleValStreamingEmpiricalEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(entropy);
- // The same value repeated ten times: a single distinct value with count 10.
- writeLinesToFile("input",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791",
- "98.94791");
-
- test.runScript();
-
- /* Expected value, computed in R with the 'entropy' package:
- *
- * R session:
- *
- * > v=c(98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791,98.94791)
- * > table(v)
- * v
- * 98.94791
- * 10
- * > count=(10)
- * > library(entropy)
- * > entropy(count)
- * [1] 0
- * (a single distinct value carries no uncertainty, hence 0)
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.0);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5); // 5: presumably the number of decimal places compared -- confirm in AbstractEntropyTests
- }
-
- @Test
- public void dupValStreamingEmpiricalEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(entropy);
- // Ten rows with duplicates: 38.61010 occurs three times and 96.10962 twice; the other five values occur once.
- writeLinesToFile("input",
- "98.94791",
- "38.61010",
- "97.10575",
- "62.28313",
- "38.61010",
- "32.05370",
- "96.10962",
- "38.61010",
- "96.10962",
- "20.41135");
-
- test.runScript();
-
- /* Expected value, computed in R with the 'entropy' package:
- *
- * R session:
- *
- * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
- * > table(v)
- * v
- * 20.41135 32.0537 38.6101 62.28313 96.10962 97.10575 98.94791
- * 1 1 3 1 2 1 1
- * > count=c(1,1,3,1,2,1,1)
- * > library(entropy)
- * > entropy(count)
- * [1] 1.834372
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(1.834372);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5); // 5: presumably the number of decimal places compared -- confirm in AbstractEntropyTests
- }
-
- @Test
- public void emptyInputBagStreamingEmpiricalEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(entropy);
- // Only the file name is passed, with no data lines.
- writeLinesToFile("input"
- );
-
- test.runScript();
-
- /* Reference value, computed in R with the 'entropy' package:
- *
- * R session:
- *
- * > v=c()
- * > table(v)
- * < table of extent 0 >
- * > count=c()
- * > library(entropy)
- * > entropy(count)
- * [1] 0
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- // NOTE(review): nothing is added, so the comparison is against an empty list rather than R's 0 -- presumably 'data_out' has no rows for empty input; confirm.
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5); // 5: presumably the number of decimal places compared -- confirm in AbstractEntropyTests
- }
-
- @Test
- public void singleElemInputBagStreamingEmpiricalEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(entropy);
- // A single input row: one value with count 1.
- writeLinesToFile("input",
- "98.94791");
-
- test.runScript();
-
- /* Expected value, computed in R with the 'entropy' package:
- *
- * R session:
- *
- * > count=c(1)
- * > library(entropy)
- * > entropy(count)
- * [1] 0
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.0);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5); // 5: presumably the number of decimal places compared -- confirm in AbstractEntropyTests
- }
-
- /**
- register $JAR_PATH
-
- define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type', '$base');
-
- data = load 'input' as (x:chararray, y:double);
- --describe data;
- data_grouped = GROUP data ALL;
- data_out = FOREACH data_grouped {
- data_ordered = ORDER data BY x,y;
- GENERATE Entropy(data_ordered);
- };
- store data_out into 'output';
- */
- @Multiline private String pairEntropy;
-
- @Test
- public void dupPairValStreamingEmpiricalEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(pairEntropy, "type=empirical", "base=log"); // the two strings fill the script's $type and $base parameters
- // Ten (name, number) rows; the pair (bigdata, 38.61010) occurs twice and (datafu, 32.05370) three times.
- writeLinesToFile("input",
- "hadoop 98.94791",
- "bigdata 38.61010",
- "hadoop 97.10575",
- "datafu 32.05370",
- "bigdata 38.61010",
- "datafu 32.05370",
- "datafu 32.05370",
- "hadoop 38.61010",
- "pig 96.10962",
- "pig 20.41135");
-
- test.runScript();
-
- /* Expected value, computed in R with the 'entropy' package:
- *
- * R session (counts are per distinct (x, y) pair after ordering by x, y):
- * > t <- data.table(x=c("hadoop","bigdata","hadoop","datafu","bigdata","datafu","datafu","hadoop","pig","pig"),y=c(98.94791,38.61010,97.10575,32.05370,38.61010,32.05370,32.05370,38.61010,96.10962,20.41135))
- * > t <- t[order(x,y)]
- * > count<-c(2,3,1,1,1,1,1)
- * > library(entropy)
- * > entropy(count)
- * [1] 1.834372
- *
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(1.834372);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5); // 5: presumably the number of decimal places compared -- confirm in AbstractEntropyTests
- }
-
- /**
- register $JAR_PATH
-
- define Entropy datafu.pig.stats.entropy.StreamingEntropy('$type', '$base');
-
- data = load 'input' as (val:double);
- --describe data;
- data_grouped = GROUP data ALL;
- data_out = FOREACH data_grouped {
- data_ordered = ORDER data BY val;
- GENERATE Entropy(data_ordered);
- };
- store data_out into 'output';
- */
- @Multiline private String logEntropy;
-
- @Test
- public void dupValStreamingEmpiricalEntropoyLog2Test() throws Exception
- {
- PigTest test = createPigTestFromString(logEntropy, "type=empirical", "base=log2"); // the two strings fill the script's $type and $base parameters
- // Ten rows with duplicates: 38.61010 occurs three times and 96.10962 twice; the other five values occur once.
- writeLinesToFile("input",
- "98.94791",
- "38.61010",
- "97.10575",
- "62.28313",
- "38.61010",
- "32.05370",
- "96.10962",
- "38.61010",
- "96.10962",
- "20.41135");
-
- test.runScript();
-
- /* Expected value, computed in R with the 'entropy' package:
- *
- * R session:
- *
- * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
- * > table(v)
- * v
- * 20.41135 32.0537 38.6101 62.28313 96.10962 97.10575 98.94791
- * 1 1 3 1 2 1 1
- * > count=c(1,1,3,1,2,1,1)
- * > freqs=count/sum(count)
- * > library(entropy)
- * > entropy(count, freqs, c("ML"), c("log2"))
- * [1] 2.646439
- * (equals the natural-log entropy of these counts, 1.834372, divided by ln 2)
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(2.646439);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5); // 5: presumably the number of decimal places compared -- confirm in AbstractEntropyTests
- }
-
- @Test
- public void dupValStreamingEmpiricalEntropoyLog10Test() throws Exception
- {
- PigTest test = createPigTestFromString(logEntropy, "type=empirical", "base=log10"); // the two strings fill the script's $type and $base parameters
- // Ten rows with duplicates: 38.61010 occurs three times and 96.10962 twice; the other five values occur once.
- writeLinesToFile("input",
- "98.94791",
- "38.61010",
- "97.10575",
- "62.28313",
- "38.61010",
- "32.05370",
- "96.10962",
- "38.61010",
- "96.10962",
- "20.41135");
-
- test.runScript();
-
- /* Expected value, computed in R with the 'entropy' package:
- *
- * R session:
- *
- * > v=c(98.94791,38.61010,97.10575,62.28313,38.61010,32.05370,96.10962,38.61010,96.10962,20.41135)
- * > table(v)
- * v
- * 20.41135 32.0537 38.6101 62.28313 96.10962 97.10575 98.94791
- * 1 1 3 1 2 1 1
- * > count=c(1,1,3,1,2,1,1)
- * > freqs=count/sum(count)
- * > library(entropy)
- * > entropy(count, freqs, c("ML"), c("log10"))
- * [1] 0.7966576
- * (equals the natural-log entropy of these counts, 1.834372, divided by ln 10)
- */
- List<Double> expectedOutput = new ArrayList<Double>();
- expectedOutput.add(0.7966576);
-
- List<Tuple> output = this.getLinesForAlias(test, "data_out");
- verifyEqualEntropyOutput(expectedOutput, output, 5); // 5: presumably the number of decimal places compared -- confirm in AbstractEntropyTests
- }
-
- /**
- register $JAR_PATH
-
- define Entropy datafu.pig.stats.entropy.StreamingEntropy();
-
- data = load 'input' as (val:double);
- --describe data;
- data_grouped = GROUP data ALL;
- data_out = FOREACH data_grouped {
- GENERATE Entropy(data);
- };
- store data_out into 'output';
- */
- @Multiline private String noOrderEntropy;
-
- @Test
- public void noOrderStreamingEmpiricalEntropoyTest() throws Exception
- {
- PigTest test = createPigTestFromString(noOrderEntropy);
- // Negative test: eleven unsorted rows; 38.61010 appears four times and not all occurrences are adjacent.
- writeLinesToFile("input",
- "98.94791",
- "38.61010",
- "38.61010",
- "37.10575",
- "62.28313",
- "38.61010",
- "32.05370",
- "96.10962",
- "38.61010",
- "96.10962",
- "20.41135");
- // The test passes only if running the script or reading 'data_out' throws -- presumably because the bag is not ordered; confirm against StreamingEntropy.
- try {
- test.runScript();
- List<Tuple> output = this.getLinesForAlias(test, "data_out"); // result is never inspected
- fail( "Testcase should fail"); // reached only when neither call above threw
- } catch(Exception ex) { // NOTE(review): any Exception is accepted; neither its type nor its message is checked
- }
- }
-}
|