http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/test/pig/datafu/test/pig/PigTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/PigTests.java b/test/pig/datafu/test/pig/PigTests.java
deleted file mode 100644
index 9b4eddd..0000000
--- a/test/pig/datafu/test/pig/PigTests.java
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig;
-
-import static org.testng.Assert.*;
-
-import java.io.BufferedInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileWriter;
-import java.io.FilenameFilter;
-import java.io.IOException;
-import java.lang.reflect.Method;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-import java.util.regex.Pattern;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.metrics.jvm.JvmMetrics;
-import org.apache.log4j.Level;
-import org.apache.log4j.Logger;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.pigunit.PigTest;
-import org.apache.pig.tools.parameters.ParseException;
-
-public abstract class PigTests
-{
- @org.testng.annotations.BeforeClass
- public void beforeClass()
- {
- // TODO make it configurable whether this happens, for travis-ci we can't spam the logs so much,
- // however otherwise it is useful to see the errors
- Logger.getRootLogger().removeAllAppenders();
- Logger.getLogger(JvmMetrics.class).setLevel(Level.OFF);
- }
-
- @org.testng.annotations.BeforeMethod
- public void beforeMethod(Method method)
- {
- System.out.println("\n*** Running " + method.getName() + " ***");
- }
-
- protected String[] getDefaultArgs()
- {
- String[] args = {
- "JAR_PATH=" + getJarPath()
- };
- return args;
- }
-
- protected List<String> getDefaultArgsAsList()
- {
- String[] args = getDefaultArgs();
- List<String> argsList = new ArrayList<String>(args.length);
- for (String arg : args)
- {
- argsList.add(arg);
- }
- return argsList;
- }
-
- protected PigTest createPigTestFromString(String str, String... args) throws IOException
- {
- return createPigTest(str.split("\n"),args);
- }
-
- protected PigTest createPigTest(String[] lines, String... args) throws IOException
- {
- // append args to list of default args
- List<String> theArgs = getDefaultArgsAsList();
- for (String arg : args)
- {
- theArgs.add(arg);
- }
-
- for (String arg : theArgs)
- {
- String[] parts = arg.split("=",2);
- if (parts.length == 2)
- {
- for (int i=0; i<lines.length; i++)
- {
- lines[i] = lines[i].replaceAll(Pattern.quote("$" + parts[0]), parts[1]);
- }
- }
- }
-
- return new PigTest(lines);
- }
-
- protected PigTest createPigTest(String scriptPath, String... args) throws IOException
- {
- return createPigTest(getLinesFromFile(scriptPath), args);
- }
-
- protected String getJarPath()
- {
- String jarDir = null;
-
- if (System.getProperty("datafu.jar.dir") != null)
- {
- jarDir = System.getProperty("datafu.jar.dir");
- }
- else
- {
- jarDir = new File(System.getProperty("user.dir"), "dist").getAbsolutePath();
- }
-
- File userDir = new File(jarDir);
-
- String[] files = userDir.list(new FilenameFilter() {
-
- @Override
- public boolean accept(File dir, String name)
- {
- return name.endsWith(".jar") && !name.contains("sources") && !name.contains("javadoc");
- }
-
- });
-
- if (files == null || files.length == 0)
- {
- throw new RuntimeException("Could not find JAR file");
- }
- else if (files.length > 1)
- {
- StringBuilder sb = new StringBuilder();
- for (String file : files)
- {
- sb.append(file);
- sb.append(",");
- }
- throw new RuntimeException("Found more JAR files than expected: " + sb.substring(0, sb.length()-1));
- }
-
- return userDir.getAbsolutePath() + "/" + files[0];
- }
-
- protected List<Tuple> getLinesForAlias(PigTest test, String alias) throws IOException, ParseException
- {
- return getLinesForAlias(test,alias,true);
- }
-
- protected List<Tuple> getLinesForAlias(PigTest test, String alias, boolean logValues) throws IOException, ParseException
- {
- Iterator<Tuple> tuplesIterator = test.getAlias(alias);
- List<Tuple> tuples = new ArrayList<Tuple>();
- if (logValues)
- {
- System.out.println(String.format("Values for %s: ", alias));
- }
- while (tuplesIterator.hasNext())
- {
- Tuple tuple = tuplesIterator.next();
- if (logValues)
- {
- System.out.println(tuple.toString());
- }
- tuples.add(tuple);
- }
- return tuples;
- }
-
- protected void writeLinesToFile(String fileName, String... lines) throws IOException
- {
- File inputFile = deleteIfExists(getFile(fileName));
- writeLinesToFile(inputFile, lines);
- }
-
- protected void writeLinesToFile(File file, String[] lines) throws IOException
- {
- FileWriter writer = new FileWriter(file);
- for (String line : lines)
- {
- writer.write(line + "\n");
- }
- writer.close();
- }
-
- protected void assertOutput(PigTest test, String alias, String... expected) throws IOException, ParseException
- {
- List<Tuple> tuples = getLinesForAlias(test, alias);
- assertEquals(expected.length, tuples.size());
- int i=0;
- for (String e : expected)
- {
- assertEquals(tuples.get(i++).toString(), e);
- }
- }
-
- protected File deleteIfExists(File file)
- {
- if (file.exists())
- {
- file.delete();
- }
- return file;
- }
-
- protected File getFile(String fileName)
- {
- return new File(System.getProperty("user.dir"), fileName).getAbsoluteFile();
- }
-
- /**
- * Gets the lines from a given file.
- *
- * @param relativeFilePath The path relative to the datafu-tests project.
- * @return The lines from the file
- * @throws IOException
- */
- protected String[] getLinesFromFile(String relativeFilePath) throws IOException
- {
- // assume that the working directory is the datafu-tests project
- File file = new File(System.getProperty("user.dir"), relativeFilePath).getAbsoluteFile();
- BufferedInputStream content = new BufferedInputStream(new FileInputStream(file));
- Object[] lines = IOUtils.readLines(content).toArray();
- String[] result = new String[lines.length];
- for (int i=0; i<lines.length; i++)
- {
- result[i] = (String)lines[i];
- }
- return result;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/test/pig/datafu/test/pig/bags/BagTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/bags/BagTests.java b/test/pig/datafu/test/pig/bags/BagTests.java
deleted file mode 100644
index 80bb0cc..0000000
--- a/test/pig/datafu/test/pig/bags/BagTests.java
+++ /dev/null
@@ -1,1254 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.bags;
-
-import static org.testng.Assert.assertEquals;
-
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-
-import junit.framework.Assert;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.data.BagFactory;
-import org.apache.pig.data.DataBag;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.data.TupleFactory;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import datafu.pig.bags.CountEach;
-import datafu.pig.bags.DistinctBy;
-import datafu.pig.bags.Enumerate;
-import datafu.test.pig.PigTests;
-
-
-public class BagTests extends PigTests
-{
- /**
- register $JAR_PATH
-
- define NullToEmptyBag datafu.pig.bags.NullToEmptyBag();
-
- data = LOAD 'input' AS (B: bag {T: tuple(v:INT)});
-
- dump data;
-
- data2 = FOREACH data GENERATE NullToEmptyBag(B) as P;
-
- dump data2;
-
- STORE data2 INTO 'output';
- */
- @Multiline
- private String nullToEmptyBag;
-
- @Test
- public void nullToEmptyBagTest() throws Exception
- {
- PigTest test = createPigTestFromString(nullToEmptyBag);
-
- writeLinesToFile("input",
- "({(1),(2),(3),(4),(5)})",
- "()",
- "{(4),(5)})");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "({(1),(2),(3),(4),(5)})",
- "({})",
- "({(4),(5)})");
- }
-
- /**
- register $JAR_PATH
-
- define EmptyBagToNull datafu.pig.bags.EmptyBagToNull();
-
- data = LOAD 'input' AS (B: bag {T: tuple(v:INT)});
-
- dump data;
-
- data2 = FOREACH data GENERATE EmptyBagToNull(B) as P;
-
- dump data2;
-
- STORE data2 INTO 'output';
- */
- @Multiline
- private String emptyBagToNullTest;
-
- @Test
- public void emptyBagToNullTest() throws Exception
- {
- PigTest test = createPigTestFromString(emptyBagToNullTest);
-
- writeLinesToFile("input",
- "({(1),(2),(3),(4),(5)})",
- "()",
- "({})",
- "{(4),(5)})");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "({(1),(2),(3),(4),(5)})",
- "()",
- "()",
- "({(4),(5)})");
- }
-
- /**
- register $JAR_PATH
-
- define EmptyBagToNullFields datafu.pig.bags.EmptyBagToNullFields();
-
- data = LOAD 'input' AS (B: bag {T: tuple(v1:INT,v2:INT)});
-
- dump data;
-
- data2 = FOREACH data GENERATE EmptyBagToNullFields(B) as P;
-
- dump data2;
-
- STORE data2 INTO 'output';
- */
- @Multiline
- private String emptyBagToNullFieldsTest;
-
- @Test
- public void emptyBagToNullFieldsTest() throws Exception
- {
- PigTest test = createPigTestFromString(emptyBagToNullFieldsTest);
-
- writeLinesToFile("input",
- "({(1,1),(2,2),(3,3),(4,4),(5,5)})",
- "({})",
- "{(4,4),(5,5)})");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "({(1,1),(2,2),(3,3),(4,4),(5,5)})",
- "({(,)})",
- "({(4,4),(5,5)})");
- }
-
- /**
- register $JAR_PATH
-
- define AppendToBag datafu.pig.bags.AppendToBag();
-
- data = LOAD 'input' AS (key:INT, B: bag{T: tuple(v:INT)}, T: tuple(v:INT));
-
- data2 = FOREACH data GENERATE key, AppendToBag(B,T) as B;
-
- STORE data2 INTO 'output';
-
- */
- @Multiline
- private String appendToBagTest;
-
- @Test
- public void appendToBagTest() throws Exception
- {
- PigTest test = createPigTestFromString(appendToBagTest);
-
- writeLinesToFile("input",
- "1\t{(1),(2),(3)}\t(4)",
- "2\t{(10),(20),(30),(40),(50)}\t(60)");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "(1,{(1),(2),(3),(4)})",
- "(2,{(10),(20),(30),(40),(50),(60)})");
- }
-
- /**
- register $JAR_PATH
-
- define FirstTupleFromBag datafu.pig.bags.FirstTupleFromBag();
-
- data = LOAD 'input' AS (key:INT, B: bag{T: tuple(v:INT)});
-
- data2 = FOREACH data GENERATE key, FirstTupleFromBag(B, null) as B;
-
- STORE data2 INTO 'output';
-
- */
- @Multiline
- private String firstTupleFromBagTest;
-
- @Test
- public void firstTupleFromBagTest() throws Exception
- {
- PigTest test = createPigTestFromString(firstTupleFromBagTest);
-
- writeLinesToFile("input", "1\t{(4),(9),(16)}");
-
- test.runScript();
-
- assertOutput(test, "data2", "(1,(4))");
- }
-
- /**
- register $JAR_PATH
-
- define PrependToBag datafu.pig.bags.PrependToBag();
-
- data = LOAD 'input' AS (key:INT, B: bag{T: tuple(v:INT)}, T: tuple(v:INT));
-
- data2 = FOREACH data GENERATE key, PrependToBag(B,T) as B;
-
- STORE data2 INTO 'output';
-
- */
- @Multiline
- private String prependToBagTest;
-
- @Test
- public void prependToBagTest() throws Exception
- {
- PigTest test = createPigTestFromString(prependToBagTest);
-
- writeLinesToFile("input",
- "1\t{(1),(2),(3)}\t(4)",
- "2\t{(10),(20),(30),(40),(50)}\t(60)");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "(1,{(4),(1),(2),(3)})",
- "(2,{(60),(10),(20),(30),(40),(50)})");
- }
-
- /**
- register $JAR_PATH
-
- define BagConcat datafu.pig.bags.BagConcat();
-
- data = LOAD 'input' AS (A: bag{T: tuple(v:INT)}, B: bag{T: tuple(v:INT)}, C: bag{T: tuple(v:INT)});
-
- describe data;
-
- data2 = FOREACH data GENERATE BagConcat(A,B,C);
-
- describe data2;
-
- STORE data2 INTO 'output';
- */
- @Multiline
- private String bagConcatTest;
-
- @Test
- public void bagConcatTest() throws Exception
- {
- PigTest test = createPigTestFromString(bagConcatTest);
-
- writeLinesToFile("input",
- "({(1),(2),(3)}\t{(3),(5),(6)}\t{(10),(13)})",
- "({(2),(3),(4)}\t{(5),(5)}\t{(20)})");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "({(1),(2),(3),(3),(5),(6),(10),(13)})",
- "({(2),(3),(4),(5),(5),(20)})");
- }
-
- /**
- register $JAR_PATH
-
- define UnorderedPairs datafu.pig.bags.UnorderedPairs();
-
- data = LOAD 'input' AS (B: bag {T: tuple(v:INT)});
-
- data2 = FOREACH data GENERATE UnorderedPairs(B) as P;
-
- data3 = FOREACH data2 GENERATE FLATTEN(P);
-
- data4 = FOREACH data3 GENERATE FLATTEN(elem1), FLATTEN(elem2);
-
- data5 = ORDER data4 BY $0, $1;
-
- STORE data5 INTO 'output';
-
-
- */
- @Multiline
- private String unorderedPairsTest;
-
- @Test
- public void unorderedPairsTest() throws Exception
- {
- PigTest test = createPigTestFromString(unorderedPairsTest);
-
- String[] input = {
- "{(1),(2),(3),(4),(5)}"
- };
-
- String[] output = {
- "(1,2)",
- "(1,3)",
- "(1,4)",
- "(1,5)",
- "(2,3)",
- "(2,4)",
- "(2,5)",
- "(3,4)",
- "(3,5)",
- "(4,5)"
- };
-
- test.assertOutput("data",input,"data4",output);
- }
-
- /**
- register $JAR_PATH
-
- define UnorderedPairs datafu.pig.bags.UnorderedPairs();
-
- data = LOAD 'input' AS (A:int, B: bag {T: tuple(v:INT)});
-
- data2 = FOREACH data GENERATE A, UnorderedPairs(B) as P;
-
- data3 = FOREACH data2 GENERATE A, FLATTEN(P);
-
- STORE data3 INTO 'output';
-
- */
- @Multiline
- private String unorderedPairsTest2;
-
- @Test
- public void unorderedPairsTest2() throws Exception
- {
- PigTest test = createPigTestFromString(unorderedPairsTest2);
-
- this.writeLinesToFile("input", "1\t{(1),(2),(3),(4),(5)}");
-
- test.runScript();
- this.getLinesForAlias(test, "data3");
-
- this.assertOutput(test, "data3",
- "(1,(1),(2))",
- "(1,(1),(3))",
- "(1,(1),(4))",
- "(1,(1),(5))",
- "(1,(2),(3))",
- "(1,(2),(4))",
- "(1,(2),(5))",
- "(1,(3),(4))",
- "(1,(3),(5))",
- "(1,(4),(5))");
- }
-
- /**
- register $JAR_PATH
-
- define BagSplit datafu.pig.bags.BagSplit();
-
- data = LOAD 'input' AS (B:bag{T:tuple(val1:INT,val2:INT)});
-
- data2 = FOREACH data GENERATE BagSplit($MAX,B);
- --describe data2;
-
- data3 = FOREACH data2 GENERATE FLATTEN($0);
-
- --describe data3
-
- STORE data3 INTO 'output';
-
- */
- @Multiline
- private String bagSplitTest;
-
- @Test
- public void bagSplitTest() throws Exception
- {
- PigTest test = createPigTestFromString(bagSplitTest,
- "MAX=5");
-
- writeLinesToFile("input",
- "{(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010),(11,1111),(12,1212)}");
-
- test.runScript();
-
- assertOutput(test, "data3",
- "({(1,11),(2,22),(3,33),(4,44),(5,55)})",
- "({(6,66),(7,77),(8,88),(9,99),(10,1010)})",
- "({(11,1111),(12,1212)})");
- }
-
- /**
- register $JAR_PATH
-
- define BagSplit datafu.pig.bags.BagSplit('true');
-
- data = LOAD 'input' AS (B:bag{T:tuple(val1:INT,val2:INT)});
-
- data2 = FOREACH data GENERATE BagSplit($MAX,B);
-
- data3 = FOREACH data2 GENERATE FLATTEN($0);
-
- STORE data3 INTO 'output';
- */
- @Multiline
- private String bagSplitWithBagNumTest;
-
- @Test
- public void bagSplitWithBagNumTest() throws Exception
- {
- PigTest test = createPigTestFromString(bagSplitWithBagNumTest,
- "MAX=10");
-
- writeLinesToFile("input",
- "{(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010),(11,1111),(12,1212)}");
-
- test.runScript();
-
- assertOutput(test, "data3",
- "({(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010)},0)",
- "({(11,1111),(12,1212)},1)");
- }
-
- /**
- register $JAR_PATH
-
- define Enumerate datafu.pig.bags.ReverseEnumerate('1');
-
- data = LOAD 'input' AS (data: bag {T: tuple(v1:INT,B: bag{T: tuple(v2:INT)})});
-
- data2 = FOREACH data GENERATE Enumerate(data);
- --describe data2;
-
- data3 = FOREACH data2 GENERATE FLATTEN($0);
- --describe data3;
-
- data4 = FOREACH data3 GENERATE $0 as v1, $1 as B, $2 as i;
- --describe data4;
-
- STORE data4 INTO 'output';
-
- */
- @Multiline
- private String enumerateWithReverseTest;
-
- @Test
- public void enumerateWithReverseTest() throws Exception
- {
- PigTest test = createPigTestFromString(enumerateWithReverseTest);
-
- writeLinesToFile("input",
- "({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})");
-
- test.runScript();
-
- assertOutput(test, "data4",
- "(10,{(1),(2),(3)},5)",
- "(20,{(4),(5),(6)},4)",
- "(30,{(7),(8)},3)",
- "(40,{(9),(10),(11)},2)",
- "(50,{(12),(13),(14),(15)},1)");
- }
-
- /**
- register $JAR_PATH
-
- define Enumerate datafu.pig.bags.Enumerate('1');
-
- data = LOAD 'input' AS (data: bag {T: tuple(v1:INT,B: bag{T: tuple(v2:INT)})});
-
- data2 = FOREACH data GENERATE Enumerate(data);
- --describe data2;
-
- data3 = FOREACH data2 GENERATE FLATTEN($0);
- --describe data3;
-
- data4 = FOREACH data3 GENERATE $0 as v1, $1 as B, $2 as i;
- --describe data4;
-
- STORE data4 INTO 'output';
-
- */
- @Multiline
- private String enumerateWithStartTest;
-
- @Test
- public void enumerateWithStartTest() throws Exception
- {
- PigTest test = createPigTestFromString(enumerateWithStartTest);
-
- writeLinesToFile("input",
- "({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})");
-
- test.runScript();
-
- assertOutput(test, "data4",
- "(10,{(1),(2),(3)},1)",
- "(20,{(4),(5),(6)},2)",
- "(30,{(7),(8)},3)",
- "(40,{(9),(10),(11)},4)",
- "(50,{(12),(13),(14),(15)},5)");
- }
-
- /**
- register $JAR_PATH
-
- define Enumerate datafu.pig.bags.Enumerate();
-
- data = LOAD 'input' AS (data: bag {T: tuple(v1:INT,B: bag{T: tuple(v2:INT)})});
-
- data2 = FOREACH data GENERATE Enumerate(data);
- --describe data2;
-
- data3 = FOREACH data2 GENERATE FLATTEN($0);
- --describe data3;
-
- data4 = FOREACH data3 GENERATE $0 as v1, $1 as B, $2 as i;
- --describe data4;
-
- STORE data4 INTO 'output';
-
- */
- @Multiline
- private String enumerateTest;
-
- @Test
- public void enumerateTest() throws Exception
- {
- PigTest test = createPigTestFromString(enumerateTest);
-
- writeLinesToFile("input",
- "({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})");
-
- test.runScript();
-
- assertOutput(test, "data4",
- "(10,{(1),(2),(3)},0)",
- "(20,{(4),(5),(6)},1)",
- "(30,{(7),(8)},2)",
- "(40,{(9),(10),(11)},3)",
- "(50,{(12),(13),(14),(15)},4)");
- }
-
- @Test
- public void enumerateTest2() throws Exception
- {
- PigTest test = createPigTestFromString(enumerateTest);
-
- writeLinesToFile("input",
- "({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})",
- "({(11,{(11),(12),(13),(14)}),(21,{(15),(16),(17),(18)}),(31,{(19),(20)}),(41,{(21),(22),(23),(24)}),(51,{(25),(26),(27)})})");
-
- test.runScript();
-
- assertOutput(test, "data4",
- "(10,{(1),(2),(3)},0)",
- "(20,{(4),(5),(6)},1)",
- "(30,{(7),(8)},2)",
- "(40,{(9),(10),(11)},3)",
- "(50,{(12),(13),(14),(15)},4)",
- "(11,{(11),(12),(13),(14)},0)",
- "(21,{(15),(16),(17),(18)},1)",
- "(31,{(19),(20)},2)",
- "(41,{(21),(22),(23),(24)},3)",
- "(51,{(25),(26),(27)},4)");
- }
-
- /*
- * Testing "Accumulator" part of Enumeration by manually invoking accumulate(), getValue() and cleanup()
- */
- @Test
- public void enumerateAccumulatorTest() throws Exception
- {
- Enumerate enumerate = new Enumerate();
-
- Tuple tuple1 = TupleFactory.getInstance().newTuple(1);
- tuple1.set(0, 10);
-
- Tuple tuple2 = TupleFactory.getInstance().newTuple(1);
- tuple2.set(0, 20);
-
- Tuple tuple3 = TupleFactory.getInstance().newTuple(1);
- tuple3.set(0, 30);
-
- Tuple tuple4 = TupleFactory.getInstance().newTuple(1);
- tuple4.set(0, 40);
-
- Tuple tuple5 = TupleFactory.getInstance().newTuple(1);
- tuple5.set(0, 50);
-
- DataBag bag1 = BagFactory.getInstance().newDefaultBag();
- bag1.add(tuple1);
- bag1.add(tuple2);
- bag1.add(tuple3);
-
- DataBag bag2 = BagFactory.getInstance().newDefaultBag();
- bag2.add(tuple4);
- bag2.add(tuple5);
-
- Tuple inputTuple1 = TupleFactory.getInstance().newTuple(1);
- inputTuple1.set(0,bag1);
-
- Tuple inputTuple2 = TupleFactory.getInstance().newTuple(1);
- inputTuple2.set(0,bag2);
-
- enumerate.accumulate(inputTuple1);
- enumerate.accumulate(inputTuple2);
- assertEquals(enumerate.getValue().toString(), "{(10,0),(20,1),(30,2),(40,3),(50,4)}");
-
- // Testing that cleanup code is correct by calling cleanup() and passing inputs back to Enumerate instance
- enumerate.cleanup();
- enumerate.accumulate(inputTuple1);
- enumerate.accumulate(inputTuple2);
- assertEquals(enumerate.getValue().toString(), "{(10,0),(20,1),(30,2),(40,3),(50,4)}");
- }
-
- /**
- register $JAR_PATH
-
- define BagSplit datafu.pig.bags.BagSplit();
- define Enumerate datafu.pig.bags.Enumerate('1');
-
- data = LOAD 'input' AS (data: bag {T: tuple(name:CHARARRAY, score:double)});
-
- data2 = FOREACH data GENERATE BagSplit(3,data) as the_bags;
-
- --describe data2
-
- data3 = FOREACH data2 GENERATE Enumerate(the_bags) as enumerated_bags;
-
- --describe data3
-
- data4 = FOREACH data3 GENERATE FLATTEN(enumerated_bags) as (data,i);
-
- --describe data4
-
- data5 = FOREACH data4 GENERATE data as the_data, i as the_key;
-
- --describe data5
-
- data_out = FOREACH data5 GENERATE FLATTEN(the_data), the_key;
-
- --describe data_out
- */
- @Multiline
- private String comprehensiveBagSplitAndEnumerate;
-
- @Test
- public void comprehensiveBagSplitAndEnumerate() throws Exception
- {
- PigTest test = createPigTestFromString(comprehensiveBagSplitAndEnumerate);
-
- writeLinesToFile("input",
- "({(A,1.0),(B,2.0),(C,3.0),(D,4.0),(E,5.0)})");
-
- test.runScript();
-
- assertOutput(test, "data_out",
- // bag #1
- "(A,1.0,1)",
- "(B,2.0,1)",
- "(C,3.0,1)",
- // bag #2
- "(D,4.0,2)",
- "(E,5.0,2)");
- }
-
- /**
- register $JAR_PATH
-
- define DistinctBy datafu.pig.bags.DistinctBy('0');
-
- data = LOAD 'input' AS (data: bag {T: tuple(a:CHARARRAY, b:INT, c:INT)});
-
- data2 = FOREACH data GENERATE DistinctBy(data);
-
- --describe data2;
-
- STORE data2 INTO 'output';
-
- */
- @Multiline
- private String distinctByTest;
-
- @Test
- public void distinctByTest() throws Exception
- {
- PigTest test = createPigTestFromString(distinctByTest);
-
- writeLinesToFile("input",
- "({(Z,1,0),(A,1,0),(A,1,0),(B,2,0),(B,22,1),(C,3,0),(D,4,0),(E,5,0)})",
- "({(A,10,2),(M,50,3),(A,34,49), (A,24,42), (Z,49,22),(B,1,1)},(B,2,2))");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "({(Z,1,0),(A,1,0),(B,2,0),(C,3,0),(D,4,0),(E,5,0)})",
- "({(A,10,2),(M,50,3),(Z,49,22),(B,1,1)})");
- }
-
- /**
- register $JAR_PATH
-
- define DistinctBy datafu.pig.bags.DistinctBy('1', '2');
-
- data = LOAD 'input' AS (data: bag {T: tuple(a:CHARARRAY, b:map[INT], c:bag{t: tuple(c0:CHARARRAY, c1:INT)})});
-
- data2 = FOREACH data GENERATE DistinctBy(data);
-
- --describe data2;
-
- STORE data2 INTO 'output';
-
- */
- @Multiline
- private String distinctByMultiComplexFieldTest;
-
- @Test
- public void distinctByMultiComplexFieldTest() throws Exception
- {
- PigTest test = createPigTestFromString(distinctByMultiComplexFieldTest);
-
- writeLinesToFile("input",
- "({(a-b,[a#0,b#1],{(a-b,0),(a-b,1)}),(a-c,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[a#1,b#0],{(a-b,1),(a-b,2)})})");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "({(a-b,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[b#0,a#1],{(a-b,1),(a-b,2)})})");
- }
-
- /**
- register $JAR_PATH
-
- define DistinctBy datafu.pig.bags.DistinctBy('1');
-
- data = LOAD 'input' AS (data: bag {T: tuple(a:CHARARRAY, b:CHARARRAY)});
-
- data2 = FOREACH data GENERATE DistinctBy(data);
-
- --describe data2;
-
- STORE data2 INTO 'output';
-
- */
- @Multiline
- private String distinctByDelimTest;
-
- @Test
- public void distinctByDelimTest() throws Exception
- {
- PigTest test = createPigTestFromString(distinctByDelimTest);
-
- writeLinesToFile("input",
- "({(a-b,c),(a-b,d)})");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "({(a-b,c),(a-b,d)})");
- }
-
- @Test
- public void distinctByExecTest() throws Exception
- {
- DistinctBy distinct = new DistinctBy("0");
-
- DataBag bag;
- Tuple input;
- Tuple data;
-
- bag = BagFactory.getInstance().newDefaultBag();
- input = TupleFactory.getInstance().newTuple(bag);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag.add(data);
- data.set(0, 10);
- data.set(1, 20);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag.add(data);
- data.set(0, 11);
- data.set(1, 50);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag.add(data);
- data.set(0, 10);
- data.set(1, 22);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag.add(data);
- data.set(0, 12);
- data.set(1, 40);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag.add(data);
- data.set(0, 11);
- data.set(1, 50);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag.add(data);
- data.set(0, 11);
- data.set(1, 51);
-
- DataBag result = distinct.exec(input);
-
- Assert.assertEquals(3, result.size());
-
- Iterator<Tuple> iter = result.iterator();
- Assert.assertEquals("(10,20)", iter.next().toString());
- Assert.assertEquals("(11,50)", iter.next().toString());
- Assert.assertEquals("(12,40)", iter.next().toString());
-
- // do it again to test cleanup
- bag = BagFactory.getInstance().newDefaultBag();
- input = TupleFactory.getInstance().newTuple(bag);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag.add(data);
- data.set(0, 12);
- data.set(1, 42);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag.add(data);
- data.set(0, 11);
- data.set(1, 51);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag.add(data);
- data.set(0, 11);
- data.set(1, 50);
-
- result = distinct.exec(input);
-
- Assert.assertEquals(2, result.size());
-
- iter = result.iterator();
- Assert.assertEquals("(12,42)", iter.next().toString());
- Assert.assertEquals("(11,51)", iter.next().toString());
- }
-
- @Test
- public void distinctByAccumulateTest() throws Exception
- {
- DistinctBy distinct = new DistinctBy("0");
-
- DataBag bag;
- Tuple input;
- Tuple data;
-
- data = TupleFactory.getInstance().newTuple(2);
- bag = BagFactory.getInstance().newDefaultBag();
- bag.add(data);
- input = TupleFactory.getInstance().newTuple(bag);
- data.set(0, 10);
- data.set(1, 20);
- distinct.accumulate(input);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag = BagFactory.getInstance().newDefaultBag();
- bag.add(data);
- input = TupleFactory.getInstance().newTuple(bag);
- data.set(0, 11);
- data.set(1, 50);
- distinct.accumulate(input);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag = BagFactory.getInstance().newDefaultBag();
- bag.add(data);
- input = TupleFactory.getInstance().newTuple(bag);
- data.set(0, 10);
- data.set(1, 22);
- distinct.accumulate(input);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag = BagFactory.getInstance().newDefaultBag();
- bag.add(data);
- input = TupleFactory.getInstance().newTuple(bag);
- data.set(0, 12);
- data.set(1, 40);
- distinct.accumulate(input);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag = BagFactory.getInstance().newDefaultBag();
- bag.add(data);
- input = TupleFactory.getInstance().newTuple(bag);
- data.set(0, 11);
- data.set(1, 50);
- distinct.accumulate(input);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag = BagFactory.getInstance().newDefaultBag();
- bag.add(data);
- input = TupleFactory.getInstance().newTuple(bag);
- data.set(0, 11);
- data.set(1, 51);
- distinct.accumulate(input);
-
- DataBag result = distinct.getValue();
-
- Assert.assertEquals(3, result.size());
-
- Iterator<Tuple> iter = result.iterator();
- Assert.assertEquals("(10,20)", iter.next().toString());
- Assert.assertEquals("(11,50)", iter.next().toString());
- Assert.assertEquals("(12,40)", iter.next().toString());
-
- // do it again to test cleanup
- distinct.cleanup();
-
- data = TupleFactory.getInstance().newTuple(2);
- bag = BagFactory.getInstance().newDefaultBag();
- bag.add(data);
- input = TupleFactory.getInstance().newTuple(bag);
- data.set(0, 12);
- data.set(1, 42);
- distinct.accumulate(input);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag = BagFactory.getInstance().newDefaultBag();
- bag.add(data);
- input = TupleFactory.getInstance().newTuple(bag);
- data.set(0, 11);
- data.set(1, 51);
- distinct.accumulate(input);
-
- data = TupleFactory.getInstance().newTuple(2);
- bag = BagFactory.getInstance().newDefaultBag();
- bag.add(data);
- input = TupleFactory.getInstance().newTuple(bag);
- data.set(0, 11);
- data.set(1, 50);
- distinct.accumulate(input);
-
- result = distinct.getValue();
-
- Assert.assertEquals(2, result.size());
-
- iter = result.iterator();
- Assert.assertEquals("(12,42)", iter.next().toString());
- Assert.assertEquals("(11,51)", iter.next().toString());
- }
-
- /**
- register $JAR_PATH
-
- define CountEach datafu.pig.bags.CountEach();
-
- data = LOAD 'input' AS (data: bag {T: tuple(v1:chararray)});
-
- data2 = FOREACH data GENERATE CountEach(data) as counted;
- --describe data2;
-
- data3 = FOREACH data2 {
- ordered = ORDER counted BY count DESC;
- GENERATE ordered;
- }
- --describe data3
-
- STORE data3 INTO 'output';
-
- */
- @Multiline
- private String countEachTest;
-
- @Test
- public void countEachTest() throws Exception
- {
- PigTest test = createPigTestFromString(countEachTest);
-
- writeLinesToFile("input",
- "({(A),(B),(A),(C),(A),(B)})");
-
- test.runScript();
-
- assertOutput(test, "data3",
- "({((A),3),((B),2),((C),1)})");
- }
-
- @Test
- public void countEachExecAndAccumulateTest() throws Exception
- {
- for (int c=0; c<2; c++)
- {
- CountEach countEach = new CountEach("flatten");
-
- DataBag bag = BagFactory.getInstance().newDefaultBag();
- {
- Tuple t = TupleFactory.getInstance().newTuple(1);
- t.set(0, "A");
- bag.add(t);
- }
- {
- Tuple t = TupleFactory.getInstance().newTuple(1);
- t.set(0, "B");
- bag.add(t);
- }
- {
- Tuple t = TupleFactory.getInstance().newTuple(1);
- t.set(0, "B");
- bag.add(t);
- }
- {
- Tuple t = TupleFactory.getInstance().newTuple(1);
- t.set(0, "C");
- bag.add(t);
- }
- {
- Tuple t = TupleFactory.getInstance().newTuple(1);
- t.set(0, "A");
- bag.add(t);
- }
- {
- Tuple t = TupleFactory.getInstance().newTuple(1);
- t.set(0, "D");
- bag.add(t);
- }
-
- DataBag output = null;
-
- if (c == 0)
- {
- Tuple input = TupleFactory.getInstance().newTuple(1);
- input.set(0, bag);
-
- System.out.println("Testing exec");
- output = countEach.exec(input);
- }
- else
- {
- System.out.println("Testing accumulate");
- for (Tuple t : bag)
- {
- DataBag tb = BagFactory.getInstance().newDefaultBag();
- tb.add(t);
- Tuple input = TupleFactory.getInstance().newTuple(1);
- input.set(0, tb);
- countEach.accumulate(input);
- }
-
- output = countEach.getValue();
-
- countEach.cleanup();
- Assert.assertEquals(0, countEach.getValue().size());
- }
-
- System.out.println(output.toString());
-
- Assert.assertEquals(4, output.size());
- Set<String> found = new HashSet<String>();
- for (Tuple t : output)
- {
- String key = (String)t.get(0);
- found.add(key);
- if (key == "A")
- {
- Assert.assertEquals(2, t.get(1));
- }
- else if (key == "B")
- {
- Assert.assertEquals(2, t.get(1));
- }
- else if (key == "C")
- {
- Assert.assertEquals(1, t.get(1));
- }
- else if (key == "D")
- {
- Assert.assertEquals(1, t.get(1));
- }
- else
- {
- Assert.fail("Unexpected: " + key);
- }
- }
- Assert.assertEquals(4, found.size());
- }
- }
-
- /**
- register $JAR_PATH
-
- define CountEach datafu.pig.bags.CountEach('flatten');
-
- data = LOAD 'input' AS (data: bag {T: tuple(v1:chararray)});
-
- data2 = FOREACH data GENERATE CountEach(data) as counted;
- --describe data2;
-
- data3 = FOREACH data2 {
- ordered = ORDER counted BY count DESC;
- GENERATE ordered;
- }
- --describe data3
-
- STORE data3 INTO 'output';
-
- */
- @Multiline
- private String countEachFlattenTest;
-
- @Test
- public void countEachFlattenTest() throws Exception
- {
- PigTest test = createPigTestFromString(countEachFlattenTest);
-
- writeLinesToFile("input",
- "({(A),(B),(A),(C),(A),(B)})");
-
- test.runScript();
-
- assertOutput(test, "data3",
- "({(A,3),(B,2),(C,1)})");
- }
-
- /**
- register $JAR_PATH
-
- define BagLeftOuterJoin datafu.pig.bags.BagLeftOuterJoin();
-
- data = LOAD 'input' AS (outer_key:chararray, bag1:bag{T:tuple(k:chararray,v:chararray)}, bag2:bag{T:tuple(k:chararray,v:chararray)}, bag3:bag{T:tuple(k3:chararray,v3:chararray)});
- describe data;
-
- data2 = FOREACH data GENERATE
- outer_key,
- BagLeftOuterJoin(bag1, 'k', bag2, 'k', bag3, 'k3') as joined1,
- BagLeftOuterJoin(bag1, 'k', bag3, 'k3', bag2, 'k') as joined2; --this will break without UDF signature and pig < 0.11
- describe data2;
-
- STORE data2 INTO 'output';
-
- */
- @Multiline
- private String bagLeftOuterJoinTest;
-
- @Test
- public void bagLeftOuterJoinTest() throws Exception
- {
- PigTest test = createPigTestFromString(bagLeftOuterJoinTest);
-
- writeLinesToFile("input",
- "1\t{(K1,A1),(K2,B1),(K3,C1)}\t{(K1,A2),(K2,B2),(K2,B22)}\t{(K1,A3),(K3,C3),(K4,D3)}");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "(1,{(K1,A1,K1,A2,K1,A3),(K2,B1,K2,B2,,),(K2,B1,K2,B22,,),(K3,C1,,,K3,C3)},{(K1,A1,K1,A3,K1,A2),(K2,B1,,,K2,B2),(K2,B1,,,K2,B22),(K3,C1,K3,C3,,)})");
- }
-
- /**
- register $JAR_PATH
-
- define BagUnion datafu.pig.bags.BagConcat();
-
- data = LOAD 'input' AS (input_bag: bag {T: tuple(inner_bag: bag {T2: tuple(k: int, v: chararray)})});
- describe data;
-
- data2 = FOREACH data GENERATE BagUnion(input_bag) as unioned;
- describe data2;
-
- STORE data INTO 'output';
-
- */
- @Multiline
- private String bagUnionTest;
-
- @Test
- public void bagUnionTest() throws Exception
- {
- PigTest test = createPigTestFromString(bagUnionTest);
- writeLinesToFile("input", "({({(1,A),(1,B)}),({(2,A),(2,B),(2,C)}),({(3,A)})}");
- test.runScript();
- assertOutput(test, "data2", "({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})");
- }
-
- /**
- register $JAR_PATH
-
- define BagGroup datafu.pig.bags.BagGroup();
-
- data = LOAD 'input' AS (input_bag: bag {T: tuple(k: int, v: chararray)});
- describe data;
-
- data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.k) as grouped;
- describe data2;
-
- data3 = FOREACH data2 {
- ordered = ORDER grouped BY group;
- GENERATE
- ordered as grouped;
- }
- describe data3;
-
- STORE data INTO 'output';
-
- */
- @Multiline
- private String bagGroupSingleTest;
-
- @Test
- public void bagGroupSingleTest() throws Exception
- {
- PigTest test = createPigTestFromString(bagGroupSingleTest);
- writeLinesToFile("input", "({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})");
- test.runScript();
- getLinesForAlias(test, "data2", true);
- assertOutput(test, "data3", "({(1,{(1,A),(1,B)}),(2,{(2,A),(2,B),(2,C)}),(3,{(3,A)})})");
- }
-
- /**
- register $JAR_PATH
-
- define BagGroup datafu.pig.bags.BagGroup();
-
- data = LOAD 'input' AS (input_bag: bag {T: tuple(k: int, k2: chararray, v: int)});
- describe data;
-
- data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.(k, k2)) as grouped;
- describe data2;
-
- data3 = FOREACH data2 {
- ordered = ORDER grouped BY group;
- GENERATE
- ordered as grouped;
- }
- describe data3;
-
- STORE data INTO 'output';
-
- */
- @Multiline
- private String bagGroupMultipleTest;
-
- @Test
- public void bagGroupMultipleTest() throws Exception
- {
- PigTest test = createPigTestFromString(bagGroupMultipleTest);
- writeLinesToFile("input", "({(1,A,1),(1,B,1),(1,A,2),(2,A,1),(2,B,1),(2,C,1),(3,A,1)})");
- test.runScript();
- getLinesForAlias(test, "data2", true);
- assertOutput(test, "data3", "({((1,A),{(1,A,1),(1,A,2)}),((1,B),{(1,B,1)}),((2,A),{(2,A,1)}),((2,B),{(2,B,1)}),((2,C),{(2,C,1)}),((3,A),{(3,A,1)})})");
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/test/pig/datafu/test/pig/geo/GeoTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/geo/GeoTests.java b/test/pig/datafu/test/pig/geo/GeoTests.java
deleted file mode 100644
index 39c7b61..0000000
--- a/test/pig/datafu/test/pig/geo/GeoTests.java
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.geo;
-
-import static org.testng.Assert.*;
-
-import java.util.List;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import datafu.test.pig.PigTests;
-
-public class GeoTests extends PigTests
-{
- /**
- register $JAR_PATH
-
- define HaversineDistInMiles datafu.pig.geo.HaversineDistInMiles();
-
- data = LOAD 'input' AS (lat1:double,lng1:double,lat2:double,lng2:double);
-
- data2 = FOREACH data GENERATE HaversineDistInMiles(lat1,lng1,lat2,lng2);
-
- STORE data2 INTO 'output';
- */
- @Multiline
- private String haversineTest;
-
- @Test
- public void haversineTest() throws Exception
- {
- PigTest test = createPigTestFromString(haversineTest);
-
- // Approximate latitude and longitude for major cities from maps.google.com
- double[] la = {34.040143,-118.243103};
- double[] tokyo = {35.637209,139.65271};
- double[] ny = {40.716038,-73.99498};
- double[] paris = {48.857713,2.342491};
- double[] sydney = {-33.872696,151.195221};
-
- this.writeLinesToFile("input",
- coords(la,tokyo),
- coords(ny,tokyo),
- coords(ny,sydney),
- coords(ny,paris));
-
- test.runScript();
-
- List<Tuple> distances = this.getLinesForAlias(test, "data2");
-
- // ensure distance is within 20 miles of expected (distances found online)
- assertWithin(5478.0, distances.get(0), 20.0); // la <-> tokyo
- assertWithin(6760.0, distances.get(1), 20.0); // ny <-> tokyo
- assertWithin(9935.0, distances.get(2), 20.0); // ny <-> sydney
- assertWithin(3635.0, distances.get(3), 20.0); // ny <-> paris
-
- }
-
- private void assertWithin(double expected, Tuple actual, double maxDiff) throws Exception
- {
- Double actualVal = (Double)actual.get(0);
- assertTrue(Math.abs(expected-actualVal) < maxDiff);
- }
-
- private String coords(double[] coords1, double[] coords2)
- {
- assertTrue(coords1.length == 2);
- assertTrue(coords2.length == 2);
- return String.format("%f\t%f\t%f\t%f", coords1[0], coords1[1], coords2[0], coords2[1]);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/test/pig/datafu/test/pig/hash/HashTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/hash/HashTests.java b/test/pig/datafu/test/pig/hash/HashTests.java
deleted file mode 100644
index ba19344..0000000
--- a/test/pig/datafu/test/pig/hash/HashTests.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.hash;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import datafu.test.pig.PigTests;
-
-public class HashTests extends PigTests
-{
- /**
- register $JAR_PATH
-
- define MD5 datafu.pig.hash.MD5();
-
- data_in = LOAD 'input' as (val:chararray);
-
- data_out = FOREACH data_in GENERATE MD5(val) as val;
-
- STORE data_out INTO 'output';
- */
- @Multiline private String md5Test;
-
- @Test
- public void md5Test() throws Exception
- {
- PigTest test = createPigTestFromString(md5Test);
-
- writeLinesToFile("input",
- "ladsljkasdglk",
- "lkadsljasgjskdjks",
- "aladlasdgjks");
-
- test.runScript();
-
- assertOutput(test, "data_out",
- "(d9a82575758bb4978949dc0659205cc6)",
- "(9ec37f02fae0d8d6a7f4453a62272f1f)",
- "(cb94139a8b9f3243e68a898ec6bd9b3d)");
- }
-
- /**
- register $JAR_PATH
-
- define SHA256 datafu.pig.hash.SHA('256');
-
- data_in = LOAD 'input' as (val:chararray);
-
- data_out = FOREACH data_in GENERATE SHA256(val) as val;
-
- STORE data_out INTO 'output';
- */
- @Multiline private String sha256Test;
-
- @Test
- public void sha256Test() throws Exception
- {
- PigTest test = createPigTestFromString(sha256Test);
-
- writeLinesToFile("input",
- "ladsljkasdglk",
- "lkadsljasgjskdjks",
- "aladlasdgjks");
-
- test.runScript();
-
- assertOutput(test, "data_out",
- "(70ebaf99c4d8ff8860869e50be2d46afbf150b883f66b50a76ee81cdc802242b)",
- "(f22e3c744a9ade0fa591d28c55392035248b391c9ee4c77ebfeaf6558c8c0dac)",
- "(49420b42e764178830783d4520aea56b759f325c1d1167f5640ded91f33f3e69)");
- }
-
- /**
- register $JAR_PATH
-
- define SHA512 datafu.pig.hash.SHA('512');
-
- data_in = LOAD 'input' as (val:chararray);
-
- data_out = FOREACH data_in GENERATE SHA512(val) as val;
-
- STORE data_out INTO 'output';
- */
- @Multiline private String sha512Test;
-
- @Test
- public void sha512Test() throws Exception
- {
- PigTest test = createPigTestFromString(sha512Test);
-
- writeLinesToFile("input",
- "ladsljkasdglk",
- "lkadsljasgjskdjks",
- "aladlasdgjks");
-
- test.runScript();
-
- assertOutput(test, "data_out",
- "(f681dbd89dfc9edf00f68107ed81b4b7c89abdf84337921785d13d9189937a43decbc264b5013d396a102b18564c39595732c43d6d4cc99473f6d6d7101ecf87)",
- "(85c130c8636c052e52a2ca091a92d0bb98ee361adcbeeebbd6af978a593b2486a22ac1e7352c683035cfa28de8eee3402adc6760ad54c5c7eda122c5124766bd)",
- "(3b82af5c08c9ab70523abf56db244eaa6740fa8c356e3a41bb5225560c0949b14b417c8d56e72cc26d5682400e0420a556e692c41ea82855013e8b7bae5fb0fb)");
- }
-
- /**
- register $JAR_PATH
-
- define MD5 datafu.pig.hash.MD5('base64');
-
- data_in = LOAD 'input' as (val:chararray);
-
- data_out = FOREACH data_in GENERATE MD5(val) as val;
-
- STORE data_out INTO 'output';
- */
- @Multiline private String md5Base64Test;
-
- @Test
- public void md5Base64Test() throws Exception
- {
- PigTest test = createPigTestFromString(md5Base64Test);
-
- writeLinesToFile("input",
- "ladsljkasdglk",
- "lkadsljasgjskdjks",
- "aladlasdgjks");
-
- test.runScript();
-
- assertOutput(test, "data_out",
- "(2agldXWLtJeJSdwGWSBcxg==)",
- "(nsN/Avrg2Nan9EU6YicvHw==)",
- "(y5QTmoufMkPmiomOxr2bPQ==)");
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/test/pig/datafu/test/pig/linkanalysis/PageRankImplTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/linkanalysis/PageRankImplTests.java b/test/pig/datafu/test/pig/linkanalysis/PageRankImplTests.java
deleted file mode 100644
index 01d540f..0000000
--- a/test/pig/datafu/test/pig/linkanalysis/PageRankImplTests.java
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.linkanalysis;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Map;
-
-import org.testng.annotations.Test;
-
-public class PageRankImplTests
-{
- @Test
- public void wikipediaGraphInMemoryTest() throws Exception {
- System.out.println();
- System.out.println("Starting wikipediaGraphInMemoryTest");
-
- datafu.pig.linkanalysis.PageRankImpl graph = new datafu.pig.linkanalysis.PageRankImpl();
-
- String[] edges = getWikiExampleEdges();
-
- Map<String,Integer> nodeIdsMap = loadGraphFromEdgeList(graph, edges);
-
- // Without dangling node handling we will not get the true page rank since the total rank will
- // not add to 1.0. Without dangling node handling some of the page rank drains out of the graph.
- graph.enableDanglingNodeHandling();
-
- performIterations(graph, 150, 1e-18f);
-
- String[] expectedRanks = getWikiExampleExpectedRanks();
-
- Map<String,Float> expectedRanksMap = parseExpectedRanks(expectedRanks);
-
- validateExpectedRanks(graph, nodeIdsMap, expectedRanksMap);
- }
-
- @Test
- public void wikipediaGraphDiskCacheTest() throws Exception {
- System.out.println();
- System.out.println("Starting wikipediaGraphDiskCacheTest");
-
- datafu.pig.linkanalysis.PageRankImpl graph = new datafu.pig.linkanalysis.PageRankImpl();
-
- String[] edges = getWikiExampleEdges();
-
- graph.enableEdgeDiskCaching();
- graph.setEdgeCachingThreshold(5);
-
- Map<String,Integer> nodeIdsMap = loadGraphFromEdgeList(graph, edges);
-
- assert graph.isUsingEdgeDiskCache() : "Expected disk cache to be used";
-
- // Without dangling node handling we will not get the true page rank since the total rank will
- // not add to 1.0. Without dangling node handling some of the page rank drains out of the graph.
- graph.enableDanglingNodeHandling();
-
- performIterations(graph, 150, 1e-18f);
-
- String[] expectedRanks = getWikiExampleExpectedRanks();
-
- Map<String,Float> expectedRanksMap = parseExpectedRanks(expectedRanks);
-
- validateExpectedRanks(graph, nodeIdsMap, expectedRanksMap);
- }
-
- @Test(groups="perf")
- public void hubAndSpokeInMemoryTest() throws Exception {
- System.out.println();
- System.out.println("Starting hubAndSpokeInMemoryTest");
-
- datafu.pig.linkanalysis.PageRankImpl graph = new datafu.pig.linkanalysis.PageRankImpl();
-
- String[] edges = getHubAndSpokeEdges();
-
- Map<String,Integer> nodeIdsMap = loadGraphFromEdgeList(graph, edges);
-
- graph.enableDanglingNodeHandling();
-
- performIterations(graph, 150, 1e-18f);
-
- // no need to validate, this is just a perf test for runtime comparison
- }
-
- @Test(groups="perf")
- public void hubAndSpokeDiskCacheTest() throws Exception {
- System.out.println();
- System.out.println("Starting hubAndSpokeDiskCacheTest");
-
- datafu.pig.linkanalysis.PageRankImpl graph = new datafu.pig.linkanalysis.PageRankImpl();
-
- String[] edges = getHubAndSpokeEdges();
-
- graph.enableEdgeDiskCaching();
- graph.setEdgeCachingThreshold(5);
-
- Map<String,Integer> nodeIdsMap = loadGraphFromEdgeList(graph, edges);
-
- graph.enableDanglingNodeHandling();
-
- performIterations(graph, 150, 1e-18f);
-
- // no need to validate, this is just a perf test for runtime comparison
- }
-
- private String[] getHubAndSpokeEdges()
- {
- int count = 50000;
- String[] edges = new String[count];
-
- for (int i=0; i<count; i++)
- {
- edges[i] = String.format("S%d H", i);
- }
- return edges;
- }
-
- public static String[] getWikiExampleEdges()
- {
- // graph taken from:
- // http://en.wikipedia.org/wiki/PageRank
- String[] edges = {
- "B C",
- "C B",
- "D A",
- "D B",
- "E D",
- "E B",
- "E F",
- "F E",
- "F B",
- "P1 B",
- "P1 E",
- "P2 B",
- "P2 E",
- "P3 B",
- "P3 E",
- "P4 E",
- "P5 E"
- };
- return edges;
- }
-
- public static String[] getWikiExampleExpectedRanks()
- {
- // these ranks come from the Wikipedia page:
- // http://en.wikipedia.org/wiki/PageRank
- String[] expectedRanks = {
- "A 3.3",
- "B 38.4",
- "C 34.3",
- "D 3.9",
- "E 8.1",
- "F 3.9",
- "P1 1.6",
- "P2 1.6",
- "P3 1.6",
- "P4 1.6",
- "P5 1.6"
- };
- return expectedRanks;
- }
-
- private Map<String,Integer> loadGraphFromEdgeList(datafu.pig.linkanalysis.PageRankImpl graph, String[] edges) throws IOException
- {
- Map<Integer,ArrayList<Map<String,Object>>> nodeEdgesMap = new HashMap<Integer,ArrayList<Map<String,Object>>>();
- Map<String,Integer> nodeIdsMap = new HashMap<String,Integer>();
-
- for (String edge : edges)
- {
- String[] parts = edge.split(" ");
- assert parts.length == 2 : "Expected two parts";
-
- int sourceId = getOrCreateId(parts[0], nodeIdsMap);
- int destId = getOrCreateId(parts[1], nodeIdsMap);
-
- Map<String,Object> edgeMap = new HashMap<String,Object>();
- edgeMap.put("weight", 1.0);
- edgeMap.put("dest", destId);
-
- ArrayList<Map<String,Object>> nodeEdges = null;
-
- if (nodeEdgesMap.containsKey(sourceId))
- {
- nodeEdges = nodeEdgesMap.get(sourceId);
- }
- else
- {
- nodeEdges = new ArrayList<Map<String,Object>>();
- nodeEdgesMap.put(sourceId, nodeEdges);
- }
-
- nodeEdges.add(edgeMap);
- }
-
- for (Map.Entry<Integer, ArrayList<Map<String,Object>>> e : nodeEdgesMap.entrySet())
- {
- graph.addNode(e.getKey(), e.getValue());
- }
-
- return nodeIdsMap;
- }
-
- private void performIterations(datafu.pig.linkanalysis.PageRankImpl graph, int maxIters, float tolerance) throws IOException
- {
- System.out.println(String.format("Beginning iteration (maxIters = %d, tolerance=%e)", maxIters, tolerance));
-
- System.out.println("Initializing graph");
- long startTime = System.nanoTime();
- graph.init();
- System.out.println(String.format("Done, took %f ms", (System.nanoTime() - startTime)/10.0e6));
-
- float totalDiff;
- int iter = 0;
-
- System.out.println("Beginning iterations");
- startTime = System.nanoTime();
- do
- {
- totalDiff = graph.nextIteration();
- iter++;
- } while(iter < maxIters && totalDiff > tolerance);
- System.out.println(String.format("Done, took %f ms", (System.nanoTime() - startTime)/10.0e6));
- }
-
- private void validateExpectedRanks(datafu.pig.linkanalysis.PageRankImpl graph, Map<String,Integer> nodeIds, Map<String,Float> expectedRanks)
- {
- System.out.println("Validating page rank results");
-
- for (Map.Entry<String,Integer> e : nodeIds.entrySet())
- {
- float rank = graph.getNodeRank(e.getValue());
-
- float expectedRank = expectedRanks.get(e.getKey());
- // require 0.1% accuracy
- assert (Math.abs(expectedRank - rank*100.0f) < 0.1) : String.format("Did not get expected rank for %s", e.getKey());
- }
-
- System.out.println("All ranks match expected");
- }
-
- public static Map<String,Float> parseExpectedRanks(String[] expectedRanks)
- {
- Map<String,Float> expectedRanksMap = new HashMap<String,Float>();
- for (String expectedRankString : expectedRanks)
- {
- String[] parts = expectedRankString.split(" ");
- assert parts.length == 2 : "Expected two parts";
- String name = parts[0];
- Float expectedRank = Float.parseFloat(parts[1]);
- expectedRanksMap.put(name, expectedRank);
- }
- return expectedRanksMap;
- }
-
- private Integer getOrCreateId(String name, Map<String,Integer> nodeIds)
- {
- if (nodeIds.containsKey(name))
- {
- return nodeIds.get(name);
- }
- else
- {
- Integer id = nodeIds.size();
- nodeIds.put(name, id);
- return id;
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/test/pig/datafu/test/pig/linkanalysis/PageRankTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/linkanalysis/PageRankTests.java b/test/pig/datafu/test/pig/linkanalysis/PageRankTests.java
deleted file mode 100644
index 371d2cc..0000000
--- a/test/pig/datafu/test/pig/linkanalysis/PageRankTests.java
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.linkanalysis;
-
-
-import static org.testng.Assert.*;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileWriter;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-
-import datafu.test.pig.linkanalysis.PageRankImplTests;
-import datafu.test.pig.PigTests;
-
-public class PageRankTests extends PigTests
-{
- /**
- register $JAR_PATH
-
- -- Need to enable dangling node handling since the Wikipedia example has them,
- -- otherwise the ranks won't be right.
- define PageRank datafu.pig.linkanalysis.PageRank('dangling_nodes','true');
-
- data = LOAD 'input' AS (topic:INT,source:INT,dest:INT,weight:DOUBLE);
-
- data_grouped = GROUP data by (topic,source);
-
- data_grouped = foreach data_grouped {
- generate group.topic as topic, group.source as source, data.(dest,weight) as edges;
- };
-
- data_grouped2 = GROUP data_grouped by topic;
- data_grouped2 = foreach data_grouped2 {
- generate group as topic, FLATTEN(PageRank(data_grouped.(source,edges))) as (source,rnk);
- };
-
- data_grouped3 = FOREACH data_grouped2 GENERATE
- topic,
- source,
- rnk;
-
- STORE data_grouped3 INTO 'output';
-
-
- */
- @Multiline private String pageRankTest;
-
- @Test
- public void pigPageRankTest() throws Exception
- {
- PigTest test = createPigTestFromString(pageRankTest);
-
- String[] edges = PageRankImplTests.getWikiExampleEdges();
-
- Map<String,Integer> nodeIds = new HashMap<String,Integer>();
- Map<Integer,String> nodeIdsReversed = new HashMap<Integer,String>();
- Map<String,Float> expectedRanks = PageRankImplTests.parseExpectedRanks(PageRankImplTests.getWikiExampleExpectedRanks());
-
- File f = new File(System.getProperty("user.dir"), "input").getAbsoluteFile();
- if (f.exists())
- {
- f.delete();
- }
-
- FileWriter writer = new FileWriter(f);
- BufferedWriter bufferedWriter = new BufferedWriter(writer);
-
- for (String edge : edges)
- {
- String[] edgeParts = edge.split(" ");
- String source = edgeParts[0];
- String dest = edgeParts[1];
- if (!nodeIds.containsKey(source))
- {
- int id = nodeIds.size();
- nodeIds.put(source,id);
- nodeIdsReversed.put(id, source);
- }
- if (!nodeIds.containsKey(dest))
- {
- int id = nodeIds.size();
- nodeIds.put(dest,id);
- nodeIdsReversed.put(id, dest);
- }
- Integer sourceId = nodeIds.get(source);
- Integer destId = nodeIds.get(dest);
-
- StringBuffer sb = new StringBuffer();
-
- sb.append("1\t"); // topic
- sb.append(sourceId.toString() + "\t");
- sb.append(destId.toString() + "\t");
- sb.append("1.0\n"); // weight
-
- bufferedWriter.write(sb.toString());
- }
-
- bufferedWriter.close();
-
- test.runScript();
- Iterator<Tuple> tuples = test.getAlias("data_grouped3");
-
- System.out.println("Final node ranks:");
- int nodeCount = 0;
- while (tuples.hasNext())
- {
- Tuple nodeTuple = tuples.next();
-
- Integer topic = (Integer)nodeTuple.get(0);
- Integer nodeId = (Integer)nodeTuple.get(1);
- Float nodeRank = (Float)nodeTuple.get(2);
-
- assertEquals(1, topic.intValue());
-
- System.out.println(String.format("%d => %f", nodeId, nodeRank));
-
- Float expectedNodeRank = expectedRanks.get(nodeIdsReversed.get(nodeId));
-
- assertTrue(Math.abs(expectedNodeRank - nodeRank * 100.0f) < 0.1,
- String.format("expected: %f, actual: %f", expectedNodeRank, nodeRank));
-
- nodeCount++;
- }
-
- assertEquals(nodeIds.size(),nodeCount);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/test/pig/datafu/test/pig/random/NumberTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/random/NumberTests.java b/test/pig/datafu/test/pig/random/NumberTests.java
deleted file mode 100644
index 867c27c..0000000
--- a/test/pig/datafu/test/pig/random/NumberTests.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.random;
-
-import static org.testng.Assert.*;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import datafu.test.pig.PigTests;
-
-public class NumberTests extends PigTests
-{
- /**
- register $JAR_PATH
-
- define RandInt datafu.pig.random.RandInt();
-
- data = LOAD 'input' AS (key:INT);
- data2 = FOREACH data GENERATE key, RandInt($MIN,$MAX) as val;
-
- STORE data2 INTO 'output';
-
- */
- @Multiline private String randomIntRangeTest;
-
- /**
- * Test the RandomIntRange UDF. The main purpose is to make sure it can be used in a Pig script.
- * Also the range of output values is tested.
- *
- * @throws Exception
- */
- @Test
- public void randomIntRangeTest() throws Exception
- {
- PigTest test = createPigTestFromString(randomIntRangeTest,
- "MIN=1", "MAX=10");
-
- List<String> input = new ArrayList<String>();
- for (int i=0; i<100; i++)
- {
- input.add(String.format("(%d)", i));
- }
-
- writeLinesToFile("input",
- input.toArray(new String[0]));
-
- test.runScript();
-
- List<Tuple> tuples = getLinesForAlias(test, "data2", false);
- for (Tuple tuple : tuples)
- {
- Integer randValue = (Integer)tuple.get(1);
- assertTrue(randValue >= 1);
- assertTrue(randValue <= 10);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/test/pig/datafu/test/pig/random/UUIDTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/random/UUIDTests.java b/test/pig/datafu/test/pig/random/UUIDTests.java
deleted file mode 100644
index e199760..0000000
--- a/test/pig/datafu/test/pig/random/UUIDTests.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package datafu.test.pig.random;
-
-import datafu.test.pig.PigTests;
-import junit.framework.Assert;
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import java.util.*;
-
-import static org.testng.Assert.assertTrue;
-
-public class UUIDTests extends PigTests
-{
- /**
- register $JAR_PATH
-
- define RandomUUID datafu.pig.random.RandomUUID();
-
- data = LOAD 'input' AS (key: chararray);
- DUMP data
-
- data2 = FOREACH data GENERATE key, RandomUUID() as val;
- DUMP data2
-
- STORE data2 INTO 'output';
- */
- @Multiline private String randomUUIDTest;
-
- /**
- * Test the RandomUUID UDF. The main purpose is to make sure it can be used in a Pig script.
- * Also the range of length of values is tested.
- *
- * @throws Exception
- */
- @Test
- public void randomUUIDTest() throws Exception
- {
- PigTest test = createPigTestFromString(randomUUIDTest);
-
- writeLinesToFile("input",
- "input1",
- "input2",
- "input3");
-
- List<Tuple> tuples = getLinesForAlias(test, "data2", true);
- Set<UUID> set = new HashSet<UUID>();
- for (Tuple tuple : tuples)
- {
- set.add(UUID.fromString((String)tuple.get(1)));
- }
- Assert.assertEquals(set.size(), 3);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/778bef1e/test/pig/datafu/test/pig/sampling/SamplingTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/sampling/SamplingTests.java b/test/pig/datafu/test/pig/sampling/SamplingTests.java
deleted file mode 100644
index 9209133..0000000
--- a/test/pig/datafu/test/pig/sampling/SamplingTests.java
+++ /dev/null
@@ -1,523 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package datafu.test.pig.sampling;
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import junit.framework.Assert;
-
-import org.adrianwalker.multilinestring.Multiline;
-import org.apache.pig.backend.executionengine.ExecException;
-import org.apache.pig.data.BagFactory;
-import org.apache.pig.data.DataBag;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.data.TupleFactory;
-import org.apache.pig.pigunit.PigTest;
-import org.testng.annotations.Test;
-
-import datafu.pig.sampling.ReservoirSample;
-import datafu.pig.sampling.SampleByKey;
-import datafu.pig.sampling.WeightedSample;
-import datafu.test.pig.PigTests;
-
-
-public class SamplingTests extends PigTests
-{
- /**
- register $JAR_PATH
-
- define WeightedSample datafu.pig.sampling.WeightedSample('1');
-
- data = LOAD 'input' AS (A: bag {T: tuple(v1:chararray,v2:INT)});
-
- data2 = FOREACH data GENERATE WeightedSample(A,1);
- --describe data2;
-
- STORE data2 INTO 'output';
-
- */
- @Multiline
- private String weightedSampleTest;
-
- @Test
- public void weightedSampleTest() throws Exception
- {
- PigTest test = createPigTestFromString(weightedSampleTest);
-
- writeLinesToFile("input",
- "({(a, 100),(b, 1),(c, 5),(d, 2)})");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "({(a,100),(c,5),(b,1),(d,2)})");
- }
-
- /**
- register $JAR_PATH
-
- define WeightedSample datafu.pig.sampling.WeightedSample('1');
-
- data = LOAD 'input' AS (A: bag {T: tuple(v1:chararray,v2:INT)});
-
- data2 = FOREACH data GENERATE WeightedSample(A,1,3);
- --describe data2;
-
- STORE data2 INTO 'output';
- */
- @Multiline
- private String weightedSampleLimitTest;
-
- @Test
- public void weightedSampleLimitTest() throws Exception
- {
- PigTest test = createPigTestFromString(weightedSampleLimitTest);
-
- writeLinesToFile("input",
- "({(a, 100),(b, 1),(c, 5),(d, 2)})");
-
- test.runScript();
-
- assertOutput(test, "data2",
- "({(a,100),(c,5),(b,1)})");
- }
-
- @Test
- public void weightedSampleLimitExecTest() throws IOException
- {
- WeightedSample sampler = new WeightedSample();
-
- DataBag bag = BagFactory.getInstance().newDefaultBag();
- for (int i=0; i<100; i++)
- {
- Tuple t = TupleFactory.getInstance().newTuple(2);
- t.set(0, i);
- t.set(1, 1); // score is equal for all
- bag.add(t);
- }
-
- Tuple input = TupleFactory.getInstance().newTuple(3);
- input.set(0, bag);
- input.set(1, 1); // use index 1 for score
- input.set(2, 10); // get 10 items
-
- DataBag result = sampler.exec(input);
-
- Assert.assertEquals(10, result.size());
-
- // all must be found, no repeats
- Set<Integer> found = new HashSet<Integer>();
- for (Tuple t : result)
- {
- Integer i = (Integer)t.get(0);
- System.out.println(i);
- Assert.assertTrue(i>=0 && i<100);
- Assert.assertFalse(String.format("Found duplicate of %d",i), found.contains(i));
- found.add(i);
- }
- }
-
- /**
- register $JAR_PATH
-
- DEFINE SampleByKey datafu.pig.sampling.SampleByKey('0.5', 'salt2.5');
-
- data = LOAD 'input' AS (A_id:chararray, B_id:chararray, C:int);
- sampled = FILTER data BY SampleByKey(A_id);
-
- STORE sampled INTO 'output';
-
- */
- @Multiline
- private String sampleByKeyTest;
-
- @Test
- public void sampleByKeyTest() throws Exception
- {
- PigTest test = createPigTestFromString(sampleByKeyTest);
-
- writeLinesToFile("input",
- "A1\tB1\t1","A1\tB1\t4","A1\tB3\t4","A1\tB4\t4",
- "A2\tB1\t4","A2\tB2\t4",
- "A3\tB1\t3","A3\tB1\t1","A3\tB3\t77",
- "A4\tB1\t3","A4\tB2\t3","A4\tB3\t59","A4\tB4\t29",
- "A5\tB1\t4",
- "A6\tB2\t3","A6\tB2\t55","A6\tB3\t1",
- "A7\tB1\t39","A7\tB2\t27","A7\tB3\t85",
- "A8\tB1\t4","A8\tB2\t45",
- "A9\tB3\t92", "A9\tB1\t42","A9\tB2\t1","A9\tB3\t0",
- "A10\tB1\t7","A10\tB2\t23","A10\tB3\t1","A10\tB4\t41","A10\tB5\t52");
-
- test.runScript();
- assertOutput(test, "sampled",
- "(A4,B1,3)","(A4,B2,3)","(A4,B3,59)","(A4,B4,29)",
- "(A5,B1,4)",
- "(A6,B2,3)","(A6,B2,55)","(A6,B3,1)",
- "(A7,B1,39)","(A7,B2,27)","(A7,B3,85)",
- "(A10,B1,7)","(A10,B2,23)","(A10,B3,1)","(A10,B4,41)","(A10,B5,52)");
- }
-
- /**
- register $JAR_PATH
-
- DEFINE SampleByKey datafu.pig.sampling.SampleByKey('0.5', 'salt2.5');
-
- data = LOAD 'input' AS (A_id:chararray, B_id:chararray, C:int);
- sampled = FILTER data BY SampleByKey(A_id, B_id);
-
- STORE sampled INTO 'output';
-
- */
- @Multiline
- private String sampleByKeyMultipleKeyTest;
-
- @Test
- public void sampleByKeyMultipleKeyTest() throws Exception
- {
- PigTest test = createPigTestFromString(sampleByKeyMultipleKeyTest);
-
- writeLinesToFile("input",
- "A1\tB1\t1","A1\tB1\t4",
- "A1\tB3\t4",
- "A1\tB4\t4",
- "A2\tB1\t4",
- "A2\tB2\t4",
- "A3\tB1\t3","A3\tB1\t1",
- "A3\tB3\t77",
- "A4\tB1\t3",
- "A4\tB2\t3",
- "A4\tB3\t59",
- "A4\tB4\t29",
- "A5\tB1\t4",
- "A6\tB2\t3","A6\tB2\t55",
- "A6\tB3\t1",
- "A7\tB1\t39",
- "A7\tB2\t27",
- "A7\tB3\t85",
- "A8\tB1\t4",
- "A8\tB2\t45",
- "A9\tB3\t92","A9\tB3\t0",
- "A9\tB6\t42","A9\tB5\t1",
- "A10\tB1\t7",
- "A10\tB2\t23","A10\tB2\t1","A10\tB2\t31",
- "A10\tB6\t41",
- "A10\tB7\t52");
- test.runScript();
- assertOutput(test, "sampled",
- "(A1,B1,1)","(A1,B1,4)",
- "(A1,B4,4)",
- "(A2,B1,4)",
- "(A2,B2,4)",
- "(A3,B1,3)","(A3,B1,1)",
- "(A4,B4,29)",
- "(A5,B1,4)",
- "(A6,B3,1)",
- "(A7,B1,39)",
- "(A8,B1,4)",
- "(A9,B3,92)","(A9,B3,0)",
- "(A10,B2,23)","(A10,B2,1)","(A10,B2,31)"
- );
-
- }
-
- @Test
- public void sampleByKeyExecTest() throws Exception
- {
- SampleByKey sampler = new SampleByKey("0.10", "thesalt");
-
- Map<Integer,Integer> valuesPerKey = new HashMap<Integer,Integer>();
-
- // 10,000 keys total
- for (int i=0; i<10000; i++)
- {
- // 5 values per key
- for (int j=0; j<5; j++)
- {
- Tuple t = TupleFactory.getInstance().newTuple(1);
- t.set(0, i);
- if (sampler.exec(t))
- {
- if (valuesPerKey.containsKey(i))
- {
- valuesPerKey.put(i, valuesPerKey.get(i)+1);
- }
- else
- {
- valuesPerKey.put(i, 1);
- }
- }
- }
- }
-
- // 10% sample, so should have roughly 1000 keys
- Assert.assertTrue(Math.abs(1000-valuesPerKey.size()) < 50);
-
- // every value should be present for the same key
- for (Map.Entry<Integer, Integer> pair : valuesPerKey.entrySet())
- {
- Assert.assertEquals(5, pair.getValue().intValue());
- }
- }
-
- /**
- register $JAR_PATH
-
- DEFINE ReservoirSample datafu.pig.sampling.ReservoirSample('$RESERVOIR_SIZE');
-
- data = LOAD 'input' AS (A_id:chararray, B_id:chararray, C:int);
- sampled = FOREACH (GROUP data ALL) GENERATE ReservoirSample(data) as sample_data;
- sampled = FOREACH sampled GENERATE COUNT(sample_data) AS sample_count;
- STORE sampled INTO 'output';
-
- */
- @Multiline
- private String reservoirSampleTest;
-
- @Test
- public void reservoirSampleTest() throws Exception
- {
-
- writeLinesToFile("input",
- "A1\tB1\t1",
- "A1\tB1\t4",
- "A1\tB3\t4",
- "A1\tB4\t4",
- "A2\tB1\t4",
- "A2\tB2\t4",
- "A3\tB1\t3",
- "A3\tB1\t1",
- "A3\tB3\t77",
- "A4\tB1\t3",
- "A4\tB2\t3",
- "A4\tB3\t59",
- "A4\tB4\t29",
- "A5\tB1\t4",
- "A6\tB2\t3",
- "A6\tB2\t55",
- "A6\tB3\t1",
- "A7\tB1\t39",
- "A7\tB2\t27",
- "A7\tB3\t85",
- "A8\tB1\t4",
- "A8\tB2\t45",
- "A9\tB3\t92",
- "A9\tB3\t0",
- "A9\tB6\t42",
- "A9\tB5\t1",
- "A10\tB1\t7",
- "A10\tB2\t23",
- "A10\tB2\t1",
- "A10\tB2\t31",
- "A10\tB6\t41",
- "A10\tB7\t52");
-
- for(int i=10; i<=30; i=i+10){
- int reservoirSize = i ;
- PigTest test = createPigTestFromString(reservoirSampleTest, "RESERVOIR_SIZE="+reservoirSize);
- test.runScript();
- assertOutput(test, "sampled", "("+reservoirSize+")");
- }
- }
-
- /**
- register $JAR_PATH
-
- DEFINE ReservoirSample datafu.pig.sampling.ReservoirSample('$RESERVOIR_SIZE');
- DEFINE Assert datafu.pig.util.Assert();
-
- data = LOAD 'input' AS (A_id:int, B_id:chararray, C:int);
- sampled = FOREACH (GROUP data BY A_id) GENERATE group as A_id, ReservoirSample(data.(B_id,C)) as sample_data;
- sampled = FILTER sampled BY Assert((SIZE(sample_data) <= $RESERVOIR_SIZE ? 1 : 0), 'must be <= $RESERVOIR_SIZE');
- sampled = FOREACH sampled GENERATE A_id, FLATTEN(sample_data);
- STORE sampled INTO 'output';
-
- */
- @Multiline
- private String reservoirSampleGroupTest;
-
- /**
- * Verifies that ReservoirSample works when data grouped by a key.
- * In particular it ensures that the reservoir is not reused across keys.
- *
- * <p>
- * This confirms the fix for DATAFU-11.
- * </p>
- *
- * @throws Exception
- */
- @Test
- public void reservoirSampleGroupTest() throws Exception
- {
- // first value is the key. last value matches the key so we can
- // verify the register is reset for each key. values should not
- // bleed across to other keys.
- writeLinesToFile("input",
- "1\tB1\t1",
- "1\tB1\t1",
- "1\tB3\t1",
- "1\tB4\t1",
- "2\tB1\t2",
- "2\tB2\t2",
- "3\tB1\t3",
- "3\tB1\t3",
- "3\tB3\t3",
- "4\tB1\t4",
- "4\tB2\t4",
- "4\tB3\t4",
- "4\tB4\t4",
- "5\tB1\t5",
- "6\tB2\t6",
- "6\tB2\t6",
- "6\tB3\t6",
- "7\tB1\t7",
- "7\tB2\t7",
- "7\tB3\t7",
- "8\tB1\t8",
- "8\tB2\t8",
- "9\tB3\t9",
- "9\tB3\t9",
- "9\tB6\t9",
- "9\tB5\t9",
- "10\tB1\t10",
- "10\tB2\t10",
- "10\tB2\t10",
- "10\tB2\t10",
- "10\tB6\t10",
- "10\tB7\t10");
-
- for(int i=1; i<=3; i++) {
- int reservoirSize = i ;
- PigTest test = createPigTestFromString(reservoirSampleGroupTest, "RESERVOIR_SIZE="+reservoirSize);
- test.runScript();
-
- List<Tuple> tuples = getLinesForAlias(test, "sampled");
-
- for (Tuple tuple : tuples)
- {
- Assert.assertEquals(((Number)tuple.get(0)).intValue(), ((Number)tuple.get(2)).intValue());
- }
- }
- }
-
- @Test
- public void reservoirSampleExecTest() throws IOException
- {
- ReservoirSample sampler = new ReservoirSample("10");
-
- DataBag bag = BagFactory.getInstance().newDefaultBag();
- for (int i=0; i<100; i++)
- {
- Tuple t = TupleFactory.getInstance().newTuple(1);
- t.set(0, i);
- bag.add(t);
- }
-
- Tuple input = TupleFactory.getInstance().newTuple(bag);
-
- DataBag result = sampler.exec(input);
-
- Assert.assertEquals(10, result.size());
-
- // all must be found, no repeats
- Set<Integer> found = new HashSet<Integer>();
- for (Tuple t : result)
- {
- Integer i = (Integer)t.get(0);
- System.out.println(i);
- Assert.assertTrue(i>=0 && i<100);
- Assert.assertFalse(String.format("Found duplicate of %d",i), found.contains(i));
- found.add(i);
- }
- }
-
- @Test
- public void reservoirSampleAccumulateTest() throws IOException
- {
- ReservoirSample sampler = new ReservoirSample("10");
-
- for (int i=0; i<100; i++)
- {
- Tuple t = TupleFactory.getInstance().newTuple(1);
- t.set(0, i);
- DataBag bag = BagFactory.getInstance().newDefaultBag();
- bag.add(t);
- Tuple input = TupleFactory.getInstance().newTuple(bag);
- sampler.accumulate(input);
- }
-
- DataBag result = sampler.getValue();
-
- Assert.assertEquals(10, result.size());
-
- // all must be found, no repeats
- Set<Integer> found = new HashSet<Integer>();
- for (Tuple t : result)
- {
- Integer i = (Integer)t.get(0);
- System.out.println(i);
- Assert.assertTrue(i>=0 && i<100);
- Assert.assertFalse(String.format("Found duplicate of %d",i), found.contains(i));
- found.add(i);
- }
- }
-
- @Test
- public void reservoirSampleAlgebraicTest() throws IOException
- {
- ReservoirSample.Initial initialSampler = new ReservoirSample.Initial("10");
- ReservoirSample.Intermediate intermediateSampler = new ReservoirSample.Intermediate("10");
- ReservoirSample.Final finalSampler = new ReservoirSample.Final("10");
-
- DataBag bag = BagFactory.getInstance().newDefaultBag();
- for (int i=0; i<100; i++)
- {
- Tuple t = TupleFactory.getInstance().newTuple(1);
- t.set(0, i);
- bag.add(t);
- }
-
- Tuple input = TupleFactory.getInstance().newTuple(bag);
-
- Tuple intermediateTuple = initialSampler.exec(input);
- DataBag intermediateBag = BagFactory.getInstance().newDefaultBag(Arrays.asList(intermediateTuple));
- intermediateTuple = intermediateSampler.exec(TupleFactory.getInstance().newTuple(intermediateBag));
- intermediateBag = BagFactory.getInstance().newDefaultBag(Arrays.asList(intermediateTuple));
- DataBag result = finalSampler.exec(TupleFactory.getInstance().newTuple(intermediateBag));
-
- Assert.assertEquals(10, result.size());
-
- // all must be found, no repeats
- Set<Integer> found = new HashSet<Integer>();
- for (Tuple t : result)
- {
- Integer i = (Integer)t.get(0);
- System.out.println(i);
- Assert.assertTrue(i>=0 && i<100);
- Assert.assertFalse(String.format("Found duplicate of %d",i), found.contains(i));
- found.add(i);
- }
- }
-}
|