datafu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mha...@apache.org
Subject [1/2] datafu git commit: DATAFU-130 Add three-way left outer join macro
Date Mon, 07 Jan 2019 15:03:10 GMT
Repository: datafu
Updated Branches:
  refs/heads/master f7ec4c7ad -> 0d0842719


DATAFU-130 Add three-way left outer join macro

Signed-off-by: matthew.hayes <mhayes@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/datafu/commit/e81ea866
Tree: http://git-wip-us.apache.org/repos/asf/datafu/tree/e81ea866
Diff: http://git-wip-us.apache.org/repos/asf/datafu/diff/e81ea866

Branch: refs/heads/master
Commit: e81ea8666467fb89ceba37465e5905b9674633f9
Parents: 857cf16
Author: Eyal Allweil <eyal@apache.org>
Authored: Wed Nov 28 11:50:22 2018 +0200
Committer: matthew.hayes <mhayes@apache.org>
Committed: Thu Jan 3 09:29:14 2019 -0800

----------------------------------------------------------------------
 .../src/main/resources/datafu/count_macros.pig  | 11 +++++
 .../src/main/resources/datafu/diff_macros.pig   |  8 ++++
 .../main/resources/datafu/left_outer_join.pig   | 39 ++++++++++++++++++
 .../main/resources/datafu/sample_by_keys.pig    | 13 +++---
 .../src/test/java/datafu/test/pig/PigTests.java |  2 +-
 .../java/datafu/test/pig/macros/MacroTests.java | 42 ++++++++++++++++++--
 6 files changed, 103 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/main/resources/datafu/count_macros.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/count_macros.pig b/datafu-pig/src/main/resources/datafu/count_macros.pig
index 9bebce4..4b5280b 100644
--- a/datafu-pig/src/main/resources/datafu/count_macros.pig
+++ b/datafu-pig/src/main/resources/datafu/count_macros.pig
@@ -17,11 +17,22 @@
  * under the License.
  */
 
+/**
+ *  Counts all the rows in a relation
+ *
+ *  alias - the relation to count
+ */
 DEFINE count_all_non_distinct(alias) returns res {
   grp_all = GROUP $alias ALL;
   $res = FOREACH grp_all GENERATE COUNT($alias);
 };
 
+/**
+ *  Counts all the distinct keys in a relation
+ *
+ *  alias - the relation to count
+ *  key - the field to check distinctness
+ */
 DEFINE count_distinct_keys(alias, key) returns res {
   just_key = FOREACH $alias GENERATE $key;
   dist_data = DISTINCT just_key;

http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/main/resources/datafu/diff_macros.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/diff_macros.pig b/datafu-pig/src/main/resources/datafu/diff_macros.pig
index cacb1b6..9523fe7 100644
--- a/datafu-pig/src/main/resources/datafu/diff_macros.pig
+++ b/datafu-pig/src/main/resources/datafu/diff_macros.pig
@@ -17,6 +17,14 @@
  * under the License.
  */
 
+/**
+ *  Produces a human-readable description of the rows and fields changed between two relations.
+ *
+ *  diff_macro_old - the old, baseline relation
+ *  diff_macro_new - the new relation to be checked
+ *  diff_macro_pk - the key on which to join/compare individual rows
+ *  diff_macro_ignored_field - an optional field which can be ignored in the comparison, like a timestamp
+ */
 DEFINE diff_macro(diff_macro_old, diff_macro_new, diff_macro_pk, diff_macro_ignored_field) returns diffs {
 
 	DEFINE TupleDiff datafu.pig.util.TupleDiff;

http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/main/resources/datafu/left_outer_join.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/left_outer_join.pig b/datafu-pig/src/main/resources/datafu/left_outer_join.pig
new file mode 100644
index 0000000..aeea267
--- /dev/null
+++ b/datafu-pig/src/main/resources/datafu/left_outer_join.pig
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ *  Used to do a left outer join of three relations
+ *
+ *	relation1 - the first relation to join
+ *	key1 - the field from the first relation on which to group
+ *	relation2 - the second relation to join
+ *	key2 - the field from the second relation on which to group
+ *	relation3 - the third relation to join
+ *	key3 - the field from the third relation on which to group
+ *
+ */
+DEFINE left_outer_join(relation1, key1, relation2, key2, relation3, key3) returns joined {
+  DEFINE EmptyBagToNullFields datafu.pig.bags.EmptyBagToNullFields();
+  
+  cogrouped = COGROUP $relation1 BY $key1, $relation2 BY $key2, $relation3 BY $key3;
+  $joined = FOREACH cogrouped GENERATE
+    FLATTEN($relation1),
+    FLATTEN(EmptyBagToNullFields($relation2)),
+    FLATTEN(EmptyBagToNullFields($relation3));
+};

http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/resources/datafu/sample_by_keys.pig b/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
index c22ffc7..def1b03 100644
--- a/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
+++ b/datafu-pig/src/main/resources/datafu/sample_by_keys.pig
@@ -17,14 +17,13 @@
  * under the License.
  */
 
-/*
- * Macro for sampling a table by a list of keys.
+/**
+ *  Samples a table by a list of keys.
  *
- * Params:
- *   - table_name               - table name to sample
- *   - sample_set               - a set of keys
- *   - join_key_table           - join column name in the table
- *   - join_key_sample          - join column name in the sample
+ *  table - table name to sample
+ *  sample_set - a set of keys
+ *  join_key_table - join column name in the table
+ *  join_key_sample - join column name in the sample
  */
 DEFINE sample_by_keys(table, sample_set, join_key_table, join_key_sample) RETURNS out {
     t = FOREACH $table GENERATE

http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/test/java/datafu/test/pig/PigTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/PigTests.java b/datafu-pig/src/test/java/datafu/test/pig/PigTests.java
index b869492..d83ff4f 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/PigTests.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/PigTests.java
@@ -235,7 +235,7 @@ public abstract class PigTests
   protected void assertOutput(PigTest test, String alias, String... expected) throws IOException, ParseException
   {
     List<Tuple> tuples = getLinesForAlias(test, alias);
-    assertEquals(expected.length, tuples.size(), "Mismatch in number of tuples");
+    assertEquals(tuples.size(), expected.length, "Mismatch in number of tuples");
     int i=0;
     for (String e : expected)
     {

http://git-wip-us.apache.org/repos/asf/datafu/blob/e81ea866/datafu-pig/src/test/java/datafu/test/pig/macros/MacroTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/macros/MacroTests.java b/datafu-pig/src/test/java/datafu/test/pig/macros/MacroTests.java
index 17d0af5..7a3e7b9 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/macros/MacroTests.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/macros/MacroTests.java
@@ -38,8 +38,7 @@ public class MacroTests extends PigTests
   STORE cnt INTO 'output';
 
    */
-  @Multiline
-  private String countDistinctTest;
+  @Multiline private static String countDistinctTest;
 
   @Test
   public void countDistinctTest() throws Exception
@@ -74,8 +73,7 @@ public class MacroTests extends PigTests
   STORE cnt INTO 'output';
 
    */
-  @Multiline
-  private String countTest;
+  @Multiline private static String countTest;
 
   @Test
   public void countTest() throws Exception
@@ -99,4 +97,40 @@ public class MacroTests extends PigTests
     assertOutput(test, "cnt", "(31)");
   }
 
+  /**
+
+  import 'datafu/left_outer_join.pig';
+
+  data1 = LOAD 'first' AS (id:chararray, num1:int);
+  data2 = LOAD 'second' AS (id2:chararray, num2:int);
+  data3 = LOAD 'third' AS (id:chararray, num3:int);
+
+  joined = left_outer_join(data1, id, data2, id2, data3, id);
+  STORE joined INTO 'output';
+
+   */
+  @Multiline private static String leftOuterJoinTest;
+
+  @Test
+  public void leftOuterJoinTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(leftOuterJoinTest);
+
+    writeLinesToFile("first","A1\t1","A2\t2","A3\t3","A4\t4","A5\t5","A6\t6");
+
+    writeLinesToFile("second","A1\t11","B2\t12","A3\t13","A4\t14","B5\t15","B6\t16");
+
+    writeLinesToFile("third","A1\t111","A2\t112","A3\t113","B4\t114","A5\t115", "C6\t116");
+
+    test.runScript();
+
+    assertOutput(test, "joined",
+    		"(A1,1,A1,11,A1,111)",
+    		"(A2,2,,,A2,112)",
+    		"(A3,3,A3,13,A3,113)",
+    		"(A4,4,A4,14,,)",
+    		"(A5,5,,,A5,115)",
+    		"(A6,6,,,,)"
+	);
+  }
 }


Mime
View raw message