datafu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mha...@apache.org
Subject [2/3] DATAFU-57 Source tarball generation and gradle bootstrapping
Date Tue, 05 Aug 2014 03:50:46 GMT
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
index d01f9fb..f7caafd 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
@@ -22,12 +22,14 @@ package datafu.pig.hash.lsh.metric;
 import org.apache.commons.math.linear.RealVector;
 /**
  * A UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
- * m(v,q) < t.  In other words, find the first vector in the bag within a threshold distance away.
- * 
- *  It returns one of the tuples of the bag of vectors using {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L2 distance</a>}, 
+ * m(v,q) &lt; t.  In other words, find the first vector in the bag within a threshold distance away.
+ *
+ * <p>
+ * It returns one of the tuples of the bag of vectors using <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L2 distance</a>, 
  * distance between two vectors.  This is otherwise known as
  * the Euclidean distance.
- * 
+ * </p>
+ *
  * @see datafu.pig.hash.lsh.L2PStableHash L2PStableHash for an example
  * @author cstella
  *
@@ -37,7 +39,7 @@ public class L2 extends MetricUDF {
   /**
    * Create a new L2 Metric UDF with a given dimension.
    * 
-   * @param sDim
+   * @param sDim dimension
    */
   public L2(String sDim) {
     super(sDim);

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
index da00a60..cb6efbb 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
@@ -34,10 +34,12 @@ import datafu.pig.hash.lsh.util.DataTypeUtil;
 
 /**
  * A base UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
- * m(v,q) < t.  In other words, find the first vector in the bag within a threshold distance away.
- * 
- *  It returns one of the tuples of the bag of vectors.  For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash.
- * 
+ * m(v,q) &lt; t.  In other words, find the first vector in the bag within a threshold distance away.
+ *
+ * <p>
+ * It returns one of the tuples of the bag of vectors.  For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash.
+ * </p>
+ *
  * @see datafu.pig.hash.lsh.CosineDistanceHash
  * @author cstella
  *
@@ -49,7 +51,7 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
   /**
    * Create a new Metric UDF with a given dimension.
    * 
-   * @param sDim
+   * @param sDim dimension
    */
   public MetricUDF(String sDim)
   {
@@ -58,8 +60,8 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
   
   /**
    * The distance metric used.  Given v1 and v2, compute the distance between those vectors.
-   * @param v1 vector
-   * @param v2 vector
+   * @param v1 first vector
+   * @param v2 second vector
    * @return the distance between v1 and v2
    */
   protected abstract double dist(RealVector v1, RealVector v2);
@@ -68,9 +70,11 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
    * This UDF expects a query vector as the first element, a threshold (double) as the second, and a bag of vectors.
    * Vectors are represented by tuples with doubles as elements or bags of tuples representing position and value
    * in the case of sparse vectors.
-   * 
+   *
+   * <p>
    * It returns one of the tuples of the bag of vectors.  For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash.
-   * 
+   * </p>
+   *
    * @see datafu.pig.hash.lsh.CosineDistanceHash
    */
   @Override
@@ -109,10 +113,10 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
     }
     return null;
   }
-  
+
   /**
    * Create the output schema, based on the input schema.
-   * 
+   *
    * @return the output schema, which is a tuple matching the schema of the third input field.
    */
    public Schema outputSchema(Schema input) {
@@ -120,15 +124,14 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
             validateInputSchema(input);
             FieldSchema fieldSchema = input.getField(2);
             return fieldSchema.schema;
-             
-          }catch (Exception e){
+          }catch (Exception e) {
                  throw new RuntimeException("Unable to create output schema", e);
           }
    }
-   
+
    /**
     * Validate the input schema to ensure that our input is consistent and that we fail fast.
-    * @param input
+    * @param input input schema
     * @throws FrontendException
     */
    private void validateInputSchema(Schema input) throws FrontendException
@@ -140,18 +143,18 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
          throw new FrontendException("Invalid vector element: Expected either a tuple or a bag, but found " + vectorSchema);
        }
      }
-     
+
      {
        FieldSchema distanceSchema = input.getField(1);
-       if(distanceSchema.type != DataType.DOUBLE 
-       && distanceSchema.type != DataType.INTEGER 
-       && distanceSchema.type != DataType.LONG 
+       if(distanceSchema.type != DataType.DOUBLE
+       && distanceSchema.type != DataType.INTEGER
+       && distanceSchema.type != DataType.LONG
        )
        {
          throw new FrontendException("Invalid distance element: Expected a number, but found " + distanceSchema);
        }
      }
-     
+
      {
        FieldSchema pointsSchema = input.getField(2);
        if( pointsSchema.type != DataType.BAG)
@@ -166,5 +169,4 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
        }
      }
    }
-   
 }

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
index df8dbc7..ce48880 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
@@ -18,7 +18,7 @@
  */
 
 /**
- * UDFs for different {@link <a href="http://en.wikipedia.org/wiki/Metric_(mathematics)" target="_blank">distance functions</a>} (and some similarity functions)
+ * UDFs for different <a href="http://en.wikipedia.org/wiki/Metric_(mathematics)" target="_blank">distance functions</a> (and some similarity functions)
  * used with Locality Sensitive Hashing.
  */
 package datafu.pig.hash.lsh.metric;

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
index 0f3ba94..21b7306 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
@@ -29,22 +29,25 @@ import datafu.pig.hash.lsh.interfaces.Sampler;
 
 /**
  * This is the base-class for all p-stable based locality sensitive hashes. p-stable locality sensitive
- * hashes are defined by a few parameters: a dimension, d , a vector taken from a 
- * {@link <a href="http://en.wikipedia.org/wiki/Stable_distribution" target="_blank">k-stable distribution</a>} 
+ * hashes are defined by a few parameters: a dimension, d , a vector taken from a
+ * <a href="http://en.wikipedia.org/wiki/Stable_distribution" target="_blank">k-stable distribution</a>
  * (where k is 1 or 2) and a width of projection, w.
+ *
  * <p>
  * All p-stable LSH functions are parameterized with a quantization parameter (w or r in
 * the literature, depending on where you look). Consider the following excerpt
  * from Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004).
  * "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions".
  * Proceedings of the Symposium on Computational Geometry.
- * 
- * <pre>
- * Decreasing the width of the projection (w) decreases the probability of collision for any two points. 
+ * </p>
+ *
+ * <p>
+ * Decreasing the width of the projection (w) decreases the probability of collision for any two points.
  * Thus, it has the same effect as increasing k . As a result, we would like to set w as small as possible
  * and in this way decrease the number of projections we need to make.
- * </pre>
- * 
+ * </p>
+ *
+ * <p>
  * In the literature, the quantization parameter (or width of the projection) is
  * found empirically given a sample of the data and the likely threshold for
  * the metric. Tuning this parameter is very important for the performance of
@@ -52,24 +55,23 @@ import datafu.pig.hash.lsh.interfaces.Sampler;
  * P.; Mirrokni, V.S. (2004).
  * "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions".
  * Proceedings of the Symposium on Computational Geometry.
- * 
+ * </p>
+ *
  * @author cstella
- * 
+ *
  */
 public abstract class AbstractStableDistributionFunction extends LSH
 {
-  
    private double[] a;
    private double b;
    double w;
 
- 
    /**
     * Constructs a new instance.
     * @param dim The dimension of the vectors to be hashed
     * @param w A double representing the quantization parameter (also known as the projection width)
-    * @param rand The random generator used 
-    * @throws MathException 
+    * @param rand The random generator used
+    * @throws MathException MathException
     */
    public AbstractStableDistributionFunction(int dim, double w, RandomGenerator rand) throws MathException
    {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
index 79bf7e5..5ac2bfc 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
@@ -28,7 +28,7 @@ import datafu.pig.hash.lsh.interfaces.Sampler;
 /**
  * A locality sensitive hash associated with the L1 metric.  This uses a 1-stable distribution
  * to construct the hash.
- * 
+ *
  * @author cstella
  *
  */
@@ -36,25 +36,29 @@ public class L1LSH extends AbstractStableDistributionFunction implements Sampler
 {
   /**
    * Constructs a new instance.
-   * @throws MathException 
+   *
+   * @param dim The dimension of the vectors to be hashed
+   * @param w A double representing the quantization parameter (also known as the projection width)
+   * @param rand The random generator used
+   * @throws MathException MathException
    */
-  public L1LSH(int dim, double d, RandomGenerator rand) throws MathException {
-    super(dim, d, rand);
+  public L1LSH(int dim, double w, RandomGenerator rand) throws MathException {
+    super(dim, w, rand);
   }
 
   /**
    * Draw a sample s ~ Cauchy(0,1), which is 1-stable.
-   * 
+   *
+   * @param randomData random data generator
    * @return a sample from a cauchy distribution with median 0 and scale 1
+   * @throws MathException MathException
    */
   public double sample(RandomDataImpl randomData) throws MathException {
-    
     return randomData.nextCauchy(0, 1);
-    
   }
+
   @Override
   protected Sampler getSampler() {
     return this;
   }
-
 }

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
index d18b189..95a487a 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
@@ -28,15 +28,17 @@ import datafu.pig.hash.lsh.interfaces.Sampler;
 /**
  * A locality sensitive hash associated with the L2 metric.  This uses a 2-stable distribution
  * to construct the hash.
- * 
- * @author cstella
  *
+ * @author cstella
  */
 public class L2LSH extends AbstractStableDistributionFunction implements Sampler {
 
   /**
    * Constructs a new instance.
-   * @throws MathException 
+   * @param dim the dimension of the vectors to be hashed
+   * @param w a double representing the quantization parameter (also known as the projection width)
+   * @param rand the random generator
+   * @throws MathException MathException
    */
   public L2LSH(int dim, double w, RandomGenerator rand) throws MathException {
     super(dim, w, rand);
@@ -44,7 +46,8 @@ public class L2LSH extends AbstractStableDistributionFunction implements Sampler
 
   /**
    * Draw a sample s ~ Gaussian(0,1), which is 2-stable.
-   * 
+   *
+   * @param randomData random data generator
    * @return a sample from a Gaussian distribution with mu of 0 and sigma of 1
    */
    public double sample(RandomDataImpl randomData)

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
index ec9c313..7d7bb65 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
@@ -18,8 +18,8 @@
  */
 
 /**
- * Implementation of {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>} for 
- * {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L1 and L2 metrics</a>}.
+ * Implementation of <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a> for 
+ * <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L1 and L2 metrics</a>.
  * 
  * See Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
  * 

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
index 045ed0d..912d72e 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
@@ -18,6 +18,6 @@
  */
 
 /**
- * UDFs for {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}
+ * UDFs for <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>.
  */
 package datafu.pig.hash.lsh;

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java
index da30179..6f09941 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java
@@ -40,15 +40,17 @@ public enum DataTypeUtil {
   /**
    * Convert a tuple t into a RealVector of dimension dim.
    * The tuple can be of a couple of forms:
+   *
    * <ul>
    * <li>A tuple composed of dim numeric types a la (1.0,2.0,3,5.0)</li>
    * <li>A tuple which contains as its first element a tuple like above a la ( (1.0,2.0,3,5.0), 5) ) would yield (1.0,2.0,3,5.0)</li>
    * <li>A bag containing tuples where the first element is the position and the second element is the value.  This is for sparse vectors and it looks like this ( { (0,1.0), (1, 2.0), (3,3), (4,5.0) } ).</li>
    * </ul>
+   *
    * @param t The tuple to convert to a vector
    * @param dim The dimension of the vector
    * @return The actual RealVector (which may or may not be sparse)
-   * @throws PigException
+   * @throws PigException PigException
    */
   public RealVector convert(Tuple t, int dim) throws PigException
   {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java
index 80ff567..423f890 100644
--- a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java
+++ b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java
@@ -39,39 +39,39 @@ import org.apache.pig.impl.logicalLayer.FrontendException;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 
 /**
- * A UDF which implements {@link <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>}.
- * 
- * <p>  
- * This is not a distributed implementation.  Each graph is stored in memory while running the algorithm, with edges optionally 
+ * A UDF which implements <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>.
+ *
+ * <p>
+ * This is not a distributed implementation.  Each graph is stored in memory while running the algorithm, with edges optionally
  * spilled to disk to conserve memory.  This can be used to distribute the execution of PageRank on multiple
 * reasonably sized graphs.  It does not distribute execution of PageRank for each individual graph.  Each graph is identified
  * by an integer valued topic ID.
  * </p>
- * 
+ *
  * <p>
 * If the graph is too large to fit in memory then an alternative method must be used, such as an iterative approach which runs
  * many MapReduce jobs in a sequence to complete the PageRank iterations.
  * </p>
- * 
+ *
  * <p>
  * Each graph is represented through a bag of (source,edges) tuples.  The 'source' is an integer ID representing the source node.
  * The 'edges' are the outgoing edges from the source node, represented as a bag of (dest,weight) tuples.  The 'dest' is an
  * integer ID representing the destination node.  The weight is a double representing how much the edge should be weighted.
  * For a standard PageRank implementation just use weight of 1.0.
  * </p>
- * 
+ *
  * <p>
  * The output of the UDF is a bag of (source,rank) pairs, where 'rank' is the PageRank value for that source in the graph.
  * </p>
- * 
+ *
  * <p>
  * There are several configurable options for this UDF, among them:
- * <p>
- * 
+ * </p>
+ *
  * <ul>
  * <li>
  * <b>alpha</b>: Controls the PageRank alpha value.  The default is 0.85.  A higher value reduces the "random jump"
- * factor and causes the rank to be influenced more by edges. 
+ * factor and causes the rank to be influenced more by edges.
  * </li>
  * <li>
  * <b>max_iters</b>: The maximum number of iterations to run.  The default is 150.
@@ -83,8 +83,8 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
  * <li>
  * <b>tolerance</b>: A threshold which causes iterations to cease.  It is measured from the total change in ranks from each of
  * the nodes in the graph.  As the ranks settle on their final values the total change decreases.  This can be used
- * to stop iterations early.  The default is 1e-16. 
- * </li> 
+ * to stop iterations early.  The default is 1e-16.
+ * </li>
  * <li>
  * <b>max_nodes_and_edges</b>: This is a control to prevent running out of memory.  As a graph is loaded, if the sum of edges
  * and nodes exceeds this value then it will stop.  It will not fail but PageRank will not be run on this graph.  Instead a null
@@ -92,52 +92,48 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
  * </li>
  * <li>
  * <b>spill_to_edge_disk_storage</b>: Used to conserve memory.  When "true" it causes the edge data to be written to disk in a temp file instead
- * of being held in memory when the number of edges exceeds a threshold.  The nodes are still held in memory however.  
+ * of being held in memory when the number of edges exceeds a threshold.  The nodes are still held in memory however.
  * Each iteration of PageRank will stream through the edges stored on disk.  The default is "false".
  * </li>
  * <li>
  * <b>max_edges_in_memory</b>: When spilling edges to disk is enabled, this is the threshold which triggers that behavior.  The default is 30M.
  * </li>
  * </ul>
- * 
+ *
  * <p>
  * Parameters are configured by passing them in as a sequence of pairs into the UDF constructor.  For example, below the alpha value is set to
  * 0.87 and dangling nodes are enabled.  All arguments must be strings.
  * </p>
- * 
- * <p>
+ *
  * <pre>
  * {@code
  * define PageRank datafu.pig.linkanalysis.PageRank('alpha','0.87','dangling_nodes','true');
  * }
  * </pre>
- * </p>
- * 
- * <p>
+ *
  * Full example:
  * <pre>
  * {@code
- * 
+ *
  * topic_edges = LOAD 'input_edges' as (topic:INT,source:INT,dest:INT,weight:DOUBLE);
- * 
+ *
  * topic_edges_grouped = GROUP topic_edges by (topic, source) ;
  * topic_edges_grouped = FOREACH topic_edges_grouped GENERATE
  *    group.topic as topic,
  *    group.source as source,
  *    topic_edges.(dest,weight) as edges;
- * 
- * topic_edges_grouped_by_topic = GROUP topic_edges_grouped BY topic; 
- * 
+ *
+ * topic_edges_grouped_by_topic = GROUP topic_edges_grouped BY topic;
+ *
  * topic_ranks = FOREACH topic_edges_grouped_by_topic GENERATE
  *    group as topic,
  *    FLATTEN(PageRank(topic_edges_grouped.(source,edges))) as (source,rank);
  *
  * topic_ranks = FOREACH topic_ranks GENERATE
  *    topic, source, rank;
- * 
+ *
  * }
  * </pre>
- * </p> 
  */
 public class PageRank extends AccumulatorEvalFunc<DataBag>
 {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java
index 5d0b932..7994b0e 100644
--- a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java
+++ b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java
@@ -39,9 +39,8 @@ import java.util.Map;
 import com.google.common.collect.AbstractIterator;
 
 /**
- * An implementation of {@link <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>}, used by the {@link PageRank} UDF.
- * It is not intended to be used directly.   
- * </p>
+ * An implementation of <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>, used by the {@link PageRank} UDF.
+ * It is not intended to be used directly.
  */
 public class PageRankImpl
 {    
@@ -104,7 +103,7 @@ public class PageRankImpl
    
    /**
    * Sets the page rank alpha value (default is 0.85).
-    * @param alpha 
+    * @param alpha page rank alpha value
     */
    public void setAlpha(float alpha)
    {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/random/RandInt.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/random/RandInt.java b/datafu-pig/src/main/java/datafu/pig/random/RandInt.java
index de89c4a..ec8375a 100644
--- a/datafu-pig/src/main/java/datafu/pig/random/RandInt.java
+++ b/datafu-pig/src/main/java/datafu/pig/random/RandInt.java
@@ -35,10 +35,12 @@ import datafu.pig.util.SimpleEvalFunc;
 public class RandInt extends SimpleEvalFunc<Integer> 
 {
   private final Random rand = new Random();
-  
+
   /**
    * @param min lower bound for random number
    * @param max upper bound for random number
+   * @return random integer between min and max
+   * @throws IOException IOException
    */
   public Integer call(Integer min, Integer max) throws IOException
   {
@@ -60,6 +62,5 @@ public class RandInt extends SimpleEvalFunc<Integer>
   {
     return new Schema(new Schema.FieldSchema("rand", DataType.INTEGER));
   }
-  
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java b/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java
index 90ea576..d94a038 100644
--- a/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java
+++ b/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java
@@ -30,24 +30,24 @@ import org.apache.pig.data.Tuple;
  * This is essentially equivalent to grouping on the fields, applying SAMPLE,
  * and then flattening.  It is much more efficient though because it does not require
  * a reduce step.
- * 
+ *
  * <p>
  * The method of sampling is to convert the key to a hash, derive a double value
  * from this, and then test this against a supplied probability.  The double value
  * derived from a key is uniformly distributed between 0 and 1.
  * </p>
- * 
+ *
  * <p>
  * The only required parameter is the sampling probability.  This may be followed
  * by an optional seed value to control the random number generation.  
  * </p>
- * 
+ *
  * <p>
  * SampleByKey will work deterministically as long as the same seed is provided.  
  * </p>
- * 
- * <p>
+ *
  * Example:
+ *
  * <pre>
  * {@code
  * DEFINE SampleByKey datafu.pig.sampling.SampleByKey('0.5');
@@ -58,12 +58,10 @@ import org.apache.pig.data.Tuple;
  * output = FILTER data BY SampleByKey(A_id);
  * 
  * --output: (B,1), (B,3)
- * } 
- * 
+ * }
  * </pre>
- * </p>
- * @author evion 
- * 
+ *
+ * @author evion
  */
 
 public class SampleByKey extends FilterFunc

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java
index 8e8debf..ee2e796 100644
--- a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java
+++ b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java
@@ -35,57 +35,69 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
 
 /**
  * Scalable simple random sampling (ScaSRS).
- * <p/>
+ *
+ * <p>
  * This UDF implements a scalable simple random sampling algorithm described in
- * 
+ * </p>
+ *
  * <pre>
  * X. Meng, Scalable Simple Random Sampling and Stratified Sampling, ICML 2013.
  * </pre>
- * 
+ *
+ * <p>
  * It takes a bag of n items and a sampling probability p as the inputs, and outputs a
  * simple random sample of size exactly ceil(p*n) in a bag, with probability at least
  * 99.99%. For example, the following script generates a simple random sample with
  * sampling probability 0.1:
- * 
+ * </p>
+ *
  * <pre>
  * DEFINE SRS datafu.pig.sampling.SimpleRandomSample();
- * 
- * item    = LOAD 'input' AS (x:double); 
+ *
+ * item    = LOAD 'input' AS (x:double);
  * sampled = FOREACH (GROUP item ALL) GENERATE FLATTEN(SRS(item, 0.01));
  * </pre>
- * 
+ *
+ * <p>
  * Optionally, user can provide a good lower bound of n as the third argument to help
  * reduce the size of intermediate data, for example:
- * 
+ * </p>
+ *
  * <pre>
  * DEFINE SRS datafu.pig.sampling.SimpleRandomSample();
- * 
- * item    = LOAD 'input' AS (x:double); 
+ *
+ * item    = LOAD 'input' AS (x:double);
  * summary = FOREACH (GROUP item ALL) GENERATE COUNT(item) AS count;
  * sampled = FOREACH (GROUP item ALL) GENERATE FLATTEN(SRS(item, 0.01, summary.count));
  * </pre>
- * 
+ *
+ * <p>
  * This UDF is very useful for stratified sampling. For example, the following script
  * keeps all positive examples while downsampling negatives with probability 0.1:
- * 
+ * </p>
+ *
  * <pre>
  * DEFINE SRS datafu.pig.sampling.SimpleRandomSample();
- * 
+ *
  * item    = LOAD 'input' AS (x:double, label:int);
- * grouped = FOREACH (GROUP item BY label) GENERATE item, (group == 1 ? 1.0 : 0.1) AS p; 
+ * grouped = FOREACH (GROUP item BY label) GENERATE item, (group == 1 ? 1.0 : 0.1) AS p;
  * sampled = FOREACH grouped GENERATE FLATTEN(SRS(item, p));
  * </pre>
- * 
+ *
+ * <p>
  * In a Java Hadoop MapReduce job, we can output selected items directly using
  * MultipleOutputs. However, this feature is not available in a Pig UDF. So we still let
  * selected items go through the sort phase. However, as long as the sample size is not
  * huge, this should not be a big problem.
- * 
- * In the first version, the sampling probability is specified in the constructor. This 
+ * </p>
+ *
+ * <p>
+ * In the first version, the sampling probability is specified in the constructor. This
  * method is deprecated now and will be removed in the next release.
- * 
+ * </p>
+ *
  * @author ximeng
- * 
+ *
  */
 public class SimpleRandomSample extends AlgebraicEvalFunc<DataBag>
 {
@@ -104,7 +116,8 @@ public class SimpleRandomSample extends AlgebraicEvalFunc<DataBag>
 
   /**
    * Constructs this UDF with a sampling probability.
-   * 
+   *
+   * @param samplingProbability sampling probability
    * @deprecated Should specify the sampling probability in the function call.
    */
   @Deprecated

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java
index 598e58c..0ae5950 100644
--- a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java
+++ b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java
@@ -40,11 +40,14 @@ import com.google.common.primitives.Ints;
 
 /**
  * Scalable simple random sampling with replacement (ScaSRSWR).
- * <p/>
+ *
+ * <p>
  * This UDF together with {@link SimpleRandomSampleWithReplacementElect} implement a
  * scalable algorithm for simple random sampling with replacement (SRSWR), which is a
  * randomized algorithm with a failure rate less than {@value #FAILURE_RATE}.
- * <p/>
+ * </p>
+ *
+ * <p>
  * Let s be the desired sample size. To compute an SRSWR sample of size s, for each output
  * position in {0, 1, ..., s-1}, we want to select an item from the population uniformly
  * at random. This algorithm consists of two stages: vote and election. In the vote stage,
@@ -52,7 +55,9 @@ import com.google.common.primitives.Ints;
  * for each position. In the election stage, the paired UDF
  * {@link SimpleRandomSampleWithReplacementElect} elects one candidate for each position.
  * The algorithm succeeds if we have at least one candidate for each position.
- * <p/>
+ * </p>
+ *
+ * <p>
  * To use this UDF pair, user needs to provide: 1) the desired sample size, 2) a good
  * lower bound of the population size or the exact size. The input to the vote UDF
  * {@link SimpleRandomSampleWithReplacementVote} is a tuple that consists of a bag of
@@ -62,47 +67,59 @@ import com.google.common.primitives.Ints;
  * elect UDF {@link SimpleRandomSampleWithReplacementElect} is a tuple that contains all
  * candidates voted by the vote UDF for some positions. The output from the elect UDF is a
  * bag of sampled items.
- * <p/>
+ * </p>
+ *
+ * <p>
  * For example, the following script generates a sample of size 100000 with replacement:
- * 
+ * </p>
+ *
  * <pre>
  * DEFINE SRSWR_VOTE  datafu.pig.sampling.SimpleRandomSampleWithReplacementVote();
  * DEFINE SRSWR_ELECT datafu.pig.sampling.SimpleRandomSampleWithReplacementElect();
- * 
- * item       = LOAD 'input' AS (x:double); 
+ *
+ * item       = LOAD 'input' AS (x:double);
  * summary    = FOREACH (GROUP item ALL) GENERATE COUNT(item) AS count;
  * candidates = FOREACH item GENERATE FLATTEN(SRSWR_VOTE(TOBAG(x), 100000, summary.count));
  * sampled    = FOREACH (GROUP candidates BY position PARALLEL 10) GENERATE FLATTEN(SRSWR_ELECT(candidates));
  * </pre>
- * 
+ *
+ * <p>
  * Because for election we only need to group candidates voted for the same position, this
  * algorithm can use many reducers to consume the candidates. See the "PARALLEL 10"
  * statement above. If the item to sample is the entire row, use TOBAG(TOTUPLE(*)).
- * <p/>
+ * </p>
+ *
+ * <p>
  * SRSWR is heavily used in bootstrapping. Bootstrapping can be done easily with this UDF
  * pair. For example, the following script generates 100 bootstrap samples, computes the
  * mean value for each sample, and then outputs the bootstrap estimates.
- * 
+ * </p>
+ *
  * <pre>
  * summary    = FOREACH (GROUP item ALL) GENERATE AVG(item.x) AS mean, COUNT(item) AS count;
  * candidates = FOREACH item GENERATE FLATTEN(SRSWR_VOTE(TOBAG(x), summary.count*100, summary.count));
  * sampled    = FOREACH (GROUP candidates BY (position % 100) PARALLEL 10) GENERATE AVG(SRSWR_ELECT(candidates)) AS mean;
  * bootstrap  = FOREACH (GROUP sampled ALL) GENERATE summary.mean AS mean, sampled.mean AS bootstrapMeans;
  * </pre>
- * 
+ *
+ * <p>
  * Another usage of this UDF pair is to generate random pairs or tuples without computing
  * the cross product, where each pair or tuple consist of items from different input
  * sources. Let s be the number of random tuples we want to generate. For each input
  * source, simply use the vote UDF to propose candidates, then join the candidates from
  * different sources by their positions and for each position use the elect UDF to select
  * one candidate from each source to form the pair or tuple for that position.
- * <p/>
+ * </p>
+ *
+ * <p>
  * The algorithm is a simple extension to the work
- * 
+ * </p>
+ *
  * <pre>
  * X. Meng, Scalable Simple Random Sampling and Stratified Sampling, ICML 2013.
  * </pre>
- * 
+ *
+ * <p>
  * Basically, for each output position, it performs a random sort on the population
  * (associates each item with a random score independently drawn from the uniform
  * distribution and then sorts items based on the scores), and picks the one that has the
@@ -110,34 +127,41 @@ import com.google.common.primitives.Ints;
  * population. For example, if the population size is one billion and the random score
  * generated for an item is 0.9, very likely it won't become the smallest and hence we do
  * not need to propose it as a candidate.
- * <p/>
+ * </p>
+ *
+ * <p>
  * More precisely, let n be the population size, n1 be a good lower bound of n, s be the
  * sample size, delta be the failure rate, and q be the threshold. For each output
  * position the probability of all random scores being greater than q is (1-q)^n. Thus, if
  * we throw away items with associated scores greater than q, with probability at least 1
  * - s*(1-q)^n, we can still capture the item with the smallest score for each position.
  * Fix delta = s*(1-q)^n and solve for q, we get q = 1-exp(log(delta/s)/n), Note that
- * replacing n by n1 < n can only decrease the failure rate, though at the cost of
+ * replacing n by n1 &lt; n can only decrease the failure rate, though at the cost of
  * increased number of candidates. The expected number of candidates is (1 -
  * exp(log(delta/s)/n1)*s*n. When n1 equals n, this number is approximately
  * s*log(s/delta).
- * <p/>
+ * </p>
+ *
+ * <p>
  * Generating a random score for each (item, position) pair is very expensive and
  * unnecessary. For each item, the number of positions for which it gets voted follows a
  * binomial distribution B(s,q). We can simply draw a number from this distribution,
  * determine the positions by sampling without replacement, and then generate random
  * scores for those positions. This reduces the running time significantly.
- * <p/>
+ * </p>
+ *
+ * <p>
  * Since for each position we only need the candidate with the smallest score, we
  * implement a combiner to reduce the size of intermediate data in the elect UDF
+ * </p>
+ *
  * {@link SimpleRandomSampleWithReplacementElect}.
- * 
+ *
  * @see SimpleRandomSampleWithReplacementElect
- * @see <a href="http://en.wikipedia.org/wiki/Bootstrapping_(statistics) target="_blank
- *      ">Boostrapping (Wikipedia)</a>
- * 
+ * @see <a href="http://en.wikipedia.org/wiki/Bootstrapping_(statistics)" target="_blank">Bootstrapping (Wikipedia)</a>
+ *
  * @author ximeng
- * 
+ *
  */
 public class SimpleRandomSampleWithReplacementVote extends EvalFunc<DataBag>
 {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java b/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java
index 92af6a3..a5d265c 100644
--- a/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java
+++ b/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java
@@ -29,28 +29,27 @@ import org.apache.pig.builtin.Nondeterministic;
 import org.apache.pig.backend.executionengine.ExecException;
 
 /**
- * <p>
  * Performs a weighted random sample using an in-memory reservoir to produce
  * a weighted random sample of a given size based on the A-Res algorithm described in 
- * {@link <a href="http://utopia.duth.gr/~pefraimi/research/data/2007EncOfAlg.pdf" target="_blank">paper</a>}. 
- * </p>
+ * <a href="http://utopia.duth.gr/~pefraimi/research/data/2007EncOfAlg.pdf" target="_blank">paper</a>.
+ *
  * <p>
  * Species with larger weight have higher probability to be selected in the final sample set.
  * </p>
+ *
  * <p>
  * This UDF inherits from {@link ReservoirSample} and it is guaranteed to produce
  * a sample of the given size.  Similarly it comes at the cost of scalability.
  * since it uses internal storage with size equaling the desired sample to guarantee the exact sample size.
  * </p>
- * <p>
- * Its constructor takes 2 arguments. 
+ *
+ * Its constructor takes 2 arguments:
  * <ul>
  *     <li>The 1st argument specifies the sample size which should be a string of positive integer.
  *     <li>The 2nd argument specifies the index of the weight field in the input tuple, 
  *     which should be a string of non-negative integer that is no greater than the input tuple size. 
  * </ul>
- * </p>
- * <p>
+ *
  * Example:
  * <pre>
  * {@code
@@ -60,7 +59,6 @@ import org.apache.pig.backend.executionengine.ExecException;
  * sampled = FOREACH input_g GENERATE WeightedSample(input);
  * }
  * </pre>
- * </p>
  * @author wjian
  */
 

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java b/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java
index 52d159b..d81fb48 100644
--- a/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java
+++ b/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java
@@ -47,13 +47,12 @@ import org.joda.time.Period;
  * session_id, that is a GUID indicating the session of the request.
  * </p>
  *
- * <p>
  * Example:
  * <pre>
  * {@code
- * 
+ *
  * %declare TIME_WINDOW  30m
- * 
+ *
  * define Sessionize datafu.pig.sessions.Sessionize('$TIME_WINDOW');
  *
  * views = LOAD 'views.tsv' AS (visit_date:chararray, member_id:int, url:chararray);
@@ -62,7 +61,7 @@ import org.joda.time.Period;
  * views = GROUP views BY member_id;
  * sessions = FOREACH views {
  *   visits = ORDER views BY visit_date;
- *   GENERATE FLATTEN(Sessionize(VISITS)) AS (visit_date,member_id,url,session_id); 
+ *   GENERATE FLATTEN(Sessionize(VISITS)) AS (visit_date,member_id,url,session_id);
  * }
  *
  * -- count the number of sessions hitting the url
@@ -70,7 +69,6 @@ import org.joda.time.Period;
  * result = FOREACH rollup GENERATE group AS url, COUNT(SESSIONS) AS session_cnt;
  * }
  * </pre>
- * </p>
  */
 @Nondeterministic
 public class Sessionize extends AccumulatorEvalFunc<DataBag>

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java b/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java
index c9997f8..90faa1e 100644
--- a/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java
+++ b/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java
@@ -26,9 +26,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
 
 /**
  * Base class for set operations.
- * 
- * @author "Matthew Hayes <mhayes@linkedin.com>"
- *
  */
 public abstract class SetOperationsBase extends EvalFunc<DataBag>
 {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/Median.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/Median.java b/datafu-pig/src/main/java/datafu/pig/stats/Median.java
index e33a84e..5f9d18d 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/Median.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/Median.java
@@ -16,18 +16,18 @@
  * specific language governing permissions and limitations
  * under the License.
  */
- 
+
 package datafu.pig.stats;
 
 /**
- * Computes the {@link <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a>} 
+ * Computes the <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a>
  * for a <b>sorted</b> input bag, using type R-2 estimation.  This is a convenience wrapper around Quantile.
  *
  * <p>
- * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is 
+ * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is
  * done (e.g., group by 'day') if the data is too large.  That is, this isn't distributed median.
  * </p>
- * 
+ *
  * @see Quantile
  */
 public class Median extends Quantile

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java b/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java
index 6fd42d3..89621ea 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java
@@ -35,7 +35,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
 import datafu.pig.util.SimpleEvalFunc;
 
 /**
- * Computes {@link <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a>} 
+ * Computes <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a>
  * for a <b>sorted</b> input bag, using type R-2 estimation.
  *
  * <p>
@@ -74,7 +74,6 @@ import datafu.pig.util.SimpleEvalFunc;
  *   <li>Quantile('0.0013','0.0228','0.1587','0.5','0.8413','0.9772','0.9987') yields the 0.13th, 2.28th, 15.87th, 50th, 84.13th, 97.72nd, and 99.87th percentiles
  * </ul>
  * 
- * <p>
  * Example:
  * <pre>
  * {@code
@@ -91,7 +90,7 @@ import datafu.pig.util.SimpleEvalFunc;
  *   sorted = ORDER input BY val;
  *   GENERATE Quantile(sorted);
  * }
- * }</pre></p>
+ * }</pre>
  *
  * @see Median
  * @see StreamingQuantile

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java b/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java
index c6fd36a..e7ba3c9 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java
@@ -23,9 +23,6 @@ import java.util.ArrayList;
 
 /**
  * Methods used by {@link Quantile}.
- * 
- * @author "Matthew Hayes <mhayes@linkedin.com>"
- *
  */
 public class QuantileUtil
 { 

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java b/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java
index c4c3be4..28d887d 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java
@@ -20,15 +20,15 @@
 package datafu.pig.stats;
 
 /**
- * Computes the approximate {@link <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a>} 
- * for a (not necessarily sorted) input bag, using the Munro-Paterson algorithm.  
+ * Computes the approximate <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a>
+ * for a (not necessarily sorted) input bag, using the Munro-Paterson algorithm.
  * This is a convenience wrapper around StreamingQuantile.
  *
  * <p>
- * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is 
+ * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is
  * done (e.g., group by 'day') if the data is too large.  That is, this isn't distributed median.
  * </p>
- * 
+ *
  * @see StreamingQuantile
  */
 public class StreamingMedian extends StreamingQuantile

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java b/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java
index e4a65b4..2e36941 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java
@@ -35,21 +35,21 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
 import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
 
 /**
- * Computes approximate {@link <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a>} 
+ * Computes approximate <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a>
  * for a (not necessarily sorted) input bag, using the Munro-Paterson algorithm.
  * 
  * <p>
  * The algorithm is described here:
- * {@link <a href="http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf" target="_blank">http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf</a>}
+ * <a href="http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf" target="_blank">http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf</a>
  * </p>
  * 
  * <p>
  * The implementation is based on the one in Sawzall, available here:
- * {@link <a href="http://szl.googlecode.com/svn-history/r41/trunk/src/emitters/szlquantile.cc">szlquantile.cc</a>}
+ * <a href="http://szl.googlecode.com/svn-history/r41/trunk/src/emitters/szlquantile.cc">szlquantile.cc</a>
  * </p>
  * 
  * <p>
- * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is 
+ * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is
  * done (e.g., group by 'day') if the data is too large.  That is, this isn't distributed quantiles.
  * </p>
  * 
@@ -95,12 +95,10 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
  *       GCD of 0.2, 0.7, and 1.0.</li>
  *   <li>If 0.999 is requested the quantiles 0.0, 0.001, 0.002, ... , 0.998, 0.999, 1.0 are computed because 0.001 is
  *       the GCD of 0.999 and 1.0.</li> 
- *  </p>  
  * </ul>
- * 
+ *
  * <p>The error on the approximation goes down as the number of buckets computed goes up.</p>
- * 
- * <p>
+ *
  * Example:
  * <pre>
  * {@code
@@ -115,7 +113,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
  * -- produces: (1.0,3.0,5.0,8.0,10.0)
  * quantiles = FOREACH grouped generate Quantile(input);
  * }
- * </pre></p>
+ * </pre>
  *
  * @see StreamingMedian
  * @see Quantile

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/VAR.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/VAR.java b/datafu-pig/src/main/java/datafu/pig/stats/VAR.java
index 6f22f25..dd18c56 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/VAR.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/VAR.java
@@ -40,25 +40,23 @@ import org.apache.pig.backend.executionengine.ExecException;
 
 
 /**
-* Generates the {@link <a href="http://en.wikipedia.org/wiki/Variance" target="_blank">Variance</a>} 
+* Generates the <a href="http://en.wikipedia.org/wiki/Variance" target="_blank">Variance</a>
 * of a set of Values. This UDF uses the fact that variance(x) = average(x^2) - average(x)^2
 * This class implements * {@link org.apache.pig.Algebraic}, so if possible the execution will performed in a distributed fashion.
 * VAR implements the {@link org.apache.pig.Accumulator} interface as well.
-* 
+*
 * Input: Bag of int, long, double, float or bytearray
 * Output: Double
-* 
-* <p>
+*
 * Example:
 * <pre>
 * define VAR datafu.pig.stats.VAR();
-* 
+*
 * -- input: 1,2,3,4,10,5,6,7,8,9
 * input = LOAD 'input' AS (val:int);
 * grouped = GROUP input ALL;
 * variance = FOREACH grouped GENERATE VAR(input.val) AS variance;
 * </pre>
-* </p>
 */
 public class VAR extends EvalFunc<Double> implements Algebraic, Accumulator<Double> {
     private static TupleFactory mTupleFactory = TupleFactory.getInstance();
@@ -68,7 +66,7 @@ public class VAR extends EvalFunc<Double> implements Algebraic, Accumulator<Doub
         try {
             Double sum = sum(input);
             Double sumSquare = sumSquare(input);
-            
+
             if(sum == null) {
                 // either we were handed an empty bag or a bag
                 // filled with nulls - return null in this case

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java b/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java
index 1448611..85a8df7 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java
@@ -35,13 +35,15 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
 import datafu.pig.util.SimpleEvalFunc;
 
 /**
- * Computes the {@link <a href="http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval" target="_blank">Wilsonian binomial proportion confidence interval</a>}
+ * Computes the <a href="http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval" target="_blank">Wilsonian binomial proportion confidence interval</a>.
+ *
  * <p>
  * Constructor requires the confidence interval (alpha) parameter, and the
  * parameters are the number of positive (success) outcomes and the total
  * number of observations. The UDF returns the (lower,upper) confidence
- * interval. 
- * <p>
+ * interval.
+ * </p>
+ *
  * Example:
  * <pre>
  * {@code
@@ -54,7 +56,7 @@ import datafu.pig.util.SimpleEvalFunc;
  * quux = ORDER bar BY score DESC;
  * top = LIMIT quux 10;
  * }
- * </pre></p>
+ * </pre>
  */
 public class WilsonBinConf extends SimpleEvalFunc<Tuple>
 {
@@ -82,6 +84,7 @@ public class WilsonBinConf extends SimpleEvalFunc<Tuple>
    * @param x The number of positive (success) outcomes
    * @param n The number of observations
    * @return The (lower,upper) confidence interval
+   * @throws IOException if an error occurs while computing the confidence interval
    */
   public Tuple binconf(Long x, Long n) throws IOException
   {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java b/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java
index 26b743e..2f0f148 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java
@@ -34,34 +34,38 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
 
 
 /**
- * Calculate conditional entropy H(Y|X) of random variables X and Y following conditional entropy's 
- * {@link <a href="http://en.wikipedia.org/wiki/Conditional_entropy" target="_blank">wiki definition</a>}, 
+ * Calculate conditional entropy H(Y|X) of random variables X and Y following conditional entropy's
+ * <a href="http://en.wikipedia.org/wiki/Conditional_entropy" target="_blank">wiki definition</a>,
  * X is the conditional variable and Y is the variable that conditions on X.
+ *
  * <p>
  * Each tuple of the input bag has 2 fields, the 1st field is an object instance of variable X and
  * the 2nd field is an object instance of variable Y. An exception will be thrown if the number of fields is not 2.
- * </p> 
+ * </p>
+ *
  * <p>
  * This UDF's constructor definition and parameters are the same as that of {@link datafu.pig.stats.entropy.Entropy}
  * </p>
- * <p>
+ *
  * Note:
  * <ul>
  *     <li>The input bag to this UDF must be <b>sorted</b> on X and Y, with X in the first sort order.
  *     An exception will be thrown if the input bag is not sorted.
  *     <li>The returned entropy value is of double type.
  * </ul>
- * </p>
+ *
  * <p>
- * How to use: 
+ * How to use:
  * </p>
+ *
  * <p>
  * This UDF calculates conditional entropy given raw data tuples of X and Y without the need to pre-compute per tuple occurrence frequency.
  * </p>
+ *
  * <p>
  * It could be used in a nested FOREACH after a GROUP BY, in which we sort the inner bag and use the sorted bag as this UDF's input.
  * </p>
- * <p>
+ *
  * Example:
  * <pre>
  * {@code
@@ -79,21 +83,20 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
  * }
  * }
  * </pre>
- * </p>
+ *
  * Use case to calculate mutual information:
- * <p>
  * <pre>
  * {@code
  * ------------
  * -- calculate mutual information I(X, Y) using conditional entropy UDF and entropy UDF
  * -- I(X, Y) = H(Y) - H(Y|X)
  * ------------
- * 
+ *
  * define CondEntropy datafu.pig.stats.entropy.CondEntropy();
  * define Entropy datafu.pig.stats.entropy.Entropy();
- * 
+ *
  * input = LOAD 'input' AS (grp: chararray, valX: double, valY: double);
- * 
+ *
  * -- calculate the I(X,Y) in each group
  * input_group_g = GROUP input BY grp;
  * mutual_information = FOREACH input_group_g {
@@ -107,7 +110,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
  * }
  * }
  * </pre>
- * </p>
  * @see Entropy
  */
 public class CondEntropy extends AccumulatorEvalFunc<Double> {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java b/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java
index 388b80f..1a6b846 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java
@@ -41,37 +41,40 @@ import datafu.pig.stats.entropy.EntropyUtil;
 
 /**
  * Calculate the empirical entropy of random variable X given its occurrence frequencies, following entropy's
- * {@link <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a>}
+ * <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a>.
+ *
  * <p>
  * This UDF's constructor takes 1 argument: the logarithm base, whose definition is the same as that defined in {@link datafu.pig.stats.entropy.Entropy}
  * </p>
- * <p>
- * Note: 
+ *
+ * Note:
  * <ul>
  *     <li>Unlike {@link datafu.pig.stats.entropy.Entropy}, which calculates entropy from sorted raw data bag in accumulative mode,
- *     this UDF calculates entropy from the data's occurrence frequencies which does not need to be sorted, either in accumulative or algebraic mode.
+ *     this UDF calculates entropy from the data's occurrence frequencies which does not need to be sorted, either in accumulative or algebraic mode.</li>
  *     <li>Each tuple of the UDF's input bag <b>must only</b> have 1 field, the occurrence frequency of a data instance,
- *     and the data type of this field <b>must</b> be int or long. Otherwise, an exception will be thrown.
- *     <li>Negative frequency number will be silently discarded and a warning message will be logged in the job's log file.
- *     <li>The returned entropy value is of double type.
+ *     and the data type of this field <b>must</b> be int or long. Otherwise, an exception will be thrown.</li>
+ *     <li>Negative frequency number will be silently discarded and a warning message will be logged in the job's log file.</li>
+ *     <li>The returned entropy value is of double type.</li>
  * </ul>
- * </p>
+ *
  * <p>
- * How to use: 
+ * How to use:
  * </p>
+ *
  * <p>
  * To use this UDF, customer needs to pre-compute the occurrence frequency of each data instance, often in an outer GROUP BY
  * , and then use this UDF to calculate entropy with those frequency numbers in another outer GROUP BY.
  * </p>
+ *
  * <p>
  * Compared with {@link datafu.pig.stats.entropy.Entropy}, this UDF is more scalable when we need to handle a very large data set, 
  * since it could distribute computation onto mappers and take advantage of combiners to reduce intermedidate output from mappers to reducers.
  * </p>
- * <p>
+ *
  * Example:
  * <pre>
  * {@code
- * 
+ *
  * define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
  *
  * input = LOAD 'input' AS (val: double);
@@ -79,48 +82,48 @@ import datafu.pig.stats.entropy.EntropyUtil;
  * -- calculate the occurrence of each instance
  * counts_g = GROUP input BY val;
  * counts = FOREACh counts_g GENERATE COUNT(input) AS cnt;
- * 
- * -- calculate entropy 
+ *
+ * -- calculate entropy
  * input_counts_g = GROUP counts ALL;
  * entropy = FOREACH input_counts_g GENERATE Entropy(counts) AS entropy;
  * }
  * </pre>
- * </p>
+ *
  * Use case to calculate mutual information using EmpiricalCountEntropy:
- * <p>
+ *
  * <pre>
  * {@code
- * 
+ *
  * define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
- * 
+ *
  * input = LOAD 'input' AS (valX: double, valY: double);
- * 
+ *
  * ------------
  * -- calculate mutual information I(X, Y) using entropy
  * -- I(X, Y) = H(X) + H(Y) -  H(X, Y)
  * ------------
- * 
+ *
  * input_x_y_g = GROUP input BY (valX, valY);
  * input_x_y_cnt = FOREACH input_x_y_g GENERATE flatten(group) as (valX, valY), COUNT(input) AS cnt;
- * 
+ *
  * input_x_g = GROUP input_x_y_cnt BY valX;
  * input_x_cnt = FOREACH input_x_g GENERATE flatten(group) as valX, SUM(input_x_y_cnt.cnt) AS cnt;
- * 
+ *
  * input_y_g = GROUP input_x_y_cnt BY valY;
  * input_y_cnt = FOREACH input_y_g GENERATE flatten(group) as valY, SUM(input_x_y_cnt.cnt) AS cnt;
- * 
+ *
  * input_x_y_entropy_g = GROUP input_x_y_cnt ALL;
  * input_x_y_entropy = FOREACH input_x_y_entropy_g {
  *                         input_x_y_entropy_cnt = input_x_y_cnt.cnt;
  *                         GENERATE Entropy(input_x_y_entropy_cnt) AS x_y_entropy;
  *                     }
- *                         
+ *
  * input_x_entropy_g = GROUP input_x_cnt ALL;
  * input_x_entropy = FOREACH input_x_entropy_g {
  *                         input_x_entropy_cnt = input_x_cnt.cnt;
  *                         GENERATE Entropy(input_x_entropy_cnt) AS x_entropy;
  *                   }
- *                       
+ *
  * input_y_entropy_g = GROUP input_y_cnt ALL;
  * input_y_entropy = FOREACH input_y_entropy_g {
  *                         input_y_entropy_cnt = input_y_cnt.cnt;
@@ -133,7 +136,6 @@ import datafu.pig.stats.entropy.EntropyUtil;
  *                                             input_x_y_entropy::x_y_entropy) AS mi;
  * }
  * </pre>
- * </p>
  * @see Entropy
  */
 

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java b/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java
index 9dfff1a..efa1d35 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java
@@ -32,53 +32,57 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
 
 
 /**
- * Calculate entropy H(X) of random variable X following entropy's 
- * {@link <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a>}
+ * Calculate entropy H(X) of random variable X following entropy's
+ * <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a>
+ *
  * <p>
  * This UDF's constructor takes 2 arguments. 
  * </p>
- * <p>
+ *
  * The 1st argument, the type of entropy estimator algorithm we currently support, includes:
  * <ul>
  *     <li>empirical (empirical entropy estimator)
  *     <li>chaosh (Chao-Shen entropy estimator) 
  * </ul>
- * </p>
+ *
  * <p>
  * The default estimation algorithm is empirical.
  * </p>
+ *
  * <p>
  * The 2nd argument, the logarithm base we currently support, includes:
  * </p>
- * <p>
+ *
  * <ul>
  *     <li>log (use Euler's number as the logarithm base)
  *     <li>log2 (use 2 as the logarithm base)
  *     <li>log10 (use 10 as the logarithm base) 
  * </ul>
- * </p>
+ *
  * <p>
  * The default logarithm base is log.
  * </p> 
- * <p>
+ *
  * Note:
  * <ul>
  *     <li>The input to this UDF must be a <b>sorted</b> bag of raw data tuples of X.
  *     An exception will be thrown if the input bag is not sorted 
  *     <li>The returned entropy value is of double type.
  * </ul>
- * </p>
+ *
  * <p>
  * How to use: 
  * </p>
+ *
  * <p>
  * This UDF calculates entropy from raw data tuples without the need to pre-compute per tuple occurrence frequency.
  * </p>
+ *
  * <p>
  * It could be used in a nested FOREACH after a GROUP BY, in which we sort the inner bag and use the sorted bag as this UDF's input.
  * </p>
+ *
  * Example:
- * <p>
  * <pre>
  * {@code
  * --calculate empirical entropy with Euler's number as the logarithm base
@@ -95,7 +99,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
  * }
  * }
  * </pre>
- * </p>
  * @see CondEntropy
  * @see EmpiricalCountEntropy
  */

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java b/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java
index ee2c3f3..3dd4829 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java
@@ -49,8 +49,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
  * such as transposing two fields of the same type.  If this contract is violated, say, by attempting to reference 
  * a field that is not present, a meaningful error message may be thrown.
  * </p>
- * 
- * <p>
+ *
  * Example:  This example computes the monthly payments for mortgages depending on interest rate.
  * <pre>
  * {@code
@@ -58,11 +57,11 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
  *    ...
  *    public DataBag exec(Tuple input) throws IOException {
  *      DataBag output = BagFactory.getInstance().newDefaultBag();
- *      
+ *
  *      Double principal = getDouble(input, "principal"); // get a value from the input tuple by alias
  *      Integer numPayments = getInteger(input, "num_payments");
  *      DataBag interestRates = getBag(input, "interest_rates");
- *    
+ *
  *      for (Tuple interestTuple : interestRates) {
  *        Double interest = getDouble(interestTuple, getPrefixedAliasName("interest_rates", "interest_rate"));  // get a value from the inner bag tuple by alias
  *        double monthlyPayment = computeMonthlyPayment(principal, numPayments, interest);
@@ -73,11 +72,10 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
  *  }
  * }
  * </pre>
- * </p>
- * 
+ *
  * @author wvaughan
  *
- * @param <T>
+ * @param <T> type that the eval func returns
  */
 public abstract class AliasableEvalFunc<T> extends ContextualEvalFunc<T>
 {
@@ -101,8 +99,8 @@ public abstract class AliasableEvalFunc<T> extends ContextualEvalFunc<T>
   /**
    * Specify the output schema as in {link EvalFunc#outputSchema(Schema)}.
    * 
-   * @param input
-   * @return outputSchema
+   * @param input input schema
+   * @return the output schema
    */
   public abstract Schema getOutputSchema(Schema input);
 
@@ -151,10 +149,10 @@ public abstract class AliasableEvalFunc<T> extends ContextualEvalFunc<T>
   }
   
   /**
-   * Field aliases are generated from the input schema<br/>
-   * Each alias maps to a bag position<br/>
+   * Field aliases are generated from the input schema.
+   * Each alias maps to a bag position.
    * Inner bags/tuples will have alias of outer.inner.foo
-   * 
+   *
    * @return A map of field alias to field position
    */
   public Map<String, Integer> getFieldAliases()

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java b/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java
index 16f9247..27ae134 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
- 
+
 package datafu.pig.util;
 
 import java.io.IOException;
@@ -26,28 +26,26 @@ import org.apache.pig.data.Tuple;
 
 /**
  * Filter function which asserts that a value is true.
- * 
+ *
  * <p>
  * Unfortunately, the Pig interpreter doesn't recognize boolean expressions nested as function
  * arguments, so this uses C-style booleans.  That is, the first argument should be
  * an integer.  0 is interpreted as "false", and anything else is considered "true".
  * The function will cause the Pig script to fail if a "false" value is encountered.
  * </p>
- * 
+ *
  * <p>
  * There is a unary and a binary version. The unary version just takes a boolean, and throws out a generic exception message when the
  * assertion is violated.  The binary version takes a String as a second argument and throws that out when the assertion
  * is violated.
  * </p>
- * 
- * <p>
+ *
  * Example:
  * <pre>
  * {@code
  * FILTER members BY AssertUDF( (member_id >= 0 ? 1 : 0), 'Doh! Some member ID is negative.' );
  * }
  * </pre>
- * </p>
  */
 public class AssertUDF extends FilterFunc
 {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java b/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java
index f8e25f4..855b305 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java
@@ -27,9 +27,8 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
 import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
 
 /**
- * Returns the first non-null value from a tuple, just like {@link <a href="http://msdn.microsoft.com/en-us/library/ms190349.aspx" target="_blank">COALESCE</a>} in SQL. 
- * 
- * <p>
+ * Returns the first non-null value from a tuple, just like <a href="http://msdn.microsoft.com/en-us/library/ms190349.aspx" target="_blank">COALESCE</a> in SQL. 
+ *
  * Example:
  * <pre>
  * {@code
@@ -44,10 +43,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
  *
  * }
  * </pre>
- * </p>
- * 
- * @author "Matthew Hayes <mhayes@linkedin.com>"
- *
  */
 public class Coalesce extends AliasableEvalFunc<Object>
 {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java b/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java
index c534b77..5e26ac1 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java
@@ -29,8 +29,8 @@ import org.apache.pig.impl.util.UDFContext;
  * on the front end which will be available on the back end.
  * For example, properties may be set in the call to outputSchema(),
  * which will be available when exec() is called.
- * 
- * @param <T>
+ *
+ * @param <T> the type the eval function returns
  */
 public abstract class ContextualEvalFunc<T> extends EvalFunc<T>
 {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java b/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java
index 0066aa8..9aa80ff 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java
@@ -60,7 +60,7 @@ public class DataFuException extends RuntimeException
   /**
    * Gets data relevant to this exception.
    * 
-   * @return data
+   * @return the data relevant to this exception
    */
   public Object getData()
   {
@@ -70,7 +70,7 @@ public class DataFuException extends RuntimeException
   /**
    * Sets field aliases for a UDF which may be relevant to this exception.
    * 
-   * @param fieldAliases
+   * @param fieldAliases field aliases
    */
   public void setFieldAliases(Map<String, Integer> fieldAliases)
   {
@@ -79,7 +79,7 @@ public class DataFuException extends RuntimeException
 
   /**
    * Sets data relevant to this exception.
-   * @param data
+   * @param data data relevant to this exception
    */
   public void setData(Object data)
   {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/InUDF.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/InUDF.java b/datafu-pig/src/main/java/datafu/pig/util/InUDF.java
index 5057285..08d81af 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/InUDF.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/InUDF.java
@@ -25,26 +25,24 @@ import org.apache.pig.FilterFunc;
 import org.apache.pig.data.Tuple;
 
 /**
- * Similar to the SQL IN function, this function provides a convenient way to filter 
- * using a logical disjunction over many values. 
+ * Similar to the SQL IN function, this function provides a convenient way to filter
+ * using a logical disjunction over many values.
  * Returns true when the first value of the tuple is contained within the remainder of the tuple.
- * 
- * <p>
+ *
  * Example:
  * <pre>
  * {@code
  * define In datafu.pig.util.InUDF();
  * -- cars: (alice, red), (bob, blue), (charlie, green), (dave, red);
  * cars = LOAD cars AS (owner:chararray, color:chararray);
- * 
+ *
  * -- cars: (alice, red), (bob, blue), (dave, red);
  * red_blue_cars = FILTER cars BY In(color, 'red', 'blue');
- * 
- * }</pre>
- * </p>
- * 
- * @author wvaughan
  *
+ * }
+ * </pre>
+ *
+ * @author wvaughan
  */
 public class InUDF extends FilterFunc
 {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java b/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java
index f8a39df..d7c2592 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java
@@ -34,8 +34,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
 /**
  * Performs a transpose on a tuple, resulting in a bag of key, value fields where
  * the key is the column name and the value is the value of that column in the tuple.
- * 
- * <p>
+ *
  * Example:
  * <pre>
  * {@code
@@ -50,10 +49,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
  *
  * }
  * </pre>
- * </p>
- * 
- * @author "William Vaughan <wvaughan@linkedin.com>"
- *
  */
 public class TransposeTupleToBag extends AliasableEvalFunc<DataBag>
 {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java
index ac3e409..f652101 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package datafu.test.pig.hash.lsh;
 
 import java.io.IOException;

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java
index be64bc8..a6615ed 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package datafu.test.pig.hash.lsh;
 
 import java.util.ArrayList;

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java b/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java
index 99af987..fd7ff05 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package datafu.test.pig.util;
 
 import org.adrianwalker.multilinestring.Multiline;

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle.properties
----------------------------------------------------------------------
diff --git a/gradle.properties b/gradle.properties
index 33df918..648af68 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,2 +1,4 @@
 group=org.apache.datafu
-version=1.2.1
+version=1.3.0-SNAPSHOT
+gradleVersion=1.12
+org.gradle.jvmargs="-XX:MaxPermSize=512m"

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle/buildscript.gradle
----------------------------------------------------------------------
diff --git a/gradle/buildscript.gradle b/gradle/buildscript.gradle
index 225e0a8..669eb6e 100644
--- a/gradle/buildscript.gradle
+++ b/gradle/buildscript.gradle
@@ -1,7 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 repositories {
   repositories {
-    // For license plugin.
     maven {
+      // For gradle-nexus-plugin
+      url 'http://jcenter.bintray.com/'
+    }
+    maven {
+      // For license plugin.
       url 'http://dl.bintray.com/content/netflixoss/external-gradle-plugins/'
     }
   }
@@ -9,4 +32,5 @@ repositories {
 
 dependencies {
   classpath 'nl.javadude.gradle.plugins:license-gradle-plugin:0.6.1'
+  classpath 'org.gradle.api.plugins:gradle-nexus-plugin:0.7.1'
 }

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle/dependency-versions.gradle
----------------------------------------------------------------------
diff --git a/gradle/dependency-versions.gradle b/gradle/dependency-versions.gradle
index eb24e4a..3b0835f 100644
--- a/gradle/dependency-versions.gradle
+++ b/gradle/dependency-versions.gradle
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
 ext {
   antlrVersion="3.2"
   avroVersion="1.7.4"

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle/release.gradle
----------------------------------------------------------------------
diff --git a/gradle/release.gradle b/gradle/release.gradle
new file mode 100644
index 0000000..c52b69c
--- /dev/null
+++ b/gradle/release.gradle
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+project(':') {
+  apply plugin: 'base'
+  apply plugin: 'signing'
+}
+
+task sourceRelease(type: Tar) {
+  description = "Build a source release, specifically excluding the build directories and gradle wrapper files"
+  compression = Compression.GZIP
+
+  baseName = "datafu-sources-${project.version}-incubating"
+
+  from(project.rootDir) {
+    exclude '**/build'
+    exclude 'build'
+    exclude '.gradle'
+    exclude 'gradlew'
+    exclude 'gradlew.bat'
+    exclude 'gradle/wrapper/gradle-wrapper.jar'
+    exclude 'gradle/wrapper/gradle-wrapper.properties'
+  }
+
+  into(baseName)
+
+  // Set destination directory.
+  destinationDir = file("${project.buildDir}/distribution/source")
+
+  archiveName = "${baseName}.tgz"
+  doLast { // generate md5 checksum
+    ant.checksum file:"$destinationDir/$archiveName"
+  }
+}
+
+signing {
+  // TODO: this doesn't show up in the 'tasks' for some reason, need to figure out why.
+  // This creates a task 'signSourceRelease' that builds the source release and signs it.
+  sign sourceRelease
+}
+
+// Publishing to Apache's Maven repository (Nexus). To install the archives in the
+// local repository, run the 'install' task.
+subprojects {
+  apply plugin: 'nexus'
+
+  nexus {
+    attachSources = true
+    attachTests = false
+    attachJavadoc = true
+    sign = true
+    repositoryUrl = 'https://repository.apache.org/service/local/staging/deploy/maven2'
+    snapshotRepositoryUrl = 'https://repository.apache.org/content/repositories/snapshots'
+  }
+
+  modifyPom {
+    project {
+      name 'Apache DataFu (incubating)'
+      description 'Libraries that make it easier to solve data problems using Hadoop and higher level languages based on it.'
+      url 'http://datafu.incubator.apache.org/'
+
+      scm {
+        url 'https://git-wip-us.apache.org/repos/asf?p=incubator-datafu.git;a=tree'
+        connection 'scm:http://git-wip-us.apache.org/repos/asf/incubator-datafu.git'
+        developerConnection 'scm:https://git-wip-us.apache.org/repos/asf/incubator-datafu.git'
+      }
+
+      licenses {
+        license {
+          name 'The Apache Software License, Version 2.0'
+          url 'http://www.apache.org/licenses/LICENSE-2.0.txt'
+        }
+      }
+    }
+  }
+}


Mime
View raw message