http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
index d01f9fb..f7caafd 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java
@@ -22,12 +22,14 @@ package datafu.pig.hash.lsh.metric;
import org.apache.commons.math.linear.RealVector;
/**
* A UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
- * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away.
- *
- * It returns one of the tuples of the bag of vectors using {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L2 distance</a>},
+ * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away.
+ *
+ * <p>
+ * It returns one of the tuples of the bag of vectors using <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L2 distance</a>,
* distance between two vectors. This is otherwise known as
* the Euclidean distance.
- *
+ * </p>
+ *
* @see datafu.pig.hash.lsh.L2PStableHash L2PStableHash for an example
* @author cstella
*
@@ -37,7 +39,7 @@ public class L2 extends MetricUDF {
/**
* Create a new L2 Metric UDF with a given dimension.
*
- * @param sDim
+ * @param sDim dimension
*/
public L2(String sDim) {
super(sDim);
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
index da00a60..cb6efbb 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java
@@ -34,10 +34,12 @@ import datafu.pig.hash.lsh.util.DataTypeUtil;
/**
* A base UDF used to find a vector v in a bag such that for query point q, metric m and threshold t
- * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away.
- *
- * It returns one of the tuples of the bag of vectors. For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash.
- *
+ * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away.
+ *
+ * <p>
+ * It returns one of the tuples of the bag of vectors. For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash.
+ * </p>
+ *
* @see datafu.pig.hash.lsh.CosineDistanceHash
* @author cstella
*
@@ -49,7 +51,7 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
/**
* Create a new Metric UDF with a given dimension.
*
- * @param sDim
+ * @param sDim dimension
*/
public MetricUDF(String sDim)
{
@@ -58,8 +60,8 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
/**
* The distance metric used. Given v1 and v2, compute the distance between those vectors.
- * @param v1 vector
- * @param v2 vector
+ * @param v1 first vector
+ * @param v2 second vector
* @return the distance between v1 and v2
*/
protected abstract double dist(RealVector v1, RealVector v2);
@@ -68,9 +70,11 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
* This UDF expects a query vector as the first element, a threshold (double) as the second, and a bag of vectors.
* Vectors are represented by tuples with doubles as elements or bags of tuples representing position and value
* in the case of sparse vectors.
- *
+ *
+ * <p>
* It returns one of the tuples of the bag of vectors. For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash.
- *
+ * </p>
+ *
* @see datafu.pig.hash.lsh.CosineDistanceHash
*/
@Override
@@ -109,10 +113,10 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
}
return null;
}
-
+
/**
* Create the output schema, based on the input schema.
- *
+ *
* @return the output schema, which is a tuple matching the schema of the third input field.
*/
public Schema outputSchema(Schema input) {
@@ -120,15 +124,14 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
validateInputSchema(input);
FieldSchema fieldSchema = input.getField(2);
return fieldSchema.schema;
-
- }catch (Exception e){
+ }catch (Exception e) {
throw new RuntimeException("Unable to create output schema", e);
}
}
-
+
/**
* Validate the input schema to ensure that our input is consistent and that we fail fast.
- * @param input
+ * @param input input schema
* @throws FrontendException
*/
private void validateInputSchema(Schema input) throws FrontendException
@@ -140,18 +143,18 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
throw new FrontendException("Invalid vector element: Expected either a tuple or a bag, but found " + vectorSchema);
}
}
-
+
{
FieldSchema distanceSchema = input.getField(1);
- if(distanceSchema.type != DataType.DOUBLE
- && distanceSchema.type != DataType.INTEGER
- && distanceSchema.type != DataType.LONG
+ if(distanceSchema.type != DataType.DOUBLE
+ && distanceSchema.type != DataType.INTEGER
+ && distanceSchema.type != DataType.LONG
)
{
throw new FrontendException("Invalid distance element: Expected a number, but found " + distanceSchema);
}
}
-
+
{
FieldSchema pointsSchema = input.getField(2);
if( pointsSchema.type != DataType.BAG)
@@ -166,5 +169,4 @@ public abstract class MetricUDF extends EvalFunc<Tuple>
}
}
}
-
}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
index df8dbc7..ce48880 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java
@@ -18,7 +18,7 @@
*/
/**
- * UDFs for different {@link <a href="http://en.wikipedia.org/wiki/Metric_(mathematics)" target="_blank">distance functions</a>} (and some similarity functions)
+ * UDFs for different <a href="http://en.wikipedia.org/wiki/Metric_(mathematics)" target="_blank">distance functions</a> (and some similarity functions)
* used with Locality Sensitive Hashing.
*/
package datafu.pig.hash.lsh.metric;
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
index 0f3ba94..21b7306 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java
@@ -29,22 +29,25 @@ import datafu.pig.hash.lsh.interfaces.Sampler;
/**
* This is the base-class for all p-stable based locality sensitive hashes. p-stable locality sensitive
- * hashes are defined by a few parameters: a dimension, d , a vector taken from a
- * {@link <a href="http://en.wikipedia.org/wiki/Stable_distribution" target="_blank">k-stable distribution</a>}
+ * hashes are defined by a few parameters: a dimension, d , a vector taken from a
+ * <a href="http://en.wikipedia.org/wiki/Stable_distribution" target="_blank">k-stable distribution</a>
* (where k is 1 or 2) and a width of projection, w.
+ *
* <p>
* All p-stable LSH functions are parameterized with a quantization parameter (w or r in
* the literature , depending on where you look). Consider the following excerpt
* from Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004).
* "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions".
* Proceedings of the Symposium on Computational Geometry.
- *
- * <pre>
- * Decreasing the width of the projection (w) decreases the probability of collision for any two points.
+ * </p>
+ *
+ * <p>
+ * Decreasing the width of the projection (w) decreases the probability of collision for any two points.
* Thus, it has the same effect as increasing k . As a result, we would like to set w as small as possible
* and in this way decrease the number of projections we need to make.
- * </pre>
- *
+ * </p>
+ *
+ * <p>
* In the literature, the quantization parameter (or width of the projection) is
* found empirically given a sample of the data and the likely threshold for
* the metric. Tuning this parameter is very important for the performance of
@@ -52,24 +55,23 @@ import datafu.pig.hash.lsh.interfaces.Sampler;
* P.; Mirrokni, V.S. (2004).
* "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions".
* Proceedings of the Symposium on Computational Geometry.
- *
+ * </p>
+ *
* @author cstella
- *
+ *
*/
public abstract class AbstractStableDistributionFunction extends LSH
{
-
private double[] a;
private double b;
double w;
-
/**
* Constructs a new instance.
* @param dim The dimension of the vectors to be hashed
* @param w A double representing the quantization parameter (also known as the projection width)
- * @param rand The random generator used
- * @throws MathException
+ * @param rand The random generator used
+ * @throws MathException MathException
*/
public AbstractStableDistributionFunction(int dim, double w, RandomGenerator rand) throws MathException
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
index 79bf7e5..5ac2bfc 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java
@@ -28,7 +28,7 @@ import datafu.pig.hash.lsh.interfaces.Sampler;
/**
* A locality sensitive hash associated with the L1 metric. This uses a 1-stable distribution
* to construct the hash.
- *
+ *
* @author cstella
*
*/
@@ -36,25 +36,29 @@ public class L1LSH extends AbstractStableDistributionFunction implements Sampler
{
/**
* Constructs a new instance.
- * @throws MathException
+ *
+ * @param dim The dimension of the vectors to be hashed
+ * @param w A double representing the quantization parameter (also known as the projection width)
+ * @param rand The random generator used
+ * @throws MathException MathException
*/
- public L1LSH(int dim, double d, RandomGenerator rand) throws MathException {
- super(dim, d, rand);
+ public L1LSH(int dim, double w, RandomGenerator rand) throws MathException {
+ super(dim, w, rand);
}
/**
* Draw a sample s ~ Cauchy(0,1), which is 1-stable.
- *
+ *
+ * @param randomData random data generator
* @return a sample from a cauchy distribution with median 0 and scale 1
+ * @throws MathException MathException
*/
public double sample(RandomDataImpl randomData) throws MathException {
-
return randomData.nextCauchy(0, 1);
-
}
+
@Override
protected Sampler getSampler() {
return this;
}
-
}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
index d18b189..95a487a 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java
@@ -28,15 +28,17 @@ import datafu.pig.hash.lsh.interfaces.Sampler;
/**
* A locality sensitive hash associated with the L2 metric. This uses a 2-stable distribution
* to construct the hash.
- *
- * @author cstella
*
+ * @author cstella
*/
public class L2LSH extends AbstractStableDistributionFunction implements Sampler {
/**
* Constructs a new instance.
- * @throws MathException
+ * @param dim the dimension of the vectors to be hashed
+ * @param w a double representing the quantization parameter (also known as the projection width)
+ * @param rand the random generator
+ * @throws MathException MathException
*/
public L2LSH(int dim, double w, RandomGenerator rand) throws MathException {
super(dim, w, rand);
@@ -44,7 +46,8 @@ public class L2LSH extends AbstractStableDistributionFunction implements Sampler
/**
* Draw a sample s ~ Gaussian(0,1), which is 2-stable.
- *
+ *
+ * @param randomData random data generator
* @return a sample from a Gaussian distribution with mu of 0 and sigma of 1
*/
public double sample(RandomDataImpl randomData)
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
index ec9c313..7d7bb65 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java
@@ -18,8 +18,8 @@
*/
/**
- * Implementation of {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>} for
- * {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L1 and L2 metrics</a>}.
+ * Implementation of <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a> for
+ * <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L1 and L2 metrics</a>.
*
* See Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry.
*
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
index 045ed0d..912d72e 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java
@@ -18,6 +18,6 @@
*/
/**
- * UDFs for {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>}
+ * UDFs for <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>.
*/
package datafu.pig.hash.lsh;
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java
index da30179..6f09941 100644
--- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java
+++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java
@@ -40,15 +40,17 @@ public enum DataTypeUtil {
/**
* Convert a tuple t into a RealVector of dimension dim.
* The tuple can be of a couple of forms:
+ *
* <ul>
* <li>A tuple composed of dim numeric types a la (1.0,2.0,3,5.0)</li>
* <li>A tuple which contains as its first element a tuple like above a la ( (1.0,2.0,3,5.0), 5) ) would yield (1.0,2.0,3,5.0)</li>
* <li>A bag containing tuples where the first element is the position and the second element is the value. This is for sparse vectors and it looks like this ( { (0,1.0), (1, 2.0), (3,3), (4,5.0) } ).</li>
* </ul>
+ *
* @param t The tuple to convert to a vector
* @param dim The dimension of the vector
* @return The actual RealVector (which may or may not be sparse)
- * @throws PigException
+ * @throws PigException PigException
*/
public RealVector convert(Tuple t, int dim) throws PigException
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java
index 80ff567..423f890 100644
--- a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java
+++ b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java
@@ -39,39 +39,39 @@ import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
- * A UDF which implements {@link <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>}.
- *
- * <p>
- * This is not a distributed implementation. Each graph is stored in memory while running the algorithm, with edges optionally
+ * A UDF which implements <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>.
+ *
+ * <p>
+ * This is not a distributed implementation. Each graph is stored in memory while running the algorithm, with edges optionally
* spilled to disk to conserve memory. This can be used to distribute the execution of PageRank on multiple
* reasonably sized graphs. It does not distribute execuion of PageRank for each individual graph. Each graph is identified
* by an integer valued topic ID.
* </p>
- *
+ *
* <p>
* If the graph is too large to fit in memory than an alternative method must be used, such as an iterative approach which runs
* many MapReduce jobs in a sequence to complete the PageRank iterations.
* </p>
- *
+ *
* <p>
* Each graph is represented through a bag of (source,edges) tuples. The 'source' is an integer ID representing the source node.
* The 'edges' are the outgoing edges from the source node, represented as a bag of (dest,weight) tuples. The 'dest' is an
* integer ID representing the destination node. The weight is a double representing how much the edge should be weighted.
* For a standard PageRank implementation just use weight of 1.0.
* </p>
- *
+ *
* <p>
* The output of the UDF is a bag of (source,rank) pairs, where 'rank' is the PageRank value for that source in the graph.
* </p>
- *
+ *
* <p>
* There are several configurable options for this UDF, among them:
- * <p>
- *
+ * </p>
+ *
* <ul>
* <li>
* <b>alpha</b>: Controls the PageRank alpha value. The default is 0.85. A higher value reduces the "random jump"
- * factor and causes the rank to be influenced more by edges.
+ * factor and causes the rank to be influenced more by edges.
* </li>
* <li>
* <b>max_iters</b>: The maximum number of iterations to run. The default is 150.
@@ -83,8 +83,8 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
* <li>
* <b>tolerance</b>: A threshold which causes iterations to cease. It is measured from the total change in ranks from each of
* the nodes in the graph. As the ranks settle on their final values the total change decreases. This can be used
- * to stop iterations early. The default is 1e-16.
- * </li>
+ * to stop iterations early. The default is 1e-16.
+ * </li>
* <li>
* <b>max_nodes_and_edges</b>: This is a control to prevent running out of memory. As a graph is loaded, if the sum of edges
* and nodes exceeds this value then it will stop. It will not fail but PageRank will not be run on this graph. Instead a null
@@ -92,52 +92,48 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
* </li>
* <li>
* <b>spill_to_edge_disk_storage</b>: Used to conserve memory. When "true" it causes the edge data to be written to disk in a temp file instead
- * of being held in memory when the number of edges exceeds a threshold. The nodes are still held in memory however.
+ * of being held in memory when the number of edges exceeds a threshold. The nodes are still held in memory however.
* Each iteration of PageRank will stream through the edges stored on disk. The default is "false".
* </li>
* <li>
* <b>max_edges_in_memory</b>: When spilling edges to disk is enabled, this is the threshold which triggers that behavior. The default is 30M.
* </li>
* </ul>
- *
+ *
* <p>
* Parameters are configured by passing them in as a sequence of pairs into the UDF constructor. For example, below the alpha value is set to
* 0.87 and dangling nodes are enabled. All arguments must be strings.
* </p>
- *
- * <p>
+ *
* <pre>
* {@code
* define PageRank datafu.pig.linkanalysis.PageRank('alpha','0.87','dangling_nodes','true');
* }
* </pre>
- * </p>
- *
- * <p>
+ *
* Full example:
* <pre>
* {@code
- *
+ *
* topic_edges = LOAD 'input_edges' as (topic:INT,source:INT,dest:INT,weight:DOUBLE);
- *
+ *
* topic_edges_grouped = GROUP topic_edges by (topic, source) ;
* topic_edges_grouped = FOREACH topic_edges_grouped GENERATE
* group.topic as topic,
* group.source as source,
* topic_edges.(dest,weight) as edges;
- *
- * topic_edges_grouped_by_topic = GROUP topic_edges_grouped BY topic;
- *
+ *
+ * topic_edges_grouped_by_topic = GROUP topic_edges_grouped BY topic;
+ *
* topic_ranks = FOREACH topic_edges_grouped_by_topic GENERATE
* group as topic,
* FLATTEN(PageRank(topic_edges_grouped.(source,edges))) as (source,rank);
*
* topic_ranks = FOREACH topic_ranks GENERATE
* topic, source, rank;
- *
+ *
* }
* </pre>
- * </p>
*/
public class PageRank extends AccumulatorEvalFunc<DataBag>
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java
index 5d0b932..7994b0e 100644
--- a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java
+++ b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java
@@ -39,9 +39,8 @@ import java.util.Map;
import com.google.common.collect.AbstractIterator;
/**
- * An implementation of {@link <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>}, used by the {@link PageRank} UDF.
- * It is not intended to be used directly.
- * </p>
+ * An implementation of <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>, used by the {@link PageRank} UDF.
+ * It is not intended to be used directly.
*/
public class PageRankImpl
{
@@ -104,7 +103,7 @@ public class PageRankImpl
/**
* Sets the page rank alpha value (default is 0.85);
- * @param alpha
+ * @param alpha page rank alpha value
*/
public void setAlpha(float alpha)
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/random/RandInt.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/random/RandInt.java b/datafu-pig/src/main/java/datafu/pig/random/RandInt.java
index de89c4a..ec8375a 100644
--- a/datafu-pig/src/main/java/datafu/pig/random/RandInt.java
+++ b/datafu-pig/src/main/java/datafu/pig/random/RandInt.java
@@ -35,10 +35,12 @@ import datafu.pig.util.SimpleEvalFunc;
public class RandInt extends SimpleEvalFunc<Integer>
{
private final Random rand = new Random();
-
+
/**
* @param min lower bound for random number
* @param max upper bound for random number
+ * @return random integer between min and max
+ * @throws IOException IOException
*/
public Integer call(Integer min, Integer max) throws IOException
{
@@ -60,6 +62,5 @@ public class RandInt extends SimpleEvalFunc<Integer>
{
return new Schema(new Schema.FieldSchema("rand", DataType.INTEGER));
}
-
}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java b/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java
index 90ea576..d94a038 100644
--- a/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java
+++ b/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java
@@ -30,24 +30,24 @@ import org.apache.pig.data.Tuple;
* This is essentially equivalent to grouping on the fields, applying SAMPLE,
* and then flattening. It is much more efficient though because it does not require
* a reduce step.
- *
+ *
* <p>
* The method of sampling is to convert the key to a hash, derive a double value
* from this, and then test this against a supplied probability. The double value
* derived from a key is uniformly distributed between 0 and 1.
* </p>
- *
+ *
* <p>
* The only required parameter is the sampling probability. This may be followed
* by an optional seed value to control the random number generation.
* </p>
- *
+ *
* <p>
* SampleByKey will work deterministically as long as the same seed is provided.
* </p>
- *
- * <p>
+ *
* Example:
+ *
* <pre>
* {@code
* DEFINE SampleByKey datafu.pig.sampling.SampleByKey('0.5');
@@ -58,12 +58,10 @@ import org.apache.pig.data.Tuple;
* output = FILTER data BY SampleByKey(A_id);
*
* --output: (B,1), (B,3)
- * }
- *
+ * }
* </pre>
- * </p>
- * @author evion
- *
+ *
+ * @author evion
*/
public class SampleByKey extends FilterFunc
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java
index 8e8debf..ee2e796 100644
--- a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java
+++ b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java
@@ -35,57 +35,69 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
* Scalable simple random sampling (ScaSRS).
- * <p/>
+ *
+ * <p>
* This UDF implements a scalable simple random sampling algorithm described in
- *
+ * </p>
+ *
* <pre>
* X. Meng, Scalable Simple Random Sampling and Stratified Sampling, ICML 2013.
* </pre>
- *
+ *
+ * <p>
* It takes a bag of n items and a sampling probability p as the inputs, and outputs a
* simple random sample of size exactly ceil(p*n) in a bag, with probability at least
* 99.99%. For example, the following script generates a simple random sample with
* sampling probability 0.1:
- *
+ * </p>
+ *
* <pre>
* DEFINE SRS datafu.pig.sampling.SimpleRandomSample();
- *
- * item = LOAD 'input' AS (x:double);
+ *
+ * item = LOAD 'input' AS (x:double);
* sampled = FOREACH (GROUP item ALL) GENERATE FLATTEN(SRS(item, 0.01));
* </pre>
- *
+ *
+ * <p>
* Optionally, user can provide a good lower bound of n as the third argument to help
* reduce the size of intermediate data, for example:
- *
+ * </p>
+ *
* <pre>
* DEFINE SRS datafu.pig.sampling.SimpleRandomSample();
- *
- * item = LOAD 'input' AS (x:double);
+ *
+ * item = LOAD 'input' AS (x:double);
* summary = FOREACH (GROUP item ALL) GENERATE COUNT(item) AS count;
* sampled = FOREACH (GROUP item ALL) GENERATE FLATTEN(SRS(item, 0.01, summary.count));
* </pre>
- *
+ *
+ * <p>
* This UDF is very useful for stratified sampling. For example, the following script
* keeps all positive examples while downsampling negatives with probability 0.1:
- *
+ * </p>
+ *
* <pre>
* DEFINE SRS datafu.pig.sampling.SimpleRandomSample();
- *
+ *
* item = LOAD 'input' AS (x:double, label:int);
- * grouped = FOREACH (GROUP item BY label) GENERATE item, (group == 1 ? 1.0 : 0.1) AS p;
+ * grouped = FOREACH (GROUP item BY label) GENERATE item, (group == 1 ? 1.0 : 0.1) AS p;
* sampled = FOREACH grouped GENERATE FLATTEN(SRS(item, p));
* </pre>
- *
+ *
+ * <p>
* In a Java Hadoop MapReduce job, we can output selected items directly using
* MultipleOutputs. However, this feature is not available in a Pig UDF. So we still let
* selected items go through the sort phase. However, as long as the sample size is not
* huge, this should not be a big problem.
- *
- * In the first version, the sampling probability is specified in the constructor. This
+ * </p>
+ *
+ * <p>
+ * In the first version, the sampling probability is specified in the constructor. This
* method is deprecated now and will be removed in the next release.
- *
+ * </p>
+ *
* @author ximeng
- *
+ *
*/
public class SimpleRandomSample extends AlgebraicEvalFunc<DataBag>
{
@@ -104,7 +116,8 @@ public class SimpleRandomSample extends AlgebraicEvalFunc<DataBag>
/**
* Constructs this UDF with a sampling probability.
- *
+ *
+ * @param samplingProbability sampling probability
* @deprecated Should specify the sampling probability in the function call.
*/
@Deprecated
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java
index 598e58c..0ae5950 100644
--- a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java
+++ b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java
@@ -40,11 +40,14 @@ import com.google.common.primitives.Ints;
/**
* Scalable simple random sampling with replacement (ScaSRSWR).
- * <p/>
+ *
+ * <p>
* This UDF together with {@link SimpleRandomSampleWithReplacementElect} implement a
* scalable algorithm for simple random sampling with replacement (SRSWR), which is a
* randomized algorithm with a failure rate less than {@value #FAILURE_RATE}.
- * <p/>
+ * </p>
+ *
+ * <p>
* Let s be the desired sample size. To compute an SRSWR sample of size s, for each output
* position in {0, 1, ..., s-1}, we want to select an item from the population uniformly
* at random. This algorithm consists of two stages: vote and election. In the vote stage,
@@ -52,7 +55,9 @@ import com.google.common.primitives.Ints;
* for each position. In the election stage, the paired UDF
* {@link SimpleRandomSampleWithReplacementElect} elects one candidate for each position.
* The algorithm succeeds if we have at least one candidate for each position.
- * <p/>
+ * </p>
+ *
+ * <p>
* To use this UDF pair, user needs to provide: 1) the desired sample size, 2) a good
* lower bound of the population size or the exact size. The input to the vote UDF
* {@link SimpleRandomSampleWithReplacementVote} is a tuple that consists of a bag of
@@ -62,47 +67,59 @@ import com.google.common.primitives.Ints;
* elect UDF {@link SimpleRandomSampleWithReplacementElect} is a tuple that contains all
* candidates voted by the vote UDF for some positions. The output from the elect UDF is a
* bag of sampled items.
- * <p/>
+ * </p>
+ *
+ * <p>
* For example, the following script generates a sample of size 100000 with replacement:
- *
+ * </p>
+ *
* <pre>
* DEFINE SRSWR_VOTE datafu.pig.sampling.SimpleRandomSampleWithReplacementVote();
* DEFINE SRSWR_ELECT datafu.pig.sampling.SimpleRandomSampleWithReplacementElect();
- *
- * item = LOAD 'input' AS (x:double);
+ *
+ * item = LOAD 'input' AS (x:double);
* summary = FOREACH (GROUP item ALL) GENERATE COUNT(item) AS count;
* candidates = FOREACH item GENERATE FLATTEN(SRSWR_VOTE(TOBAG(x), 100000, summary.count));
* sampled = FOREACH (GROUP candidates BY position PARALLEL 10) GENERATE FLATTEN(SRSWR_ELECT(candidates));
* </pre>
- *
+ *
+ * <p>
* Because for election we only need to group candidates voted for the same position, this
* algorithm can use many reducers to consume the candidates. See the "PARALLEL 10"
* statement above. If the item to sample is the entire row, use TOBAG(TOTUPLE(*)).
- * <p/>
+ * </p>
+ *
+ * <p>
* SRSWR is heavily used in bootstrapping. Bootstrapping can be done easily with this UDF
* pair. For example, the following script generates 100 bootstrap samples, computes the
* mean value for each sample, and then outputs the bootstrap estimates.
- *
+ * </p>
+ *
* <pre>
* summary = FOREACH (GROUP item ALL) GENERATE AVG(item.x) AS mean, COUNT(item) AS count;
* candidates = FOREACH item GENERATE FLATTEN(SRSWR_VOTE(TOBAG(x), summary.count*100, summary.count));
* sampled = FOREACH (GROUP candidates BY (position % 100) PARALLEL 10) GENERATE AVG(SRSWR_ELECT(candidates)) AS mean;
* bootstrap = FOREACH (GROUP sampled ALL) GENERATE summary.mean AS mean, sampled.mean AS bootstrapMeans;
* </pre>
- *
+ *
+ * <p>
* Another usage of this UDF pair is to generate random pairs or tuples without computing
* the cross product, where each pair or tuple consist of items from different input
* sources. Let s be the number of random tuples we want to generate. For each input
* source, simply use the vote UDF to propose candidates, then join the candidates from
* different sources by their positions and for each position use the elect UDF to select
* one candidate from each source to form the pair or tuple for that position.
- * <p/>
+ * </p>
+ *
+ * <p>
* The algorithm is a simple extension to the work
- *
+ * </p>
+ *
* <pre>
* X. Meng, Scalable Simple Random Sampling and Stratified Sampling, ICML 2013.
* </pre>
- *
+ *
+ * <p>
* Basically, for each output position, it performs a random sort on the population
* (associates each item with a random score independently drawn from the uniform
* distribution and then sorts items based on the scores), and picks the one that has the
@@ -110,34 +127,41 @@ import com.google.common.primitives.Ints;
* population. For example, if the population size is one billion and the random score
* generated for an item is 0.9, very likely it won't become the smallest and hence we do
* not need to propose it as a candidate.
- * <p/>
+ * </p>
+ *
+ * <p>
* More precisely, let n be the population size, n1 be a good lower bound of n, s be the
* sample size, delta be the failure rate, and q be the threshold. For each output
* position the probability of all random scores being greater than q is (1-q)^n. Thus, if
* we throw away items with associated scores greater than q, with probability at least 1
* - s*(1-q)^n, we can still capture the item with the smallest score for each position.
* Fix delta = s*(1-q)^n and solve for q, we get q = 1-exp(log(delta/s)/n), Note that
- * replacing n by n1 < n can only decrease the failure rate, though at the cost of
+ * replacing n by n1 < n can only decrease the failure rate, though at the cost of
* increased number of candidates. The expected number of candidates is (1 -
* exp(log(delta/s)/n1)*s*n. When n1 equals n, this number is approximately
* s*log(s/delta).
- * <p/>
+ * </p>
+ *
+ * <p>
* Generating a random score for each (item, position) pair is very expensive and
* unnecessary. For each item, the number of positions for which it gets voted follows a
* binomial distribution B(s,q). We can simply draw a number from this distribution,
* determine the positions by sampling without replacement, and then generate random
* scores for those positions. This reduces the running time significantly.
- * <p/>
+ * </p>
+ *
+ * <p>
* Since for each position we only need the candidate with the smallest score, we
* implement a combiner to reduce the size of intermediate data in the elect UDF
+ * </p>
+ *
* {@link SimpleRandomSampleWithReplacementElect}.
- *
+ *
* @see SimpleRandomSampleWithReplacementElect
- * @see <a href="http://en.wikipedia.org/wiki/Bootstrapping_(statistics) target="_blank
- * ">Boostrapping (Wikipedia)</a>
- *
 + * @see <a href="http://en.wikipedia.org/wiki/Bootstrapping_(statistics)" target="_blank">Bootstrapping (Wikipedia)</a>
+ *
* @author ximeng
- *
+ *
*/
public class SimpleRandomSampleWithReplacementVote extends EvalFunc<DataBag>
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java b/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java
index 92af6a3..a5d265c 100644
--- a/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java
+++ b/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java
@@ -29,28 +29,27 @@ import org.apache.pig.builtin.Nondeterministic;
import org.apache.pig.backend.executionengine.ExecException;
/**
- * <p>
* Performs a weighted random sample using an in-memory reservoir to produce
* a weighted random sample of a given size based on the A-Res algorithm described in
- * {@link <a href="http://utopia.duth.gr/~pefraimi/research/data/2007EncOfAlg.pdf" target="_blank">paper</a>}.
- * </p>
+ * <a href="http://utopia.duth.gr/~pefraimi/research/data/2007EncOfAlg.pdf" target="_blank">paper</a>.
+ *
* <p>
* Species with larger weight have higher probability to be selected in the final sample set.
* </p>
+ *
* <p>
* This UDF inherits from {@link ReservoirSample} and it is guaranteed to produce
* a sample of the given size. Similarly it comes at the cost of scalability.
* since it uses internal storage with size equaling the desired sample to guarantee the exact sample size.
* </p>
- * <p>
- * Its constructor takes 2 arguments.
+ *
+ * Its constructor takes 2 arguments:
* <ul>
* <li>The 1st argument specifies the sample size which should be a string of positive integer.
* <li>The 2nd argument specifies the index of the weight field in the input tuple,
* which should be a string of non-negative integer that is no greater than the input tuple size.
* </ul>
- * </p>
- * <p>
+ *
* Example:
* <pre>
* {@code
@@ -60,7 +59,6 @@ import org.apache.pig.backend.executionengine.ExecException;
* sampled = FOREACH input_g GENERATE WeightedSample(input);
* }
* </pre>
- * </p>
* @author wjian
*/
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java b/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java
index 52d159b..d81fb48 100644
--- a/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java
+++ b/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java
@@ -47,13 +47,12 @@ import org.joda.time.Period;
* session_id, that is a GUID indicating the session of the request.
* </p>
*
- * <p>
* Example:
* <pre>
* {@code
- *
+ *
* %declare TIME_WINDOW 30m
- *
+ *
* define Sessionize datafu.pig.sessions.Sessionize('$TIME_WINDOW');
*
* views = LOAD 'views.tsv' AS (visit_date:chararray, member_id:int, url:chararray);
@@ -62,7 +61,7 @@ import org.joda.time.Period;
* views = GROUP views BY member_id;
* sessions = FOREACH views {
* visits = ORDER views BY visit_date;
- * GENERATE FLATTEN(Sessionize(VISITS)) AS (visit_date,member_id,url,session_id);
 + * GENERATE FLATTEN(Sessionize(visits)) AS (visit_date,member_id,url,session_id);
* }
*
* -- count the number of sessions hitting the url
@@ -70,7 +69,6 @@ import org.joda.time.Period;
* result = FOREACH rollup GENERATE group AS url, COUNT(SESSIONS) AS session_cnt;
* }
* </pre>
- * </p>
*/
@Nondeterministic
public class Sessionize extends AccumulatorEvalFunc<DataBag>
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java b/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java
index c9997f8..90faa1e 100644
--- a/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java
+++ b/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java
@@ -26,9 +26,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
* Base class for set operations.
- *
- * @author "Matthew Hayes <mhayes@linkedin.com>"
- *
*/
public abstract class SetOperationsBase extends EvalFunc<DataBag>
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/Median.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/Median.java b/datafu-pig/src/main/java/datafu/pig/stats/Median.java
index e33a84e..5f9d18d 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/Median.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/Median.java
@@ -16,18 +16,18 @@
* specific language governing permissions and limitations
* under the License.
*/
-
+
package datafu.pig.stats;
/**
- * Computes the {@link <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a>}
+ * Computes the <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a>
* for a <b>sorted</b> input bag, using type R-2 estimation. This is a convenience wrapper around Quantile.
*
* <p>
- * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is
+ * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is
* done (e.g., group by 'day') if the data is too large. That is, this isn't distributed median.
* </p>
- *
+ *
* @see Quantile
*/
public class Median extends Quantile
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java b/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java
index 6fd42d3..89621ea 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java
@@ -35,7 +35,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import datafu.pig.util.SimpleEvalFunc;
/**
- * Computes {@link <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a>}
+ * Computes <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a>
* for a <b>sorted</b> input bag, using type R-2 estimation.
*
* <p>
@@ -74,7 +74,6 @@ import datafu.pig.util.SimpleEvalFunc;
* <li>Quantile('0.0013','0.0228','0.1587','0.5','0.8413','0.9772','0.9987') yields the 0.13th, 2.28th, 15.87th, 50th, 84.13th, 97.72nd, and 99.87th percentiles
* </ul>
*
- * <p>
* Example:
* <pre>
* {@code
@@ -91,7 +90,7 @@ import datafu.pig.util.SimpleEvalFunc;
* sorted = ORDER input BY val;
* GENERATE Quantile(sorted);
* }
- * }</pre></p>
+ * }</pre>
*
* @see Median
* @see StreamingQuantile
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java b/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java
index c6fd36a..e7ba3c9 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java
@@ -23,9 +23,6 @@ import java.util.ArrayList;
/**
* Methods used by {@link Quantile}.
- *
- * @author "Matthew Hayes <mhayes@linkedin.com>"
- *
*/
public class QuantileUtil
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java b/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java
index c4c3be4..28d887d 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java
@@ -20,15 +20,15 @@
package datafu.pig.stats;
/**
- * Computes the approximate {@link <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a>}
- * for a (not necessarily sorted) input bag, using the Munro-Paterson algorithm.
+ * Computes the approximate <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a>
+ * for a (not necessarily sorted) input bag, using the Munro-Paterson algorithm.
* This is a convenience wrapper around StreamingQuantile.
*
* <p>
- * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is
+ * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is
* done (e.g., group by 'day') if the data is too large. That is, this isn't distributed median.
* </p>
- *
+ *
* @see StreamingQuantile
*/
public class StreamingMedian extends StreamingQuantile
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java b/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java
index e4a65b4..2e36941 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java
@@ -35,21 +35,21 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
/**
- * Computes approximate {@link <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a>}
+ * Computes approximate <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a>
* for a (not necessarily sorted) input bag, using the Munro-Paterson algorithm.
*
* <p>
* The algorithm is described here:
- * {@link <a href="http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf" target="_blank">http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf</a>}
+ * <a href="http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf" target="_blank">http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf</a>
* </p>
*
* <p>
* The implementation is based on the one in Sawzall, available here:
- * {@link <a href="http://szl.googlecode.com/svn-history/r41/trunk/src/emitters/szlquantile.cc">szlquantile.cc</a>}
+ * <a href="http://szl.googlecode.com/svn-history/r41/trunk/src/emitters/szlquantile.cc">szlquantile.cc</a>
* </p>
*
* <p>
- * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is
+ * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is
* done (e.g., group by 'day') if the data is too large. That is, this isn't distributed quantiles.
* </p>
*
@@ -95,12 +95,10 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
* GCD of 0.2, 0.7, and 1.0.</li>
* <li>If 0.999 is requested the quantiles 0.0, 0.001, 0.002, ... , 0.998, 0.999, 1.0 are computed because 0.001 is
* the GCD of 0.999 and 1.0.</li>
- * </p>
* </ul>
- *
+ *
* <p>The error on the approximation goes down as the number of buckets computed goes up.</p>
- *
- * <p>
+ *
* Example:
* <pre>
* {@code
@@ -115,7 +113,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
* -- produces: (1.0,3.0,5.0,8.0,10.0)
* quantiles = FOREACH grouped generate Quantile(input);
* }
- * </pre></p>
+ * </pre>
*
* @see StreamingMedian
* @see Quantile
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/VAR.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/VAR.java b/datafu-pig/src/main/java/datafu/pig/stats/VAR.java
index 6f22f25..dd18c56 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/VAR.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/VAR.java
@@ -40,25 +40,23 @@ import org.apache.pig.backend.executionengine.ExecException;
/**
-* Generates the {@link <a href="http://en.wikipedia.org/wiki/Variance" target="_blank">Variance</a>}
+* Generates the <a href="http://en.wikipedia.org/wiki/Variance" target="_blank">Variance</a>
* of a set of Values. This UDF uses the fact that variance(x) = average(x^2) - average(x)^2
* This class implements * {@link org.apache.pig.Algebraic}, so if possible the execution will performed in a distributed fashion.
* VAR implements the {@link org.apache.pig.Accumulator} interface as well.
-*
+*
* Input: Bag of int, long, double, float or bytearray
* Output: Double
-*
-* <p>
+*
* Example:
* <pre>
* define VAR datafu.pig.stats.VAR();
-*
+*
* -- input: 1,2,3,4,10,5,6,7,8,9
* input = LOAD 'input' AS (val:int);
* grouped = GROUP input ALL;
* variance = FOREACH grouped GENERATE VAR(input.val) AS variance;
* </pre>
-* </p>
*/
public class VAR extends EvalFunc<Double> implements Algebraic, Accumulator<Double> {
private static TupleFactory mTupleFactory = TupleFactory.getInstance();
@@ -68,7 +66,7 @@ public class VAR extends EvalFunc<Double> implements Algebraic, Accumulator<Doub
try {
Double sum = sum(input);
Double sumSquare = sumSquare(input);
-
+
if(sum == null) {
// either we were handed an empty bag or a bag
// filled with nulls - return null in this case
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java b/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java
index 1448611..85a8df7 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java
@@ -35,13 +35,15 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import datafu.pig.util.SimpleEvalFunc;
/**
- * Computes the {@link <a href="http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval" target="_blank">Wilsonian binomial proportion confidence interval</a>}
+ * Computes the <a href="http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval" target="_blank">Wilsonian binomial proportion confidence interval</a>.
+ *
* <p>
* Constructor requires the confidence interval (alpha) parameter, and the
* parameters are the number of positive (success) outcomes and the total
* number of observations. The UDF returns the (lower,upper) confidence
- * interval.
- * <p>
+ * interval.
+ * </p>
+ *
* Example:
* <pre>
* {@code
@@ -54,7 +56,7 @@ import datafu.pig.util.SimpleEvalFunc;
* quux = ORDER bar BY score DESC;
* top = LIMIT quux 10;
* }
- * </pre></p>
+ * </pre>
*/
public class WilsonBinConf extends SimpleEvalFunc<Tuple>
{
@@ -82,6 +84,7 @@ public class WilsonBinConf extends SimpleEvalFunc<Tuple>
* @param x The number of positive (success) outcomes
* @param n The number of observations
* @return The (lower,upper) confidence interval
+ * @throws IOException IOException
*/
public Tuple binconf(Long x, Long n) throws IOException
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java b/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java
index 26b743e..2f0f148 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java
@@ -34,34 +34,38 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
- * Calculate conditional entropy H(Y|X) of random variables X and Y following conditional entropy's
- * {@link <a href="http://en.wikipedia.org/wiki/Conditional_entropy" target="_blank">wiki definition</a>},
+ * Calculate conditional entropy H(Y|X) of random variables X and Y following conditional entropy's
+ * <a href="http://en.wikipedia.org/wiki/Conditional_entropy" target="_blank">wiki definition</a>,
* X is the conditional variable and Y is the variable that conditions on X.
+ *
* <p>
* Each tuple of the input bag has 2 fields, the 1st field is an object instance of variable X and
* the 2nd field is an object instance of variable Y. An exception will be thrown if the number of fields is not 2.
- * </p>
+ * </p>
+ *
* <p>
* This UDF's constructor definition and parameters are the same as that of {@link datafu.pig.stats.entropy.Entropy}
* </p>
- * <p>
+ *
* Note:
* <ul>
* <li>The input bag to this UDF must be <b>sorted</b> on X and Y, with X in the first sort order.
* An exception will be thrown if the input bag is not sorted.
* <li>The returned entropy value is of double type.
* </ul>
- * </p>
+ *
* <p>
- * How to use:
+ * How to use:
* </p>
+ *
* <p>
* This UDF calculates conditional entropy given raw data tuples of X and Y without the need to pre-compute per tuple occurrence frequency.
* </p>
+ *
* <p>
* It could be used in a nested FOREACH after a GROUP BY, in which we sort the inner bag and use the sorted bag as this UDF's input.
* </p>
- * <p>
+ *
* Example:
* <pre>
* {@code
@@ -79,21 +83,20 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
* }
* }
* </pre>
- * </p>
+ *
* Use case to calculate mutual information:
- * <p>
* <pre>
* {@code
* ------------
* -- calculate mutual information I(X, Y) using conditional entropy UDF and entropy UDF
* -- I(X, Y) = H(Y) - H(Y|X)
* ------------
- *
+ *
* define CondEntropy datafu.pig.stats.entropy.CondEntropy();
* define Entropy datafu.pig.stats.entropy.Entropy();
- *
+ *
* input = LOAD 'input' AS (grp: chararray, valX: double, valY: double);
- *
+ *
* -- calculate the I(X,Y) in each group
* input_group_g = GROUP input BY grp;
* mutual_information = FOREACH input_group_g {
@@ -107,7 +110,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
* }
* }
* </pre>
- * </p>
* @see Entropy
*/
public class CondEntropy extends AccumulatorEvalFunc<Double> {
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java b/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java
index 388b80f..1a6b846 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java
@@ -41,37 +41,40 @@ import datafu.pig.stats.entropy.EntropyUtil;
/**
* Calculate the empirical entropy of random variable X given its occurrence frequencies, following entropy's
- * {@link <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a>}
+ * <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a>.
+ *
* <p>
* This UDF's constructor takes 1 argument: the logarithm base, whose definition is the same as that defined in {@link datafu.pig.stats.entropy.Entropy}
* </p>
- * <p>
- * Note:
+ *
+ * Note:
* <ul>
* <li>Unlike {@link datafu.pig.stats.entropy.Entropy}, which calculates entropy from sorted raw data bag in accumulative mode,
- * this UDF calculates entropy from the data's occurrence frequencies which does not need to be sorted, either in accumulative or algebraic mode.
+ * this UDF calculates entropy from the data's occurrence frequencies which does not need to be sorted, either in accumulative or algebraic mode.</li>
* <li>Each tuple of the UDF's input bag <b>must only</b> have 1 field, the occurrence frequency of a data instance,
- * and the data type of this field <b>must</b> be int or long. Otherwise, an exception will be thrown.
- * <li>Negative frequency number will be silently discarded and a warning message will be logged in the job's log file.
- * <li>The returned entropy value is of double type.
+ * and the data type of this field <b>must</b> be int or long. Otherwise, an exception will be thrown.</li>
+ * <li>Negative frequency number will be silently discarded and a warning message will be logged in the job's log file.</li>
+ * <li>The returned entropy value is of double type.</li>
* </ul>
- * </p>
+ *
* <p>
- * How to use:
+ * How to use:
* </p>
+ *
* <p>
* To use this UDF, customer needs to pre-compute the occurrence frequency of each data instance, often in an outer GROUP BY
* , and then use this UDF to calculate entropy with those frequency numbers in another outer GROUP BY.
* </p>
+ *
* <p>
* Compared with {@link datafu.pig.stats.entropy.Entropy}, this UDF is more scalable when we need to handle a very large data set,
* since it could distribute computation onto mappers and take advantage of combiners to reduce intermedidate output from mappers to reducers.
* </p>
- * <p>
+ *
* Example:
* <pre>
* {@code
- *
+ *
* define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
*
* input = LOAD 'input' AS (val: double);
@@ -79,48 +82,48 @@ import datafu.pig.stats.entropy.EntropyUtil;
* -- calculate the occurrence of each instance
* counts_g = GROUP input BY val;
 * counts = FOREACH counts_g GENERATE COUNT(input) AS cnt;
- *
- * -- calculate entropy
+ *
+ * -- calculate entropy
* input_counts_g = GROUP counts ALL;
* entropy = FOREACH input_counts_g GENERATE Entropy(counts) AS entropy;
* }
* </pre>
- * </p>
+ *
* Use case to calculate mutual information using EmpiricalCountEntropy:
- * <p>
+ *
* <pre>
* {@code
- *
+ *
* define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy();
- *
+ *
* input = LOAD 'input' AS (valX: double, valY: double);
- *
+ *
* ------------
* -- calculate mutual information I(X, Y) using entropy
* -- I(X, Y) = H(X) + H(Y) - H(X, Y)
* ------------
- *
+ *
* input_x_y_g = GROUP input BY (valX, valY);
* input_x_y_cnt = FOREACH input_x_y_g GENERATE flatten(group) as (valX, valY), COUNT(input) AS cnt;
- *
+ *
* input_x_g = GROUP input_x_y_cnt BY valX;
* input_x_cnt = FOREACH input_x_g GENERATE flatten(group) as valX, SUM(input_x_y_cnt.cnt) AS cnt;
- *
+ *
* input_y_g = GROUP input_x_y_cnt BY valY;
* input_y_cnt = FOREACH input_y_g GENERATE flatten(group) as valY, SUM(input_x_y_cnt.cnt) AS cnt;
- *
+ *
* input_x_y_entropy_g = GROUP input_x_y_cnt ALL;
* input_x_y_entropy = FOREACH input_x_y_entropy_g {
* input_x_y_entropy_cnt = input_x_y_cnt.cnt;
* GENERATE Entropy(input_x_y_entropy_cnt) AS x_y_entropy;
* }
- *
+ *
* input_x_entropy_g = GROUP input_x_cnt ALL;
* input_x_entropy = FOREACH input_x_entropy_g {
* input_x_entropy_cnt = input_x_cnt.cnt;
* GENERATE Entropy(input_x_entropy_cnt) AS x_entropy;
* }
- *
+ *
* input_y_entropy_g = GROUP input_y_cnt ALL;
* input_y_entropy = FOREACH input_y_entropy_g {
* input_y_entropy_cnt = input_y_cnt.cnt;
@@ -133,7 +136,6 @@ import datafu.pig.stats.entropy.EntropyUtil;
* input_x_y_entropy::x_y_entropy) AS mi;
* }
* </pre>
- * </p>
* @see Entropy
*/
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java b/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java
index 9dfff1a..efa1d35 100644
--- a/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java
+++ b/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java
@@ -32,53 +32,57 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
- * Calculate entropy H(X) of random variable X following entropy's
- * {@link <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a>}
+ * Calculate entropy H(X) of random variable X following entropy's
+ * <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a>
+ *
* <p>
* This UDF's constructor takes 2 arguments.
* </p>
- * <p>
+ *
* The 1st argument, the type of entropy estimator algorithm we currently support, includes:
* <ul>
* <li>empirical (empirical entropy estimator)
* <li>chaosh (Chao-Shen entropy estimator)
* </ul>
- * </p>
+ *
* <p>
* The default estimation algorithm is empirical.
* </p>
+ *
* <p>
* The 2nd argument, the logarithm base we currently support, includes:
* </p>
- * <p>
+ *
* <ul>
* <li>log (use Euler's number as the logarithm base)
* <li>log2 (use 2 as the logarithm base)
* <li>log10 (use 10 as the logarithm base)
* </ul>
- * </p>
+ *
* <p>
* The default logarithm base is log.
* </p>
- * <p>
+ *
* Note:
* <ul>
* <li>The input to this UDF must be a <b>sorted</b> bag of raw data tuples of X.
* An exception will be thrown if the input bag is not sorted
* <li>The returned entropy value is of double type.
* </ul>
- * </p>
+ *
* <p>
* How to use:
* </p>
+ *
* <p>
* This UDF calculates entropy from raw data tuples without the need to pre-compute per tuple occurrence frequency.
* </p>
+ *
* <p>
* It could be used in a nested FOREACH after a GROUP BY, in which we sort the inner bag and use the sorted bag as this UDF's input.
* </p>
+ *
* Example:
- * <p>
* <pre>
* {@code
* --calculate empirical entropy with Euler's number as the logarithm base
@@ -95,7 +99,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
* }
* }
* </pre>
- * </p>
* @see CondEntropy
* @see EmpiricalCountEntropy
*/
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java b/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java
index ee2c3f3..3dd4829 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java
@@ -49,8 +49,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
* such as transposing two fields of the same type. If this contract is violated, say, by attempting to reference
* a field that is not present, a meaningful error message may be thrown.
* </p>
- *
- * <p>
+ *
* Example: This example computes the monthly payments for mortgages depending on interest rate.
* <pre>
* {@code
@@ -58,11 +57,11 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
* ...
* public DataBag exec(Tuple input) throws IOException {
* DataBag output = BagFactory.getInstance().newDefaultBag();
- *
+ *
* Double principal = getDouble(input, "principal"); // get a value from the input tuple by alias
* Integer numPayments = getInteger(input, "num_payments");
* DataBag interestRates = getBag(input, "interest_rates");
- *
+ *
* for (Tuple interestTuple : interestRates) {
* Double interest = getDouble(interestTuple, getPrefixedAliasName("interest_rates", "interest_rate")); // get a value from the inner bag tuple by alias
* double monthlyPayment = computeMonthlyPayment(principal, numPayments, interest);
@@ -73,11 +72,10 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
* }
* }
* </pre>
- * </p>
- *
+ *
* @author wvaughan
*
- * @param <T>
+ * @param <T> type that the eval func returns
*/
public abstract class AliasableEvalFunc<T> extends ContextualEvalFunc<T>
{
@@ -101,8 +99,8 @@ public abstract class AliasableEvalFunc<T> extends ContextualEvalFunc<T>
/**
 * Specify the output schema as in {@link EvalFunc#outputSchema(Schema)}.
*
- * @param input
- * @return outputSchema
+ * @param input input schema
 +   * @return the output schema
*/
public abstract Schema getOutputSchema(Schema input);
@@ -151,10 +149,10 @@ public abstract class AliasableEvalFunc<T> extends ContextualEvalFunc<T>
}
/**
- * Field aliases are generated from the input schema<br/>
- * Each alias maps to a bag position<br/>
+ * Field aliases are generated from the input schema.
+ * Each alias maps to a bag position.
* Inner bags/tuples will have alias of outer.inner.foo
- *
+ *
* @return A map of field alias to field position
*/
public Map<String, Integer> getFieldAliases()
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java b/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java
index 16f9247..27ae134 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java
@@ -16,7 +16,7 @@
* specific language governing permissions and limitations
* under the License.
*/
-
+
package datafu.pig.util;
import java.io.IOException;
@@ -26,28 +26,26 @@ import org.apache.pig.data.Tuple;
/**
* Filter function which asserts that a value is true.
- *
+ *
* <p>
* Unfortunately, the Pig interpreter doesn't recognize boolean expressions nested as function
* arguments, so this uses C-style booleans. That is, the first argument should be
* an integer. 0 is interpreted as "false", and anything else is considered "true".
* The function will cause the Pig script to fail if a "false" value is encountered.
* </p>
- *
+ *
* <p>
* There is a unary and a binary version. The unary version just takes a boolean, and throws out a generic exception message when the
* assertion is violated. The binary version takes a String as a second argument and throws that out when the assertion
* is violated.
* </p>
- *
- * <p>
+ *
* Example:
* <pre>
* {@code
* FILTER members BY AssertUDF( (member_id >= 0 ? 1 : 0), 'Doh! Some member ID is negative.' );
* }
* </pre>
- * </p>
*/
public class AssertUDF extends FilterFunc
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java b/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java
index f8e25f4..855b305 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java
@@ -27,9 +27,8 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
/**
- * Returns the first non-null value from a tuple, just like {@link <a href="http://msdn.microsoft.com/en-us/library/ms190349.aspx" target="_blank">COALESCE</a>} in SQL.
- *
- * <p>
+ * Returns the first non-null value from a tuple, just like <a href="http://msdn.microsoft.com/en-us/library/ms190349.aspx" target="_blank">COALESCE</a> in SQL.
+ *
* Example:
* <pre>
* {@code
@@ -44,10 +43,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
*
* }
* </pre>
- * </p>
- *
- * @author "Matthew Hayes <mhayes@linkedin.com>"
- *
*/
public class Coalesce extends AliasableEvalFunc<Object>
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java b/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java
index c534b77..5e26ac1 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java
@@ -29,8 +29,8 @@ import org.apache.pig.impl.util.UDFContext;
* on the front end which will be available on the back end.
* For example, properties may be set in the call to outputSchema(),
* which will be available when exec() is called.
- *
- * @param <T>
+ *
+ * @param <T> the type the eval function returns
*/
public abstract class ContextualEvalFunc<T> extends EvalFunc<T>
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java b/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java
index 0066aa8..9aa80ff 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java
@@ -60,7 +60,7 @@ public class DataFuException extends RuntimeException
/**
* Gets data relevant to this exception.
*
- * @return data
 +   * @return the data relevant to this exception
*/
public Object getData()
{
@@ -70,7 +70,7 @@ public class DataFuException extends RuntimeException
/**
* Sets field aliases for a UDF which may be relevant to this exception.
*
- * @param fieldAliases
+ * @param fieldAliases field aliases
*/
public void setFieldAliases(Map<String, Integer> fieldAliases)
{
@@ -79,7 +79,7 @@ public class DataFuException extends RuntimeException
/**
* Sets data relevant to this exception.
- * @param data
+ * @param data data relevant to this exception
*/
public void setData(Object data)
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/InUDF.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/InUDF.java b/datafu-pig/src/main/java/datafu/pig/util/InUDF.java
index 5057285..08d81af 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/InUDF.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/InUDF.java
@@ -25,26 +25,24 @@ import org.apache.pig.FilterFunc;
import org.apache.pig.data.Tuple;
/**
- * Similar to the SQL IN function, this function provides a convenient way to filter
- * using a logical disjunction over many values.
+ * Similar to the SQL IN function, this function provides a convenient way to filter
+ * using a logical disjunction over many values.
* Returns true when the first value of the tuple is contained within the remainder of the tuple.
- *
- * <p>
+ *
* Example:
* <pre>
* {@code
* define In datafu.pig.util.InUDF();
* -- cars: (alice, red), (bob, blue), (charlie, green), (dave, red);
* cars = LOAD cars AS (owner:chararray, color:chararray);
- *
+ *
* -- cars: (alice, red), (bob, blue), (dave, red);
* red_blue_cars = FILTER cars BY In(color, 'red', 'blue');
- *
- * }</pre>
- * </p>
- *
- * @author wvaughan
*
+ * }
+ * </pre>
+ *
+ * @author wvaughan
*/
public class InUDF extends FilterFunc
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java b/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java
index f8a39df..d7c2592 100644
--- a/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java
+++ b/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java
@@ -34,8 +34,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
/**
* Performs a transpose on a tuple, resulting in a bag of key, value fields where
* the key is the column name and the value is the value of that column in the tuple.
- *
- * <p>
+ *
* Example:
* <pre>
* {@code
@@ -50,10 +49,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
*
* }
* </pre>
- * </p>
- *
- * @author "William Vaughan <wvaughan@linkedin.com>"
- *
*/
public class TransposeTupleToBag extends AliasableEvalFunc<DataBag>
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java
index ac3e409..f652101 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package datafu.test.pig.hash.lsh;
import java.io.IOException;
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java
index be64bc8..a6615ed 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package datafu.test.pig.hash.lsh;
import java.util.ArrayList;
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java b/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java
index 99af987..fd7ff05 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
package datafu.test.pig.util;
import org.adrianwalker.multilinestring.Multiline;
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle.properties
----------------------------------------------------------------------
diff --git a/gradle.properties b/gradle.properties
index 33df918..648af68 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,2 +1,4 @@
group=org.apache.datafu
-version=1.2.1
+version=1.3.0-SNAPSHOT
+gradleVersion=1.12
+org.gradle.jvmargs="-XX:MaxPermSize=512m"
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle/buildscript.gradle
----------------------------------------------------------------------
diff --git a/gradle/buildscript.gradle b/gradle/buildscript.gradle
index 225e0a8..669eb6e 100644
--- a/gradle/buildscript.gradle
+++ b/gradle/buildscript.gradle
@@ -1,7 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
repositories {
repositories {
- // For license plugin.
maven {
+ // For gradle-nexus-plugin
+ url 'http://jcenter.bintray.com/'
+ }
+ maven {
+ // For license plugin.
url 'http://dl.bintray.com/content/netflixoss/external-gradle-plugins/'
}
}
@@ -9,4 +32,5 @@ repositories {
dependencies {
classpath 'nl.javadude.gradle.plugins:license-gradle-plugin:0.6.1'
+ classpath 'org.gradle.api.plugins:gradle-nexus-plugin:0.7.1'
}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle/dependency-versions.gradle
----------------------------------------------------------------------
diff --git a/gradle/dependency-versions.gradle b/gradle/dependency-versions.gradle
index eb24e4a..3b0835f 100644
--- a/gradle/dependency-versions.gradle
+++ b/gradle/dependency-versions.gradle
@@ -1,3 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
ext {
antlrVersion="3.2"
avroVersion="1.7.4"
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle/release.gradle
----------------------------------------------------------------------
diff --git a/gradle/release.gradle b/gradle/release.gradle
new file mode 100644
index 0000000..c52b69c
--- /dev/null
+++ b/gradle/release.gradle
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+project(':') {
+ apply plugin: 'base'
+ apply plugin: 'signing'
+}
+
+task sourceRelease(type: Tar) {
+ description = "Build a source release, specifically excluding the build directories and gradle wrapper files"
+ compression = Compression.GZIP
+
+ baseName = "datafu-sources-${project.version}-incubating"
+
+ from(project.rootDir) {
+ exclude '**/build'
+ exclude 'build'
+ exclude '.gradle'
+ exclude 'gradlew'
+ exclude 'gradlew.bat'
+ exclude 'gradle/wrapper/gradle-wrapper.jar'
+ exclude 'gradle/wrapper/gradle-wrapper.properties'
+ }
+
+ into(baseName)
+
+ // Set destination directory.
+ destinationDir = file("${project.buildDir}/distribution/source")
+
+ archiveName = "${baseName}.tgz"
+ doLast { // generate md5 checksum
+ ant.checksum file:"$destinationDir/$archiveName"
+ }
+}
+
+signing {
+ // TODO: this doesn't show up in the 'tasks' for some reason, need to figure out why.
+ // This creates a task 'signSourceRelease' that builds the source release and signs it.
+ sign sourceRelease
+}
+
+// Publishing to Apache's Maven repository (Nexus). To install the archives in the
+// local repository, run the 'install' task.
+subprojects {
+ apply plugin: 'nexus'
+
+ nexus {
+ attachSources = true
+ attachTests = false
+ attachJavadoc = true
+ sign = true
+ repositoryUrl = 'https://repository.apache.org/service/local/staging/deploy/maven2'
+ snapshotRepositoryUrl = 'https://repository.apache.org/content/repositories/snapshots'
+ }
+
+ modifyPom {
+ project {
+ name 'Apache DataFu (incubating)'
 +      description 'Libraries that make it easier to solve data problems using Hadoop and higher level languages based on it.'
+ url 'http://datafu.incubator.apache.org/'
+
+ scm {
+ url 'https://git-wip-us.apache.org/repos/asf?p=incubator-datafu.git;a=tree'
+ connection 'scm:http://git-wip-us.apache.org/repos/asf/incubator-datafu.git'
+ developerConnection 'scm:https://git-wip-us.apache.org/repos/asf/incubator-datafu.git'
+ }
+
+ licenses {
+ license {
+ name 'The Apache Software License, Version 2.0'
+ url 'http://www.apache.org/licenses/LICENSE-2.0.txt'
+ }
+ }
+ }
+ }
+}
|