Added: datafu/site/docs/datafu/1.4.0/datafu/pig/random/package-summary.html URL: http://svn.apache.org/viewvc/datafu/site/docs/datafu/1.4.0/datafu/pig/random/package-summary.html?rev=1827525&view=auto ============================================================================== --- datafu/site/docs/datafu/1.4.0/datafu/pig/random/package-summary.html (added) +++ datafu/site/docs/datafu/1.4.0/datafu/pig/random/package-summary.html Thu Mar 22 19:01:04 2018 @@ -0,0 +1,148 @@ + + + +
+ +Class | +Description | +
---|---|
RandInt | +
+ Generates a uniformly distributed integer between two bounds.
+ |
+
RandomUUID | +
+ Generates a random UUID using java.util.UUID
+ |
+
public static class ReservoirSample.Final
+extends org.apache.pig.EvalFunc<org.apache.pig.data.DataBag>
+org.apache.pig.EvalFunc.SchemaType
log, pigLogger, reporter, returnType
Constructor and Description | +
---|
ReservoirSample.Final() |
+
ReservoirSample.Final(java.lang.String numSamples) |
+
Modifier and Type | +Method and Description | +
---|---|
org.apache.pig.data.DataBag |
+exec(org.apache.pig.data.Tuple input) |
+
allowCompileTimeCalculation, finish, getArgToFuncMapping, getCacheFiles, getInputSchema, getLogger, getPigLogger, getReporter, getReturnType, getSchemaName, getSchemaType, getShipFiles, isAsynchronous, outputSchema, progress, setInputSchema, setPigLogger, setReporter, setUDFContextSignature, warn
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
public static class ReservoirSample.Initial
+extends org.apache.pig.EvalFunc<org.apache.pig.data.Tuple>
+org.apache.pig.EvalFunc.SchemaType
Modifier and Type | +Field and Description | +
---|---|
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator |
+scoreGen |
+
log, pigLogger, reporter, returnType
Constructor and Description | +
---|
ReservoirSample.Initial() |
+
ReservoirSample.Initial(java.lang.String numSamples) |
+
Modifier and Type | +Method and Description | +
---|---|
org.apache.pig.data.Tuple |
+exec(org.apache.pig.data.Tuple input) |
+
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator |
+getScoreGenerator() |
+
allowCompileTimeCalculation, finish, getArgToFuncMapping, getCacheFiles, getInputSchema, getLogger, getPigLogger, getReporter, getReturnType, getSchemaName, getSchemaType, getShipFiles, isAsynchronous, outputSchema, progress, setInputSchema, setPigLogger, setReporter, setUDFContextSignature, warn
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator scoreGen+
public ReservoirSample.Initial()+
public ReservoirSample.Initial(java.lang.String numSamples)+
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator getScoreGenerator()+
public org.apache.pig.data.Tuple exec(org.apache.pig.data.Tuple input) + throws java.io.IOException+
exec
in class org.apache.pig.EvalFunc<org.apache.pig.data.Tuple>
java.io.IOException
public static class ReservoirSample.Intermediate
+extends org.apache.pig.EvalFunc<org.apache.pig.data.Tuple>
+org.apache.pig.EvalFunc.SchemaType
log, pigLogger, reporter, returnType
Constructor and Description | +
---|
ReservoirSample.Intermediate() |
+
ReservoirSample.Intermediate(java.lang.String numSamples) |
+
Modifier and Type | +Method and Description | +
---|---|
org.apache.pig.data.Tuple |
+exec(org.apache.pig.data.Tuple input) |
+
allowCompileTimeCalculation, finish, getArgToFuncMapping, getCacheFiles, getInputSchema, getLogger, getPigLogger, getReporter, getReturnType, getSchemaName, getSchemaType, getShipFiles, isAsynchronous, outputSchema, progress, setInputSchema, setPigLogger, setReporter, setUDFContextSignature, warn
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
@Nondeterministic
+public class ReservoirSample
+extends org.apache.pig.AccumulatorEvalFunc<org.apache.pig.data.DataBag>
+implements org.apache.pig.Algebraic
+
+ This is similar to SimpleRandomSample
; however, it is guaranteed to produce
+ a sample of the given size. This comes at the cost of scalability.
+ SimpleRandomSample
produces a sample of the desired size with a likelihood of 99.99%,
+ while using less internal storage. ReservoirSample, on the other hand, uses internal storage
+ with size equal to the desired sample size in order to guarantee the exact sample size.
+
+ This algebraic implementation is backed by a heap and maintains the original roll in order + to compensate for skew. +
Modifier and Type | +Class and Description | +
---|---|
static class |
+ReservoirSample.Final |
+
static class |
+ReservoirSample.Initial |
+
static class |
+ReservoirSample.Intermediate |
+
org.apache.pig.EvalFunc.SchemaType
Modifier and Type | +Field and Description | +
---|---|
protected java.lang.Integer |
+numSamples |
+
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator |
+scoreGen |
+
log, pigLogger, reporter, returnType
Constructor and Description | +
---|
ReservoirSample(java.lang.String numSamples) |
+
Modifier and Type | +Method and Description | +
---|---|
void |
+accumulate(org.apache.pig.data.Tuple input) |
+
void |
+cleanup() |
+
org.apache.pig.data.DataBag |
+exec(org.apache.pig.data.Tuple input) |
+
java.lang.String |
+getFinal() |
+
java.lang.String |
+getInitial() |
+
java.lang.String |
+getIntermed() |
+
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator |
+getScoreGenerator() |
+
org.apache.pig.data.DataBag |
+getValue() |
+
org.apache.pig.impl.logicalLayer.schema.Schema |
+outputSchema(org.apache.pig.impl.logicalLayer.schema.Schema input) |
+
allowCompileTimeCalculation, finish, getArgToFuncMapping, getCacheFiles, getInputSchema, getLogger, getPigLogger, getReporter, getReturnType, getSchemaName, getSchemaType, getShipFiles, isAsynchronous, progress, setInputSchema, setPigLogger, setReporter, setUDFContextSignature, warn
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
protected java.lang.Integer numSamples+
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator scoreGen+
public ReservoirSample(java.lang.String numSamples)+
protected datafu.pig.sampling.ScoredTuple.ScoreGenerator getScoreGenerator()+
public void accumulate(org.apache.pig.data.Tuple input) + throws java.io.IOException+
accumulate
in interface org.apache.pig.Accumulator<org.apache.pig.data.DataBag>
accumulate
in class org.apache.pig.AccumulatorEvalFunc<org.apache.pig.data.DataBag>
java.io.IOException
public void cleanup()+
cleanup
in interface org.apache.pig.Accumulator<org.apache.pig.data.DataBag>
cleanup
in class org.apache.pig.AccumulatorEvalFunc<org.apache.pig.data.DataBag>
public org.apache.pig.data.DataBag getValue()+
getValue
in interface org.apache.pig.Accumulator<org.apache.pig.data.DataBag>
getValue
in class org.apache.pig.AccumulatorEvalFunc<org.apache.pig.data.DataBag>
public org.apache.pig.data.DataBag exec(org.apache.pig.data.Tuple input) + throws java.io.IOException+
exec
in class org.apache.pig.AccumulatorEvalFunc<org.apache.pig.data.DataBag>
java.io.IOException
public org.apache.pig.impl.logicalLayer.schema.Schema outputSchema(org.apache.pig.impl.logicalLayer.schema.Schema input)+
outputSchema
in class org.apache.pig.EvalFunc<org.apache.pig.data.DataBag>
public java.lang.String getInitial()+
getInitial
in interface org.apache.pig.Algebraic
public java.lang.String getIntermed()+
getIntermed
in interface org.apache.pig.Algebraic
public java.lang.String getFinal()+
getFinal
in interface org.apache.pig.Algebraic
public class SampleByKey
+extends org.apache.pig.FilterFunc
++ The method of sampling is to convert the key to a hash, derive a double value + from that hash, and then test the derived value against the supplied probability. The double value + derived from a key is uniformly distributed between 0 and 1. +
+ ++ The only required parameter is the sampling probability. This may be followed + by an optional seed value (the salt constructor argument) to control the random number generation. +
+ ++ SampleByKey will work deterministically as long as the same seed is provided. +
+ + Example: + +
+ DEFINE SampleByKey datafu.pig.sampling.SampleByKey('0.5');
+
+-- input: (A,1), (A,2), (A,3), (B,1), (B,3)
+
+ data = LOAD 'input' AS (A_id:chararray, B_id:int);
+ output = FILTER data BY SampleByKey(A_id);
+
+ --output: (B,1), (B,3)
+
+
org.apache.pig.EvalFunc.SchemaType
log, pigLogger, reporter, returnType
Constructor and Description | +
---|
SampleByKey(java.lang.String probability) |
+
SampleByKey(java.lang.String probability,
+ java.lang.String salt) |
+
Modifier and Type | +Method and Description | +
---|---|
java.lang.Boolean |
+exec(org.apache.pig.data.Tuple input) |
+
void |
+setUDFContextSignature(java.lang.String signature) |
+
finish
allowCompileTimeCalculation, getArgToFuncMapping, getCacheFiles, getInputSchema, getLogger, getPigLogger, getReporter, getReturnType, getSchemaName, getSchemaType, getShipFiles, isAsynchronous, outputSchema, progress, setInputSchema, setPigLogger, setReporter, warn
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
public SampleByKey(java.lang.String probability)+
public SampleByKey(java.lang.String probability, + java.lang.String salt)+
public void setUDFContextSignature(java.lang.String signature)+
setUDFContextSignature
in class org.apache.pig.EvalFunc<java.lang.Boolean>
public java.lang.Boolean exec(org.apache.pig.data.Tuple input) + throws java.io.IOException+
exec
in class org.apache.pig.EvalFunc<java.lang.Boolean>
java.io.IOException