mahout-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Andrew Musselman <andrew.mussel...@gmail.com>
Subject Re: Random forest
Date Thu, 19 Sep 2013 22:23:01 GMT
I'm guessing the sparsity is the problem.


On Thu, Sep 19, 2013 at 1:27 PM, Andrew Musselman <
andrew.musselman@gmail.com> wrote:

> Train and test sets both have the label gt_yr or lt_yr for how long
> they've been w/ the company.  There's a first row that looks suspicious in
> the train set so I'm going to remove that but for now here are some details
> of what's going on.
>
> *So the question is "why are both entries in the a column valued at zero?"*
>
> /train/part-r-00000 | head
>
> 0,70463,0,0,0,42,0,0,0,80,0,0,0,78,0,0,0,76,0,0,0,72,0,0,0,68,0,0,0,64,0,lt_yr
> 77,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 77,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 586,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 108,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
>
> /test/part-r-00000 | head
> 95,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 51,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
> 0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,gt_yr
> 135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,gt_yr
> 73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,gt_yr
> 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,lt_yr
>
> $ hcat /train/part-r-00000 | cut -f 32 -d, | sort | uniq -c
> 7222544 gt_yr
> 1735960 lt_yr
> $ hcat /train/part-r-00001 | cut -f 32 -d, | sort | uniq -c
> 7221423 gt_yr
> 1732293 lt_yr
> $ hcat /test/part-r-00000 | cut -f 32 -d, | sort | uniq -c
> 4801068 gt_yr
> 1155648 lt_yr
> $ hcat /test/part-r-00001 | cut -f 32 -d, | sort | uniq -c
> 4801668 gt_yr
> 1155756 lt_yr
>
>
> #!/bin/bash
> export
> MC=/opt/cloudera/parcels/CDH-4.4.0-1.cdh4.4.0.p0.39/lib/mahout/mahout-core-0.7-cdh4.4.0-job.jar
> export
> ME=/opt/cloudera/parcels/CDH-4.4.0-1.cdh4.4.0.p0.39/lib/mahout/mahout-examples-0.7-cdh4.4.0-job.jar
>
> # Describe input file schema
> hadoop jar $MC org.apache.mahout.classifier.df.tools.Describe -p
> /train/part-r-00000 -f /info -d 31 N L
>
> # Train rf model
> hadoop jar $ME org.apache.mahout.classifier.df.mapreduce.BuildForest -d
> /train/part-r-00000 -ds /info -sl 7 -p -t 1000 -o /forest
>
> # Test predictions
> hadoop jar $ME org.apache.mahout.classifier.df.mapreduce.TestForest -i
> /test/part-r-00000 -ds /info -m /forest -a -mr -o /predictions
>
> Summary
> -------------------------------------------------------
> Correctly Classified Instances          :    4801068       80.5992%
> Incorrectly Classified Instances        :    1155648       19.4008%
> Total Classified Instances              :    5956716
>
> =======================================================
> Confusion Matrix
> -------------------------------------------------------
> a       b       <--Classified as
> 0       1155648  |  1155648     a     = lt_yr
> 0       4801068  |  4801068     b     = gt_yr
>
>
> On Thu, Sep 19, 2013 at 9:47 AM, Andrew Musselman <
> andrew.musselman@gmail.com> wrote:
>
>> Hi group, does the label/target variable need to be binary or can it be
>> categorical with many values?
>>
>> Also, does the training set need to have equal numbers of each possible
>> label value?
>>
>> I'm finding that when I train on a binary label with many more of value
>> *b* than value *a* I don't get any predictions in the confusion matrix
>> for *a*.  Is that a known issue?
>>
>> Thanks
>> Andrew
>>
>
>

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message