mahout-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Aleksander Sadecki <aleksander.sade...@pi.esisar.grenoble-inp.fr>
Subject Apache Mahout - KMeans Clustering
Date Wed, 21 May 2014 11:19:22 GMT
Hi,

I am following the book Mahout In Action.

I downloaded sources and I am trying to run this piece of code:

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.Cluster;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.common.distance.EuclideanDistanceMeasure;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

public class KMeansClustering {
    public static final double[][] points = { { 1, 1 }, { 2, 1 }, { 1, 2 },
            { 2, 2 }, { 3, 3 }, { 8, 8 }, { 9, 8 }, { 8, 9 }, { 9, 9 } };

    public static void writePointsToFile(List<Vector> points, String fileName,
            FileSystem fs, Configuration conf) throws IOException {
        Path path = new Path(fileName);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
                LongWritable.class, VectorWritable.class);
        long recNum = 0;
        VectorWritable vec = new VectorWritable();
        for (Vector point : points) {
            vec.set(point);
            writer.append(new LongWritable(recNum++), vec);
        }
        writer.close();
    }

    public static List<Vector> getPoints(double[][] raw) {
        List<Vector> points = new ArrayList<Vector>();
        for (int i = 0; i < raw.length; i++) {
            double[] fr = raw[i];
            Vector vec = new RandomAccessSparseVector(fr.length);
            vec.assign(fr);
            points.add(vec);
        }
        return points;
    }

    public static void main(String args[]) throws Exception {

        int k = 2;

        List<Vector> vectors = getPoints(points);

        File testData = new File("testdata");
        if (!testData.exists()) {
            testData.mkdir();
        }
        testData = new File("testdata/points");
        if (!testData.exists()) {
            testData.mkdir();
        }

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        writePointsToFile(vectors, "testdata/points/file1", fs, conf);

        Path path = new Path("testdata/clusters/part-00000");
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
                Text.class, Cluster.class);

        for (int i = 0; i < k; i++) {
            Vector vec = vectors.get(i);
            Cluster cluster = new Cluster(vec, i,
                    new EuclideanDistanceMeasure());
            writer.append(new Text(cluster.getIdentifier()), cluster);
        }
        writer.close();

        KMeansDriver.run(conf, new Path("testdata/points"), new Path(
                "testdata/clusters"), new Path("output"),
                new EuclideanDistanceMeasure(), 0.001, 10, true, false);

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(
                "output/" + Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000"),
                conf);

        IntWritable key = new IntWritable();
        WeightedVectorWritable value = new WeightedVectorWritable();
        while (reader.next(key, value)) {
            System.out.println(value.toString() + " belongs to cluster "
                    + key.toString());
        }
        reader.close();
    }

}

In fact, I have got a problem with 2 imports:

import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.clustering.kmeans.Cluster;

and with line:

KMeansDriver.run(...)

which gives an error

The method run(Configuration, Path, Path, Path, double, int, boolean, double, boolean) in
the type KMeansDriver is not applicable for the arguments 
 (Configuration, Path, Path, Path, EuclideanDistanceMeasure, double, int, boolean, boolean)

I think I solved it a little bit. I changed

import org.apache.mahout.clustering.WeightedVectorWritable;

to

import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;

but I cannot find a solution for other ones. I found Cluster in package org.apache.mahout.clustering
but it is an Interface.

Thank you for any help.

My pom.xml

    <dependency>
        <groupId>org.apache.mahout</groupId>
        <artifactId>mahout-core</artifactId>
        <version>0.9</version>
    </dependency>
    <dependency>
        <groupId>org.apache.mahout</groupId>
        <artifactId>mahout-math</artifactId>
        <version>0.9</version>
    </dependency>
    <dependency>
        <groupId>org.apache.mahout</groupId>
        <artifactId>mahout-collections</artifactId>
        <version>1.0</version>
    </dependency>


Mime
View raw message