mahout-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Angelo Immediata <angelo...@gmail.com>
Subject Info about KMEans clustering
Date Thu, 28 Nov 2013 09:39:56 GMT
Hi all
I'm pretty new to mahout and I don't know if this is the right place where
to post these questions, so pardon me if I'm wrong :)
I'm using apache mahout 0.8, apache hadoop 2.2.0; I wanted to test this
class:
public class ClusterAnalysisModule {

public static final double[][] points = { { 1, 1 }, { 2, 1 }, { 1, 2 }, {
2, 2 }, { 3, 3 }, { 8, 8 }, { 9, 8 }, { 8, 9 }, { 9, 9 } };
private static final Log LOG =
LogFactory.getLog(ClusterAnalysisModule.class.getName());
private void writePointsToFile(List<Vector> points, String fileName,
FileSystem fs, Configuration conf) throws IOException {
Path path = new Path(fileName);
Option fileOption = SequenceFile.Writer.file(path);
Option keyClassOption = SequenceFile.Writer.keyClass(LongWritable.class);
Option valueClassOption =
SequenceFile.Writer.valueClass(VectorWritable.class);
SequenceFile.Writer writer = SequenceFile.createWriter(conf, fileOption,
keyClassOption, valueClassOption);
long recNum = 0;
VectorWritable vec = new VectorWritable();
for (Vector point : points) {
vec.set(point);
writer.append(new LongWritable(recNum++), vec);
}
writer.close();
}

private List<Vector> getPoints(double[][] raw) {
List<Vector> points = new ArrayList<Vector>();
for (int i = 0; i < raw.length; i++) {
double[] fr = raw[i];
Vector vec = new RandomAccessSparseVector(fr.length);
vec.assign(fr);
points.add(vec);
}
return points;
}

public void executeClusterAnalysis() throws Exception {
SequenceFile.Writer writer = null;
SequenceFile.Reader reader = null;
try{
// Numero dei cluster in uscita
int k = 2;
List<Vector> vectors = getPoints(points); // Input data
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
writePointsToFile(vectors, "/root/Scrivania/testKmean/dati/file1", fs,
conf); // Scrivo
// i
// punti
// centrali
// iniziali
Path path = new
Path("/root/Scrivania/testKmean/input/testdata/clusters/part-00000");
Option fileOption = SequenceFile.Writer.file(path);
Option keyClassOption = SequenceFile.Writer.keyClass(Text.class);
Option valueClassOption = SequenceFile.Writer.valueClass(Kluster.class);
writer = SequenceFile.createWriter(conf, fileOption, keyClassOption,
valueClassOption);
for (int i = 0; i < k; i++) {
Vector vec = vectors.get(i);
// Cluster cluster = new Canopy(vec, i, new
// EuclideanDistanceMeasure());
Kluster cluster = new Kluster(vec, i, new EuclideanDistanceMeasure());
writer.append(new Text((cluster).getIdentifier()), cluster);
}
KMeansDriver.run(conf, new Path("/root/Scrivania/testKmean/dati/"),
new Path("/root/Scrivania/testKmean/input/testdata/clusters"),
new Path("/root/Scrivania/testKmean/output"), new
EuclideanDistanceMeasure(), 0.001, 10, true, 10, false);
// Path readerPath = new Path("/root/Scrivania/testKmean/output/" +
Cluster.CLUSTERED_POINTS_DIR + "/part-m-00000");
// org.apache.hadoop.io.SequenceFile.Reader.Option optReadPath =
org.apache.hadoop.io.SequenceFile.Reader.file(readerPath);
// reader = SequenceFile.createWriter(conf, optReadPath);
reader = new SequenceFile.Reader(fs, new
Path("/root/Scrivania/testKmean/output/" + Cluster.CLUSTERED_POINTS_DIR +
"/part-m-00000"), conf);
IntWritable key = new IntWritable();
WeightedVectorWritable value = new WeightedVectorWritable();
while (reader.next(key, value)) {
LOG.info(value.toString() + " belongs to cluster " + key.toString());
}
} catch(Exception e){

LOG.fatal("Errore", e);
throw new IllegalStateException(e);
} finally {
if( writer != null ){

writer.close();
}
if( reader != null ){

reader.close();
}
}
}
}

But when I execute it I get this error:
10:39:14,083 DEBUG [MutableMetricsFactory] field
org.apache.hadoop.metrics2.lib.MutableRate
org.apache.hadoop.security.UserGroupInformation$UgiMetrics.loginSuccess
with annotation
@org.apache.hadoop.metrics2.annotation.Metric(valueName=Time, value=[Rate
of successful kerberos logins and latency (milliseconds)], about=,
type=DEFAULT, always=false, sampleName=Ops)
10:39:14,097 DEBUG [MutableMetricsFactory] field
org.apache.hadoop.metrics2.lib.MutableRate
org.apache.hadoop.security.UserGroupInformation$UgiMetrics.loginFailure
with annotation
@org.apache.hadoop.metrics2.annotation.Metric(valueName=Time, value=[Rate
of failed kerberos logins and latency (milliseconds)], about=,
type=DEFAULT, always=false, sampleName=Ops)
10:39:14,099 DEBUG [MetricsSystemImpl] UgiMetrics, User and group related
metrics
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in
[jar:file:/root/.m2/repository/org/slf4j/slf4j-log4j12/1.7.5/slf4j-log4j12-1.7.5.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in
[jar:file:/root/.m2/repository/org/slf4j/slf4j-jcl/1.6.0/slf4j-jcl-1.6.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an
explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
10:39:14,355 DEBUG [KerberosName] Kerberos krb5 configuration not found,
setting default realm to empty
10:39:14,358 DEBUG [Groups]  Creating new Groups object
10:39:14,361 DEBUG [NativeCodeLoader] Trying to load the custom-built
native-hadoop library...
10:39:14,362 DEBUG [NativeCodeLoader] Failed to load native-hadoop with
error: java.lang.UnsatisfiedLinkError: no hadoop in java.library.path
10:39:14,362 DEBUG [NativeCodeLoader]
java.library.path=/usr/lib/jvm/jdk1.7.0_45/jre/lib/i386/server:/usr/lib/jvm/jdk1.7.0_45/jre/lib/i386:/usr/lib/jvm/jdk1.7.0_45/jre/../lib/i386:/usr/lib/jvm/jdk1.7.0_45/jre/lib/i386/client:/usr/lib/jvm/jdk1.7.0_45/jre/lib/i386::/usr/java/packages/lib/i386:/lib:/usr/lib
10:39:14,362 WARN  [NativeCodeLoader] Unable to load native-hadoop library
for your platform... using builtin-java classes where applicable
10:39:14,362 DEBUG [JniBasedUnixGroupsMappingWithFallback] Falling back to
shell based
10:39:14,363 DEBUG [JniBasedUnixGroupsMappingWithFallback] Group mapping
impl=org.apache.hadoop.security.ShellBasedUnixGroupsMapping
10:39:14,495 DEBUG [Groups] Group mapping
impl=org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback;
cacheTimeout=300000
10:39:14,516 DEBUG [UserGroupInformation] hadoop login
10:39:14,517 DEBUG [UserGroupInformation] hadoop login commit
10:39:14,521 DEBUG [UserGroupInformation] using local user:UnixPrincipal:
root
10:39:14,527 DEBUG [UserGroupInformation] UGI loginUser:root (auth:SIMPLE)
10:39:14,828 DEBUG [Shell] setsid exited with exit code 0
10:39:14,917 INFO  [CodecPool] Got brand-new compressor [.deflate]
10:39:14,945 DEBUG [CodecPool] Got recycled compressor
10:39:14,959 INFO  [KMeansDriver] Input: /root/Scrivania/testKmean/dati
Clusters In: /root/Scrivania/testKmean/input/testdata/clusters Out:
/root/Scrivania/testKmean/output Distance:
org.apache.mahout.common.distance.EuclideanDistanceMeasure
10:39:14,959 INFO  [KMeansDriver] convergence: 0.001 max Iterations: 10
10:39:14,987 WARN  [FSInputChecker] Problem opening checksum file:
file:/root/Scrivania/testKmean/input/testdata/clusters/part-00000.
 Ignoring exception:
java.io.EOFException
at java.io.DataInputStream.readFully(DataInputStream.java:197)
at java.io.DataInputStream.readFully(DataInputStream.java:169)
at
org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:146)
at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:339)
at org.apache.hadoop.io.SequenceFile$Reader.openFile(SequenceFile.java:1832)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1752)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1773)
at
org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator.<init>(SequenceFileValueIterator.java:56)
at
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator$1.apply(SequenceFileDirValueIterator.java:124)
at
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator$1.apply(SequenceFileDirValueIterator.java:120)
at com.google.common.collect.Iterators$8.next(Iterators.java:812)
at com.google.common.collect.Iterators$5.hasNext(Iterators.java:544)
at
com.google.common.collect.ForwardingIterator.hasNext(ForwardingIterator.java:43)
at
org.apache.mahout.clustering.kmeans.KMeansUtil.configureWithClusterInfo(KMeansUtil.java:51)
at
org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:209)
at
org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:143)
at
it.eng.tz.pinf.clustering.ClusterAnalysisModule.executeClusterAnalysis(ClusterAnalysisModule.java:84)
at
it.eng.tz.pinf.clustering.KMeansDriverClusterTest.kmeansClusteringTest(KMeansDriverClusterTest.java:24)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at
org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:47)
at
org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
at
org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44)
at
org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:271)
at
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:70)
at
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50)
at org.junit.runners.ParentRunner$3.run(ParentRunner.java:238)
at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:63)
at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:236)
at org.junit.runners.ParentRunner.access$000(ParentRunner.java:53)
at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:229)
at org.junit.runners.ParentRunner.run(ParentRunner.java:309)
at
org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:50)
at
org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:467)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:683)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:390)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:197)
10:39:14,993 FATAL [ClusterAnalysisModule] Errore
java.lang.IllegalStateException:
file:/root/Scrivania/testKmean/input/testdata/clusters/part-00000
at
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator$1.apply(SequenceFileDirValueIterator.java:129)
at
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator$1.apply(SequenceFileDirValueIterator.java:120)
at com.google.common.collect.Iterators$8.next(Iterators.java:812)
at com.google.common.collect.Iterators$5.hasNext(Iterators.java:544)
at
com.google.common.collect.ForwardingIterator.hasNext(ForwardingIterator.java:43)
at
org.apache.mahout.clustering.kmeans.KMeansUtil.configureWithClusterInfo(KMeansUtil.java:51)
at
org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:209)
at
org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:143)
at
it.eng.tz.pinf.clustering.ClusterAnalysisModule.executeClusterAnalysis(ClusterAnalysisModule.java:84)
at
it.eng.tz.pinf.clustering.KMeansDriverClusterTest.kmeansClusteringTest(KMeansDriverClusterTest.java:24)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at
org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:47)
at
org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
at
org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44)
at
org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:271)
at
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:70)
at
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50)
at org.junit.runners.ParentRunner$3.run(ParentRunner.java:238)
at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:63)
at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:236)
at org.junit.runners.ParentRunner.access$000(ParentRunner.java:53)
at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:229)
at org.junit.runners.ParentRunner.run(ParentRunner.java:309)
at
org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:50)
at
org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:467)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:683)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:390)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:197)
Caused by: java.io.EOFException
at java.io.DataInputStream.readFully(DataInputStream.java:197)
at java.io.DataInputStream.readFully(DataInputStream.java:169)
at org.apache.hadoop.io.SequenceFile$Reader.init(SequenceFile.java:1845)
at
org.apache.hadoop.io.SequenceFile$Reader.initialize(SequenceFile.java:1810)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1759)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1773)
at
org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator.<init>(SequenceFileValueIterator.java:56)
at
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator$1.apply(SequenceFileDirValueIterator.java:124)
... 32 more
10:39:14,996 FATAL [KMeansDriverClusterTest] Errore durante il test del
clustering KMeans; messaggio errore: java.lang.IllegalStateException:
file:/root/Scrivania/testKmean/input/testdata/clusters/part-00000
java.lang.IllegalStateException: java.lang.IllegalStateException:
file:/root/Scrivania/testKmean/input/testdata/clusters/part-00000
at
it.eng.tz.pinf.clustering.ClusterAnalysisModule.executeClusterAnalysis(ClusterAnalysisModule.java:99)
at
it.eng.tz.pinf.clustering.KMeansDriverClusterTest.kmeansClusteringTest(KMeansDriverClusterTest.java:24)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at
org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:47)
at
org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
at
org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44)
at
org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:271)
at
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:70)
at
org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:50)
at org.junit.runners.ParentRunner$3.run(ParentRunner.java:238)
at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:63)
at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:236)
at org.junit.runners.ParentRunner.access$000(ParentRunner.java:53)
at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:229)
at org.junit.runners.ParentRunner.run(ParentRunner.java:309)
at
org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:50)
at
org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:467)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:683)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:390)
at
org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:197)
Caused by: java.lang.IllegalStateException:
file:/root/Scrivania/testKmean/input/testdata/clusters/part-00000
at
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator$1.apply(SequenceFileDirValueIterator.java:129)
at
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator$1.apply(SequenceFileDirValueIterator.java:120)
at com.google.common.collect.Iterators$8.next(Iterators.java:812)
at com.google.common.collect.Iterators$5.hasNext(Iterators.java:544)
at
com.google.common.collect.ForwardingIterator.hasNext(ForwardingIterator.java:43)
at
org.apache.mahout.clustering.kmeans.KMeansUtil.configureWithClusterInfo(KMeansUtil.java:51)
at
org.apache.mahout.clustering.kmeans.KMeansDriver.buildClusters(KMeansDriver.java:209)
at
org.apache.mahout.clustering.kmeans.KMeansDriver.run(KMeansDriver.java:143)
at
it.eng.tz.pinf.clustering.ClusterAnalysisModule.executeClusterAnalysis(ClusterAnalysisModule.java:84)
... 24 more
Caused by: java.io.EOFException
at java.io.DataInputStream.readFully(DataInputStream.java:197)
at java.io.DataInputStream.readFully(DataInputStream.java:169)
at org.apache.hadoop.io.SequenceFile$Reader.init(SequenceFile.java:1845)
at
org.apache.hadoop.io.SequenceFile$Reader.initialize(SequenceFile.java:1810)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1759)
at org.apache.hadoop.io.SequenceFile$Reader.<init>(SequenceFile.java:1773)
at
org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator.<init>(SequenceFileValueIterator.java:56)
at
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator$1.apply(SequenceFileDirValueIterator.java:124)
... 32 more

Can anybody tell me where I'm wrong?

Thank you
Angelo

Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message