mahout-user mailing list archives

From: Yang <teddyyyy...@gmail.com>
Subject: Re: SSVD Bt-Job mappers all stop at 66.70% ??
Date: Wed, 15 Oct 2014 22:49:34 GMT
I ran jstack against one of the mappers and found all of its threads are
blocked or idle; I'd guess the other mappers are in the same state.
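(If jstack isn't available on a node, the same per-thread view can be
captured from inside the JVM with Thread.getAllStackTraces(); a minimal
generic sketch, not part of the job code:)

import java.util.Map;

public class StackDump {
  // Print every live thread's stack in a jstack-like format to stderr.
  public static void dumpAll() {
    for (Map.Entry<Thread, StackTraceElement[]> e :
         Thread.getAllStackTraces().entrySet()) {
      Thread t = e.getKey();
      System.err.printf("Thread %d \"%s\": (state = %s)%n",
          t.getId(), t.getName(), t.getState());
      for (StackTraceElement frame : e.getValue()) {
        System.err.println(" - " + frame);
      }
      System.err.println();
    }
  }
}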


yyang15@yyang15-VirtualBox:~/work/CIReco/title_flow/java_code$ cat /tmp/yyang
Deadlock Detection:

No deadlocks found.

Thread 2381: (state = BLOCKED)
 - java.lang.Object.wait(long) @bci=0 (Interpreted frame)
 - org.apache.hadoop.ipc.Client$Connection.waitForWork() @bci=59, line=903 (Interpreted frame)
 - org.apache.hadoop.ipc.Client$Connection.run() @bci=55, line=948 (Interpreted frame)

Thread 10422: (state = IN_NATIVE)
 - sun.nio.ch.EPollArrayWrapper.epollWait(long, int, long, int) @bci=0 (Interpreted frame)
 - sun.nio.ch.EPollArrayWrapper.poll(long) @bci=18, line=269 (Interpreted frame)
 - sun.nio.ch.EPollSelectorImpl.doSelect(long) @bci=28, line=79 (Interpreted frame)
 - sun.nio.ch.SelectorImpl.lockAndDoSelect(long) @bci=37, line=87 (Interpreted frame)
 - sun.nio.ch.SelectorImpl.select(long) @bci=30, line=98 (Interpreted frame)
 - org.apache.hadoop.net.SocketIOWithTimeout$SelectorPool.select(java.nio.channels.SelectableChannel, int, long) @bci=46, line=335 (Interpreted frame)
 - org.apache.hadoop.net.SocketIOWithTimeout.doIO(java.nio.ByteBuffer, int) @bci=80, line=157 (Interpreted frame)
 - org.apache.hadoop.net.SocketInputStream.read(java.nio.ByteBuffer) @bci=6, line=161 (Interpreted frame)
 - org.apache.hadoop.net.SocketInputStream.read(byte[], int, int) @bci=7, line=131 (Interpreted frame)
 - org.apache.hadoop.net.SocketInputStream.read() @bci=8, line=118 (Interpreted frame)
 - java.io.FilterInputStream.read() @bci=4, line=83 (Interpreted frame)
 - java.io.FilterInputStream.read() @bci=4, line=83 (Interpreted frame)
 - org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed(java.io.InputStream) @bci=1, line=1988 (Interpreted frame)
 - org.apache.hadoop.hdfs.protocol.datatransfer.PipelineAck.readFields(java.io.InputStream) @bci=2, line=176 (Interpreted frame)
 - org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer$ResponseProcessor.run() @bci=75, line=796 (Interpreted frame)

Thread 10407: (state = BLOCKED)
 - java.lang.Thread.sleep(long) @bci=0 (Interpreted frame)
 - org.apache.hadoop.hdfs.LeaseRenewer.run(int) @bci=429, line=438 (Interpreted frame)
 - org.apache.hadoop.hdfs.LeaseRenewer.access$700(org.apache.hadoop.hdfs.LeaseRenewer, int) @bci=2, line=71 (Interpreted frame)
 - org.apache.hadoop.hdfs.LeaseRenewer$1.run() @bci=69, line=298 (Interpreted frame)
 - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame)

Thread 10406: (state = BLOCKED)
 - java.lang.Object.wait(long) @bci=0 (Interpreted frame)
 - org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run() @bci=265, line=502 (Interpreted frame)

Thread 6486: (state = BLOCKED)
 - sun.misc.Unsafe.park(boolean, long) @bci=0 (Interpreted frame)
 - java.util.concurrent.locks.LockSupport.park(java.lang.Object) @bci=14, line=186 (Interpreted frame)
 - java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await() @bci=42, line=2043 (Interpreted frame)
 - org.apache.hadoop.mapred.MapTask$MapOutputBuffer$SpillThread.run() @bci=47, line=1501 (Interpreted frame)

Thread 6453: (state = BLOCKED)
 - java.lang.Thread.sleep(long) @bci=0 (Interpreted frame)
 - org.apache.hadoop.hdfs.PeerCache.run() @bci=41, line=245 (Interpreted frame)
 - org.apache.hadoop.hdfs.PeerCache.access$000(org.apache.hadoop.hdfs.PeerCache) @bci=1, line=41 (Interpreted frame)
 - org.apache.hadoop.hdfs.PeerCache$1.run() @bci=4, line=119 (Interpreted frame)
 - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame)

Thread 6423: (state = BLOCKED)
 - java.lang.Object.wait(long) @bci=0 (Interpreted frame)
 - org.apache.hadoop.mapred.Task$TaskReporter.run() @bci=86, line=719 (Interpreted frame)
 - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame)

Thread 6422: (state = IN_NATIVE)
 - org.apache.hadoop.net.unix.DomainSocketWatcher.doPoll0(int, org.apache.hadoop.net.unix.DomainSocketWatcher$FdSet) @bci=0 (Interpreted frame)
 - org.apache.hadoop.net.unix.DomainSocketWatcher.access$800(int, org.apache.hadoop.net.unix.DomainSocketWatcher$FdSet) @bci=2, line=52 (Interpreted frame)
 - org.apache.hadoop.net.unix.DomainSocketWatcher$1.run() @bci=551, line=457 (Interpreted frame)
 - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame)

Thread 6421: (state = BLOCKED)
 - sun.misc.Unsafe.park(boolean, long) @bci=0 (Interpreted frame)
 - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) @bci=20, line=226 (Interpreted frame)
 - java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(long) @bci=68, line=2082 (Interpreted frame)
 - java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take() @bci=122, line=1090 (Interpreted frame)
 - java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take() @bci=1, line=807 (Interpreted frame)
 - java.util.concurrent.ThreadPoolExecutor.getTask() @bci=156, line=1068 (Interpreted frame)
 - java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker) @bci=26, line=1130 (Interpreted frame)
 - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line=615 (Interpreted frame)
 - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame)

Thread 6414: (state = BLOCKED)
 - sun.misc.Unsafe.park(boolean, long) @bci=0 (Interpreted frame)
 - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) @bci=20, line=226 (Compiled frame)
 - java.util.concurrent.SynchronousQueue$TransferStack.awaitFulfill(java.util.concurrent.SynchronousQueue$TransferStack$SNode, boolean, long) @bci=174, line=460 (Compiled frame)
 - java.util.concurrent.SynchronousQueue$TransferStack.transfer(java.lang.Object, boolean, long) @bci=102, line=359 (Interpreted frame)
 - java.util.concurrent.SynchronousQueue.poll(long, java.util.concurrent.TimeUnit) @bci=11, line=942 (Interpreted frame)
 - java.util.concurrent.ThreadPoolExecutor.getTask() @bci=141, line=1068 (Interpreted frame)
 - java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker) @bci=26, line=1130 (Interpreted frame)
 - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line=615 (Interpreted frame)
 - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame)

Thread 6413: (state = BLOCKED)
 - java.lang.Object.wait(long) @bci=0 (Interpreted frame)
 - org.apache.hadoop.ipc.Client$Connection.waitForWork() @bci=59, line=903 (Interpreted frame)
 - org.apache.hadoop.ipc.Client$Connection.run() @bci=55, line=948 (Interpreted frame)

Thread 6408: (state = BLOCKED)
 - java.lang.Object.wait(long) @bci=0 (Interpreted frame)
 - java.util.TimerThread.mainLoop() @bci=201, line=552 (Interpreted frame)
 - java.util.TimerThread.run() @bci=1, line=505 (Interpreted frame)

Thread 6406: (state = BLOCKED)
 - java.lang.Object.wait(long) @bci=0 (Interpreted frame)
 - java.lang.Object.wait() @bci=2, line=503 (Interpreted frame)
 - org.apache.hadoop.metrics2.impl.SinkQueue.waitForData() @bci=13, line=114 (Interpreted frame)
 - org.apache.hadoop.metrics2.impl.SinkQueue.consumeAll(org.apache.hadoop.metrics2.impl.SinkQueue$Consumer) @bci=1, line=83 (Interpreted frame)
 - org.apache.hadoop.metrics2.impl.MetricsSinkAdapter.publishMetricsFromQueue() @bci=46, line=127 (Interpreted frame)
 - org.apache.hadoop.metrics2.impl.MetricsSinkAdapter$1.run() @bci=4, line=86 (Interpreted frame)

Thread 6394: (state = BLOCKED)

Thread 6393: (state = BLOCKED)
 - java.lang.Object.wait(long) @bci=0 (Interpreted frame)
 - java.lang.ref.ReferenceQueue.remove(long) @bci=44, line=135 (Interpreted frame)
 - java.lang.ref.ReferenceQueue.remove() @bci=2, line=151 (Interpreted frame)
 - java.lang.ref.Finalizer$FinalizerThread.run() @bci=36, line=209 (Interpreted frame)

Thread 6392: (state = BLOCKED)
 - java.lang.Object.wait(long) @bci=0 (Interpreted frame)
 - java.lang.Object.wait() @bci=2, line=503 (Interpreted frame)
 - java.lang.ref.Reference$ReferenceHandler.run() @bci=46, line=133 (Interpreted frame)

Thread 6372: (state = IN_NATIVE)
 - sun.nio.ch.EPollArrayWrapper.epollWait(long, int, long, int) @bci=0 (Interpreted frame)
 - sun.nio.ch.EPollArrayWrapper.poll(long) @bci=18, line=269 (Interpreted frame)
 - sun.nio.ch.EPollSelectorImpl.doSelect(long) @bci=28, line=79 (Interpreted frame)
 - sun.nio.ch.SelectorImpl.lockAndDoSelect(long) @bci=37, line=87 (Interpreted frame)
 - sun.nio.ch.SelectorImpl.select(long) @bci=30, line=98 (Interpreted frame)
 - org.apache.hadoop.net.SocketIOWithTimeout$SelectorPool.select(java.nio.channels.SelectableChannel, int, long) @bci=46, line=335 (Interpreted frame)
 - org.apache.hadoop.net.SocketIOWithTimeout.doIO(java.nio.ByteBuffer, int) @bci=80, line=157 (Interpreted frame)
 - org.apache.hadoop.net.SocketInputStream.read(java.nio.ByteBuffer) @bci=6, line=161 (Interpreted frame)
 - org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.readChannelFully(java.nio.channels.ReadableByteChannel, java.nio.ByteBuffer) @bci=9, line=258 (Interpreted frame)
 - org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.doReadFully(java.nio.channels.ReadableByteChannel, java.io.InputStream, java.nio.ByteBuffer) @bci=6, line=209 (Interpreted frame)
 - org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.doRead(java.nio.channels.ReadableByteChannel, java.io.InputStream) @bci=293, line=171 (Interpreted frame)
 - org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver.receiveNextPacket(java.nio.channels.ReadableByteChannel) @bci=3, line=102 (Interpreted frame)
 - org.apache.hadoop.hdfs.RemoteBlockReader2.readNextPacket() @bci=8, line=173 (Interpreted frame)
 - org.apache.hadoop.hdfs.RemoteBlockReader2.read(byte[], int, int) @bci=27, line=138 (Interpreted frame)
 - org.apache.hadoop.hdfs.DFSInputStream$ByteArrayStrategy.doRead(org.apache.hadoop.hdfs.BlockReader, int, int, org.apache.hadoop.hdfs.DFSInputStream$ReadStatistics) @bci=7, line=683 (Interpreted frame)
 - org.apache.hadoop.hdfs.DFSInputStream.readBuffer(org.apache.hadoop.hdfs.DFSInputStream$ReaderStrategy, int, int, java.util.Map) @bci=14, line=739 (Interpreted frame)
 - org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(org.apache.hadoop.hdfs.DFSInputStream$ReaderStrategy, int, int) @bci=141, line=796 (Interpreted frame)
 - org.apache.hadoop.hdfs.DFSInputStream.read(byte[], int, int) @bci=15, line=837 (Interpreted frame)
 - java.io.DataInputStream.readFully(byte[], int, int) @bci=34, line=195 (Compiled frame)
 - org.apache.hadoop.io.DataOutputBuffer$Buffer.write(java.io.DataInput, int) @bci=62, line=70 (Interpreted frame)
 - org.apache.hadoop.io.DataOutputBuffer.write(java.io.DataInput, int) @bci=6, line=120 (Interpreted frame)
 - org.apache.hadoop.io.SequenceFile$Reader.next(org.apache.hadoop.io.DataOutputBuffer) @bci=43, line=2358 (Interpreted frame)
 - org.apache.hadoop.io.SequenceFile$Reader.next(org.apache.hadoop.io.Writable) @bci=77, line=2257 (Interpreted frame)
 - org.apache.hadoop.io.SequenceFile$Reader.next(org.apache.hadoop.io.Writable, org.apache.hadoop.io.Writable) @bci=52, line=2303 (Interpreted frame)
 - org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator.computeNext() @bci=44, line=81 (Interpreted frame)
 - org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterator.computeNext() @bci=1, line=37 (Interpreted frame)
 - com.google.common.collect.AbstractIterator.tryToComputeNext() @bci=9, line=143 (Interpreted frame)
 - com.google.common.collect.AbstractIterator.hasNext() @bci=61, line=138 (Interpreted frame)
 - org.apache.mahout.math.hadoop.stochasticsvd.qr.QRLastStep.loadNextQt() @bci=4, line=86 (Interpreted frame)
 - org.apache.mahout.math.hadoop.stochasticsvd.qr.QRLastStep.hasNext() @bci=36, line=112 (Compiled frame)
 - org.apache.mahout.math.hadoop.stochasticsvd.qr.QRLastStep.next() @bci=16, line=123 (Compiled frame)
 - org.apache.mahout.math.hadoop.stochasticsvd.BtJob$BtMapper.map(org.apache.hadoop.io.Writable, org.apache.mahout.math.VectorWritable, org.apache.hadoop.mapreduce.Mapper$Context) @bci=15, line=134 (Compiled frame)
 - org.apache.mahout.math.hadoop.stochasticsvd.BtJob$BtMapper.map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapreduce.Mapper$Context) @bci=10, line=103 (Compiled frame)
 - org.apache.hadoop.mapreduce.Mapper.run(org.apache.hadoop.mapreduce.Mapper$Context) @bci=22, line=145 (Compiled frame)
 - org.apache.hadoop.mapred.MapTask.runNewMapper(org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapreduce.split.JobSplit$TaskSplitIndex, org.apache.hadoop.mapred.TaskUmbilicalProtocol, org.apache.hadoop.mapred.Task$TaskReporter) @bci=228, line=764 (Interpreted frame)
 - org.apache.hadoop.mapred.MapTask.run(org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.TaskUmbilicalProtocol) @bci=148, line=340 (Interpreted frame)
 - org.apache.hadoop.mapred.YarnChild$2.run() @bci=29, line=167 (Interpreted frame)
 - java.security.AccessController.doPrivileged(java.security.PrivilegedExceptionAction, java.security.AccessControlContext) @bci=0 (Interpreted frame)
 - javax.security.auth.Subject.doAs(javax.security.auth.Subject, java.security.PrivilegedExceptionAction) @bci=42, line=415 (Interpreted frame)
 - org.apache.hadoop.security.UserGroupInformation.doAs(java.security.PrivilegedExceptionAction) @bci=14, line=1650 (Interpreted frame)
 - org.apache.hadoop.mapred.YarnChild.main(java.lang.String[]) @bci=514, line=162 (Interpreted frame)
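FWIW, reading the bottom of the main thread (6372): BtJob$BtMapper.map() ->
QRLastStep.loadNextQt() -> SequenceFileValueIterator -> DFSInputStream.read()
-> epollWait. So the mapper isn't spinning in the SSVD math at all; it is
parked on a socket read from a DataNode while pulling the next Qt block,
which fits the NameNode retry messages in the logs quoted below.

If it really is HDFS I/O, one workaround would be tightening the client
timeouts so the read fails (and the attempt gets retried) instead of hanging
for hours. A minimal sketch, assuming standard Hadoop 2.x client settings;
the class name and values are made up for illustration:

import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class FailFastHdfsRead {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Socket read timeout (ms) for data transfer from DataNodes.
    conf.setInt("dfs.client.socket-timeout", 60000);
    // Cap NameNode connect retries on timeout. The stuck job logged
    // maxRetries=45, which matches the default of this key.
    conf.setInt("ipc.client.connect.max.retries.on.timeouts", 3);
    FileSystem fs = FileSystem.get(conf);
    // Hypothetical path, just to exercise a read with the tightened settings.
    try (InputStream in = fs.open(new Path(args[0]))) {
      IOUtils.copyBytes(in, System.out, conf, false);
    }
  }
}

(The same keys could be passed to the SSVD job itself with -D on the command
line.)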




Sounds like some bug?

On Wed, Oct 15, 2014 at 3:24 PM, Yang <teddyyyy123@gmail.com> wrote:

>
> From the task attempts page (Attempt / Progress / State / Status / Node, then
> logs link, then Started / Finished / Elapsed):
>
> attempt_1413267265041_14045_m_000008_1  66.70  RUNNING  map > map  phxaishdc9dn1896.stratus.phx.ebay.com:50060
>   logs: https://phxaishdc9dn1896.stratus.phx.ebay.com:50060/node/containerlogs/container_1413267265041_14045_01_000113/yyang15
>   Wed, 15 Oct 2014 20:54:44 GMT / N/A / 1hrs, 27mins, 29sec
>
> attempt_1413267265041_14045_m_000031_0  66.70  RUNNING  map > map  phxaishdc9dn1440.stratus.phx.ebay.com:50060
>   logs: https://phxaishdc9dn1440.stratus.phx.ebay.com:50060/node/containerlogs/container_1413267265041_14045_01_000075/yyang15
>   Wed, 15 Oct 2014 15:49:19 GMT / N/A / 6hrs, 32mins, 53sec
>
> attempt_1413267265041_14045_m_000036_0  66.70  RUNNING  map > map  phxaishdc9dn0440.phx.ebay.com:50060
>   logs: https://phxaishdc9dn0440.phx.ebay.com:50060/node/containerlogs/container_1413267265041_14045_01_000079/yyang15
>   Wed, 15 Oct 2014 15:49:19 GMT / N/A / 6hrs, 32mins, 53sec
>
> attempt_1413267265041_14045_m_000070_0  66.70  RUNNING  map > map  phxaishdc9dn1137.stratus.phx.ebay.com:50060
>   logs: https://phxaishdc9dn1137.stratus.phx.ebay.com:50060/node/containerlogs/container_1413267265041_14045_01_000081/yyang15
>   Wed, 15 Oct 2014 15:49:19 GMT / N/A / 6hrs, 32mins, 54sec
>
> attempt_1413267265041_14045_m_000018_0  66.70  RUNNING  map > map  phxaishdc9dn1278.stratus.phx.ebay.com:50060
>   logs: https://phxaishdc9dn1278.stratus.phx.ebay.com:50060/node/containerlogs/container_1413267265041_14045_01_000012/yyang15
>   Wed, 15 Oct 2014 15:49:19 GMT / N/A / 6hrs, 32mins, 54sec
>
> The logs show:
>
> 2014-10-15 13:55:01,130 INFO [main] org.apache.hadoop.mapred.MapTask: kvstart = 268435452; length = 67108864
> 2014-10-15 13:55:01,278 INFO [main] org.apache.hadoop.io.compress.zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
> 2014-10-15 13:55:01,279 INFO [main] org.apache.hadoop.io.compress.CodecPool: Got brand-new decompressor [.gz]
> 2014-10-15 13:55:01,288 INFO [main] org.apache.hadoop.io.compress.CodecPool: Got brand-new decompressor [.gz]
> 2014-10-15 13:55:01,289 INFO [main] org.apache.hadoop.io.compress.CodecPool: Got brand-new decompressor [.gz]
> 2014-10-15 13:55:01,289 INFO [main] org.apache.hadoop.io.compress.CodecPool: Got brand-new decompressor [.gz]
> 2014-10-15 13:55:01,389 INFO [main] org.apache.hadoop.io.compress.CodecPool: Got brand-new decompressor [.deflate]
> 2014-10-15 13:55:01,501 INFO [main] org.apache.hadoop.conf.Configuration.deprecation: fs.default.name is deprecated. Instead, use fs.defaultFS
> 2014-10-15 13:55:01,557 INFO [main] org.apache.hadoop.io.compress.CodecPool: Got brand-new decompressor [.deflate]
> 2014-10-15 13:57:54,066 INFO [main] org.apache.hadoop.ipc.Client: Retrying connect to server: apollo-phx-nn.vip.ebay.com/10.115.201.75:8020. Already tried 0 time(s); maxRetries=45
> 2014-10-15 13:58:14,086 INFO [main] org.apache.hadoop.ipc.Client: Retrying connect to server: apollo-phx-nn.vip.ebay.com/10.115.201.75:8020. Already tried 1 time(s); maxRetries=45
> 2014-10-15 13:58:14,123 INFO [main] org.apache.hadoop.io.compress.CodecPool: Got brand-new compressor [.deflate]
> 2014-10-15 14:23:59,883 INFO [LeaseRenewer:yyang15@apollo-phx-nn.vip.ebay.com:8020] org.apache.hadoop.ipc.Client: Retrying connect to server: apollo-phx-nn.vip.ebay.com/10.115.201.75:8020. Already tried 0 time(s); maxRetries=45
> 2014-10-15 14:24:19,903 INFO [LeaseRenewer:yyang15@apollo-phx-nn.vip.ebay.com:8020] org.apache.hadoop.ipc.Client: Retrying connect to server: apollo-phx-nn.vip.ebay.com/10.115.201.75:8020. Already tried 1 time(s); maxRetries=45
> 2014-10-15 14:24:39,924 INFO [LeaseRenewer:yyang15@apollo-phx-nn.vip.ebay.com:8020] org.apache.hadoop.ipc.Client: Retrying connect to server: apollo-phx-nn.vip.ebay.com/10.115.201.75:8020. Already tried 2 time(s); maxRetries=45
>
>
> I actually killed one of the attempts and it restarted, but again froze at 66.70%.
>
>
