spark-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Alexander Pivovarov <apivova...@gmail.com>
Subject Re: Spark fails after 6000s because of akka
Date Sun, 20 Dec 2015 19:28:44 GMT
it can also fail with the following message

Exception in thread "main" org.apache.spark.SparkException: Job
aborted due to stage failure: Task 133 in stage 33.1 failed 4 times,
most recent failure: Lost task 133.3 in stage 33.1 (TID 172737,
ip-10-0-25-2.ec2.internal): java.io.IOException: Failed to connect to
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
	at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
	at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
	at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
	at java.util.concurrent.FutureTask.run(FutureTask.java:262)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.ConnectException: Connection refused:
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
	at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
	at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
	at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
	at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
	at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
	... 1 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1283)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1271)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1270)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1270)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:697)
	at scala.Option.foreach(Option.scala:236)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:697)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1496)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1458)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1447)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1824)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1837)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1914)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1430)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1409)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:310)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1409)
	at com.radius.distiller.components.CondenseRecords.saveValidationQa(CondenseRecords.scala:65)
	at com.radius.distiller.Distiller.runCondenseRecords(Distiller.scala:49)
	at com.radius.distiller.Execute$.run(Execute.scala:56)
	at com.radius.distiller.Execute$.main(Execute.scala:33)
	at com.radius.distiller.Execute.main(Execute.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:606)
	at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:674)
	at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
	at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.IOException: Failed to connect to
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:193)
	at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:156)
	at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:88)
	at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
	at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
	at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
	at java.util.concurrent.FutureTask.run(FutureTask.java:262)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.net.ConnectException: Connection refused:
ip-10-0-25-2.ec2.internal/10.0.25.2:48048
	at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
	at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)
	at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
	at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
	at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
	at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
	at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
	at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
	... 1 more


On Sun, Dec 20, 2015 at 10:42 AM, Alexander Pivovarov <apivovarov@gmail.com>
wrote:

> I run Spark 1.5.2 on YARN (EMR)
>
> I noticed that my long running jobs always failed after 1h 40 min  (6000s)
> with the exceptions below.
>
> Then I found that Spark has spark.akka.heartbeat.pauses=6000s by default
>
> I changed the settings to the following and it solve my issue.
>
> "spark.akka.heartbeat.pauses": "60000s",
> "spark.akka.heartbeat.interval": "10000s"
>
>
>
> RROR ErrorMonitor - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6]
shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
> 	at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
> 	at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
> 	at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
> 	at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> 	at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> 	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
> 	at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
> 	at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
> 	at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
> 	at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
> 	at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
> 	at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
> 	at akka.actor.ActorCell.invoke(ActorCell.scala:487)
> 	at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
> 	at akka.dispatch.Mailbox.run(Mailbox.scala:220)
> 	at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
> 	at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> 	at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> 	at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> 	at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6]
shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
> 	at com.google.protobuf.ByteString.copyFrom(ByteString.java:192)
> 	at com.google.protobuf.ByteString.copyFrom(ByteString.java:204)
> 	at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
> 	at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> 	at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> 	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
> 	at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
> 	at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
> 	at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
> 	at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
> 	at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
> 	at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
> 	at akka.actor.ActorCell.invoke(ActorCell.scala:487)
> 	at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
> 	at akka.dispatch.Mailbox.run(Mailbox.scala:220)
> 	at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
> 	at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> 	at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> 	at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> 	at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-5]
shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
> 	at java.util.Arrays.copyOf(Arrays.java:2271)
> 	at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
> 	at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
> 	at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
> 	at java.io.ObjectOutputStream$BlockDataOutputStream.drain(ObjectOutputStream.java:1876)
> 	at java.io.ObjectOutputStream$BlockDataOutputStream.setBlockDataMode(ObjectOutputStream.java:1785)
> 	at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1188)
> 	at java.io.ObjectOutputStream.writeObject(ObjectOutputStream.java:347)
> 	at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply$mcV$sp(Serializer.scala:129)
> 	at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
> 	at akka.serialization.JavaSerializer$$anonfun$toBinary$1.apply(Serializer.scala:129)
> 	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
> 	at akka.serialization.JavaSerializer.toBinary(Serializer.scala:129)
> 	at akka.remote.MessageSerializer$.serialize(MessageSerializer.scala:36)
> 	at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> 	at akka.remote.EndpointWriter$$anonfun$serializeMessage$1.apply(Endpoint.scala:843)
> 	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
> 	at akka.remote.EndpointWriter.serializeMessage(Endpoint.scala:842)
> 	at akka.remote.EndpointWriter.writeSend(Endpoint.scala:743)
> 	at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
> 	at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
> 	at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
> 	at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
> 	at akka.actor.ActorCell.invoke(ActorCell.scala:487)
> 	at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
> 	at akka.dispatch.Mailbox.run(Mailbox.scala:220)
> 	at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
> 	at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> 	at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> 	at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> 	at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> ERROR ActorSystemImpl - Uncaught fatal error from thread [sparkDriver-akka.remote.default-remote-dispatcher-6]
shutting down ActorSystem [sparkDriver]
> java.lang.OutOfMemoryError: Java heap space
> 	at com.google.protobuf.AbstractMessageLite.toByteArray(AbstractMessageLite.java:62)
> 	at akka.remote.transport.AkkaPduProtobufCodec$.constructMessage(AkkaPduCodec.scala:138)
> 	at akka.remote.EndpointWriter.writeSend(Endpoint.scala:740)
> 	at akka.remote.EndpointWriter$$anonfun$2.applyOrElse(Endpoint.scala:718)
> 	at akka.actor.Actor$class.aroundReceive(Actor.scala:467)
> 	at akka.remote.EndpointActor.aroundReceive(Endpoint.scala:411)
> 	at akka.actor.ActorCell.receiveMessage(ActorCell.scala:516)
> 	at akka.actor.ActorCell.invoke(ActorCell.scala:487)
> 	at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:238)
> 	at akka.dispatch.Mailbox.run(Mailbox.scala:220)
> 	at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:397)
> 	at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> 	at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> 	at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
>
> at
> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>
>

Mime
View raw message