spark-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Mikhailau, Alex" <Alex.Mikhai...@mlb.com>
Subject Spark 2.1.1 with Kinesis Receivers is failing to launch 50 active receivers with oversized cluster on EMR Yarn
Date Tue, 05 Sep 2017 19:39:19 GMT
Guys,

I have a Spark 2.1.1 job with Kinesis that is failing to launch 50 active receivers with
an oversized cluster on EMR YARN. It sometimes registers 16, sometimes 32, other times 48 receivers,
but never all 50.  Any help would be greatly appreciated.

Kinesis stream shards = 500

YARN EMR Cluster

Master m4.4xlarge 1
Core m4.2xlarge 15
Task m4.2xlarge 20

Spark Submit:

/usr/lib/spark/bin/spark-submit,--deploy-mode,cluster,--master,yarn,--conf,spark.streaming.stopGracefullyOnShutdown=true,--conf,spark.locality.wait=7500ms,--conf,spark.streaming.blockInterval=10000ms,--conf,spark.shuffle.consolidateFiles=true,--conf,spark.serializer=org.apache.spark.serializer.KryoSerializer,--conf,spark.closure.serializer=org.apache.spark.serializer.KryoSerializer,--conf,spark.dynamicAllocation.enabled=true,--conf,spark.scheduler.mode=FIFO,--conf,spark.ui.retainedJobs=50,--conf,spark.ui.retainedStages=50,--conf,spark.ui.retainedTasks=500,--conf,spark.worker.ui.retainedExecutors=50,--conf,spark.worker.ui.retainedDrivers=50,--conf,spark.sql.ui.retainedExecutions=50,--conf,spark.streaming.ui.retainedBatches=50,--conf,'spark.executor.extraJavaOptions=-XX:+AlwaysPreTouch
-XX:MaxPermSize=6G',--conf,spark.rdd.compress=true,--conf,spark.yarn.executor.memoryOverhead=5120,--executor-memory,15G,--conf,spark.task.maxFailures=8,--conf,spark.yarn.maxAppAttempts=4,--conf,'spark.yarn.max.executor.failures=200',--conf,spark.yarn.executor.failuresValidityInterval=1h,--conf,spark.yarn.am.attemptFailuresValidityInterval=1h,--conf,spark.speculation=false,--driver-java-options,'-XX:+AlwaysPreTouch
-XX:MaxPermSize=6G',--conf,spark.metrics.namespace=$env.$namespace.skynet.stream-concurrency,--class,com.mlbam.emr.StreamingJob,s3://s3jobsbucket/jars/
spark-assembly-${VERSION}.jar,--env,$env,--checkpoint-location,"hdfs:///var/log/spark/apps/checkpoints/app-$env",ActionOnFailure=CONTINUE

My Environment:

Java Home

/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.141-1.b16.32.amzn1.x86_64/jre

Java Version

1.8.0_141 (Oracle Corporation)

Scala Version

version 2.11.8

Spark Properties
Name

Value

spark.app.id

application_1504636247367_0007

spark.app.name

skynet-stream-concurrency-qa

spark.closure.serializer

org.apache.spark.serializer.KryoSerializer

spark.default.parallelism

800

spark.driver.extraClassPath

jsonevent-layout-1.7.jar:json-smart-1.1.1.jar:/home/hadoop/lib/*:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*

spark.driver.extraJavaOptions

-XX:+AlwaysPreTouch -XX:MaxPermSize=6G

spark.driver.extraLibraryPath

/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native

spark.driver.host

10.202.138.242

spark.driver.memory

22342M

spark.driver.port

38634

spark.dynamicAllocation.enabled

true

spark.dynamicAllocation.executorIdleTimeout

10m

spark.eventLog.dir

hdfs:///var/log/spark/apps

spark.eventLog.enabled

true

spark.executor.cores

16

spark.executor.extraClassPath

jsonevent-layout-1.7.jar:json-smart-1.1.1.jar:/home/hadoop/lib/*:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*

spark.executor.extraJavaOptions

-XX:+AlwaysPreTouch -XX:MaxPermSize=6G

spark.executor.extraLibraryPath

/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native

spark.executor.id

driver

spark.executor.memory

15G

spark.hadoop.yarn.timeline-service.enabled

false

spark.history.fs.logDirectory

hdfs:///var/log/spark/apps

spark.history.ui.port

18080

spark.kryo.classesToRegister

com.mlbam.emr.UserSessions,com.mlbam.emr.StreamSampleEvent

spark.locality.wait

7500ms

spark.master

yarn

spark.metrics.namespace

qa.mlbam.skynet.stream-concurrency

spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS

ip-10-202-138-87.mlbam.qa.us-east-1.bamgrid.net

spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES

http://ip-10-202-138-87.mlbam.qa.us-east-1.bamgrid.net:20888/proxy/application_1504636247367_0007

spark.rdd.compress

true

spark.scheduler.mode

FIFO

spark.serializer

org.apache.spark.serializer.KryoSerializer

spark.shuffle.consolidateFiles

true

spark.shuffle.service.enabled

true

spark.speculation

false

spark.sql.hive.metastore.sharedPrefixes

com.amazonaws.services.dynamodbv2

spark.sql.ui.retainedExecutions

50

spark.sql.warehouse.dir

hdfs:///user/spark/warehouse

spark.streaming.backpressure.enabled

true

spark.streaming.blockInterval

10000ms

spark.streaming.stopGracefullyOnShutdown

true

spark.streaming.ui.retainedBatches

50

spark.submit.deployMode

cluster

spark.task.maxFailures

8

spark.ui.filters

org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter

spark.ui.port

0

spark.ui.retainedJobs

50

spark.ui.retainedStages

50

spark.ui.retainedTasks

500

spark.worker.ui.retainedDrivers

50

spark.worker.ui.retainedExecutors

50

spark.yarn.am.attemptFailuresValidityInterval

1h

spark.yarn.app.container.log.dir

/var/log/hadoop-yarn/containers/application_1504636247367_0007/container_1504636247367_0007_01_000001

spark.yarn.app.id

application_1504636247367_0007

spark.yarn.executor.failuresValidityInterval

1h

spark.yarn.executor.memoryOverhead

5120

spark.yarn.historyServer.address

ip-10-202-138-87.mlbam.qa.us-east-1.bamgrid.net:18080

spark.yarn.max.executor.failures

200

spark.yarn.maxAppAttempts

4





Mime
View raw message