spark-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "vivek.ys" <vivek...@gmail.com>
Subject Broadcast RDD Lookup
Date Thu, 01 May 2014 08:36:52 GMT
Hi All,
    I am facing an issue while performing the lookup. Please guide me on
where the mistake is.

// Reads /vives/cluster2/day/users and turns every line into an (Int, Int) pair:
//   - first element:  the second comma-separated field, cut at the first ')' and trimmed
//   - second element: the first comma-separated field, taking the text after the first '('
// Presumably each line looks like "(a, b)", which yields the pair (b, a) — TODO confirm
// against the actual file contents. Lines without a ',' / '(' or with non-numeric fields
// are not handled here (the index and toInt calls would throw).
val userCluster = sc.textFile("/vives/cluster2/day/users").map(_ match {
    case line : String => (line.split(',')(1).split(')')(0).trim.toInt,
line.split(',')(0).split('(')(1).toInt)
  })

// NOTE(review): this broadcasts the value produced by textFile(...).map(...) itself
// (presumably an RDD handle), not the pairs it contains. The trace below fails inside
// PairRDDFunctions.lookup; it looks like the broadcast should carry a local collection
// instead (e.g. userCluster.collectAsMap()) — confirm against the Spark docs.
val userClusterBroadCast = sc.broadcast(userCluster)

// Reads /vives/cluster2/day/sites and turns every line into an (Int, Int) pair:
//   - first element:  the second comma-separated field, cut at the first ')' and trimmed
//   - second element: the first comma-separated field, taking the text after the first '('
// Presumably each line looks like "(a, b)", which yields the pair (b, a) — TODO confirm
// against the actual file contents. Malformed lines are not handled here.
val productCluster = sc.textFile("/vives/cluster2/day/sites").map(_ match {
    case line : String => (line.split(',')(1).split(')')(0).trim.toInt,
line.split(',')(0).split('(')(1).toInt)
  })

// NOTE(review): this broadcasts the value produced by textFile(...).map(...) itself
// (presumably an RDD handle), not the pairs it contains. It looks like a local
// collection (e.g. productCluster.collectAsMap()) is what should be broadcast — confirm.
val productClusterBroadCast = sc.broadcast(productCluster)

  // Splits every record of `data` on tab characters and, for records with exactly three
  // fields (user, item, rate), emits:
  //   key   = (first lookup result for user.toInt, first lookup result for item.toInt)
  //   value = (user.toInt, item.toInt, rate.toDouble)
  // The result is persisted with StorageLevel.MEMORY_AND_DISK.
  // Records that do not split into exactly three fields match no case here, and an empty
  // lookup result would make the (0) index fail.
  // NOTE(review): both lookup(...) calls are made on the broadcast values from inside
  // data.map, and the trace below reports scala.MatchError: null from
  // PairRDDFunctions.lookup at this closure. Presumably RDD operations cannot be invoked
  // from within another RDD's transformation; broadcasting a collected Map and indexing
  // it here looks like the intended approach — confirm against the Spark docs.
  val nCut = data.map(_.split('\t') match {
    case Array(user, item, rate) =>
((userClusterBroadCast.value.lookup(user.toInt)(0),
      productClusterBroadCast.value.lookup(item.toInt)(0)), (user.toInt,
item.toInt, rate.toDouble))
  }).persist(StorageLevel.MEMORY_AND_DISK)

When I try this, I get a scala.MatchError: null, with the following stack trace:

scala.MatchError: null
	at org.apache.spark.rdd.PairRDDFunctions.lookup(PairRDDFunctions.scala:550)
	at
$line27.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:34)
	at
$line27.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:33)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
	at scala.collection.Iterator$class.foreach(Iterator.scala:727)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
	at
scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
	at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:75)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
	at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:161)
	at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:102)
	at org.apache.spark.scheduler.Task.run(Task.scala:53)
	at
org.apache.spark.executor.Executor$TaskRunner$$anonfun$run$1.apply$mcV$sp(Executor.scala:213)
	at
org.apache.spark.deploy.SparkHadoopUtil.runAsUser(SparkHadoopUtil.scala:49)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:178)
	at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
	at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:744)




--
View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/Broadcst-RDD-Lookup-tp5142.html
Sent from the Apache Spark User List mailing list archive at Nabble.com.

Mime
View raw message