spark-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Xiangrui Meng <men...@gmail.com>
Subject Re: Stack overflow Error while executing spark SQL
Date Mon, 15 Dec 2014 19:57:56 GMT
Could you post the full stacktrace? It seems to be some recursive call
in parsing. -Xiangrui

On Tue, Dec 9, 2014 at 7:44 PM,  <jishnu.prathap@wipro.com> wrote:
> Hi
>
>
>
> I am getting a StackOverflowError:
>
> Exception in main java.lang.StackOverflowError
>
> scala.util.parsing.combinator.Parsers$$anon$3.apply(Parsers.scala:222)
>
>        at
> scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1.apply(Parsers.scala:254)
>
>        at
> scala.util.parsing.combinator.Parsers$$anon$3.apply(Parsers.scala:222)
>
>        at
> scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1.apply(Parsers.scala:254)
>
>        at
> scala.util.parsing.combinator.Parsers$Parser$$anonfun$append$1.apply(Parsers.scala:254)
>
>        at
> scala.util.parsing.combinator.Parsers$$anon$3.apply(Parsers.scala:222)
>
>
> while executing the following code
>
> sqlContext.sql("SELECT text FROM tweetTable LIMIT
> 10").collect().foreach(println)
>
>
>
> The complete code is from GitHub:
>
> https://github.com/databricks/reference-apps/blob/master/twitter_classifier/scala/src/main/scala/com/databricks/apps/twitter_classifier/ExamineAndTrain.scala
>
>
>
> import com.google.gson.{GsonBuilder, JsonParser}
>
> import org.apache.spark.mllib.clustering.KMeans
>
> import org.apache.spark.sql.SQLContext
>
> import org.apache.spark.{SparkConf, SparkContext}
>
> import org.apache.spark.mllib.clustering.KMeans
>
> /**
>
> * Examines the collected tweets and trains a model based on them.
>
> */
>
> object ExamineAndTrain {
>
> val jsonParser = new JsonParser()
>
> val gson = new GsonBuilder().setPrettyPrinting().create()
>
> def main(args: Array[String]) {
>
> // Process program arguments and set properties
>
> /*if (args.length < 3) {
>
> System.err.println("Usage: " + this.getClass.getSimpleName +
>
> " <tweetInput> <outputModelDir> <numClusters> <numIterations>")
>
> System.exit(1)
>
> }
>
> *
>
> */
>
>    val outputModelDir="C:\\MLModel"
>
>      val tweetInput="C:\\MLInput"
>
>        val numClusters=10
>
>        val numIterations=20
>
>
>
> //val Array(tweetInput, outputModelDir, Utils.IntParam(numClusters),
> Utils.IntParam(numIterations)) = args
>
>
>
> val conf = new
> SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[4]")
>
> val sc = new SparkContext(conf)
>
> val sqlContext = new SQLContext(sc)
>
> // Pretty print some of the tweets.
>
> val tweets = sc.textFile(tweetInput)
>
> println("------------Sample JSON Tweets-------")
>
> for (tweet <- tweets.take(5)) {
>
> println(gson.toJson(jsonParser.parse(tweet)))
>
> }
>
> val tweetTable = sqlContext.jsonFile(tweetInput).cache()
>
> tweetTable.registerTempTable("tweetTable")
>
> println("------Tweet table Schema---")
>
> tweetTable.printSchema()
>
> println("----Sample Tweet Text-----")
>
>
>
> sqlContext.sql("SELECT text FROM tweetTable LIMIT
> 10").collect().foreach(println)
>
>
>
>
>
>
>
> println("------Sample Lang, Name, text---")
>
> sqlContext.sql("SELECT user.lang, user.name, text FROM tweetTable LIMIT
> 1000").collect().foreach(println)
>
> println("------Total count by languages Lang, count(*)---")
>
> sqlContext.sql("SELECT user.lang, COUNT(*) as cnt FROM tweetTable GROUP BY
> user.lang ORDER BY cnt DESC LIMIT 25").collect.foreach(println)
>
> println("--- Training the model and persist it")
>
> val texts = sqlContext.sql("SELECT text from
> tweetTable").map(_.head.toString)
>
> // Cache the vectors RDD since it will be used for all the KMeans
> iterations.
>
> val vectors = texts.map(Utils.featurize).cache()
>
> vectors.count() // Calls an action on the RDD to populate the vectors cache.
>
> val model = KMeans.train(vectors, numClusters, numIterations)
>
> sc.makeRDD(model.clusterCenters,
> numClusters).saveAsObjectFile(outputModelDir)
>
> val some_tweets = texts.take(100)
>
> println("----Example tweets from the clusters")
>
> for (i <- 0 until numClusters) {
>
> println(s"\nCLUSTER $i:")
>
> some_tweets.foreach { t =>
>
> if (model.predict(Utils.featurize(t)) == i) {
>
> println(t)
>
> }
>
> }
>
> }
>
> }
>
> }
>
>
>
> Thanks & Regards
>
> Jishnu Menath Prathap
>
>
>
>

---------------------------------------------------------------------
To unsubscribe, e-mail: user-unsubscribe@spark.apache.org
For additional commands, e-mail: user-help@spark.apache.org


Mime
View raw message