spark-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Chris Olivier <cjolivie...@apache.org>
Subject StackOverflowError for simple map (not to incubator mailing list)
Date Thu, 01 Nov 2018 20:17:57 GMT
(Sorry, first one sent to incubator maling list which probably doesn't come
here)

Hi, I have been stuck at this for a week.

I have a relatively simple dataframe like this:

+---------+---------+--------------------+-------------------+
|     item|  item_id|              target|              start|
+---------+---------+--------------------+-------------------+
|sensor123|sensor123|[0.005683, 0.0070...|2008-01-01 00:00:00|
|sensor249|sensor249|[0.009783, 0.0068...|2008-01-01 00:00:00|
|sensor379|sensor379|[0.001917, 0.0016...|2008-01-01 00:00:00|
| sensor52| sensor52|[0.016267, 0.0121...|2008-01-01 00:00:00|

target us a WrappedArray[Double]

This simple code runs on local spark but has stack overflow error on
EMR Spark.  I've tried playing with paritioning with no effect.

```scala
def transform(dataset: Dataset[_]): DataFrame = {
    var ds:Dataset[_] = dataset.
    var df:DataFrame = ds.toDF

    val sparkSession = dataset.sparkSession
    import sparkSession.implicits._

    val itemIndex:Int =
SparkJavaUtils.getColumnIndex(DataSources.FIELD_ITEM, df)
    val startIndex:Int =
SparkJavaUtils.getColumnIndex(DataSources.FIELD_START, df)
    val targetIndex:Int =
SparkJavaUtils.getColumnIndex(DataSources.FIELD_TARGET, df)

    val result = df.map(r => {
        val itemName = r.getAs[String](itemIndex)
        val start = r.getAs[Timestamp](startIndex)
        val targetArray = r.getAs[mutable.WrappedArray[Double]](targetIndex)
        (itemName, start, targetArray)
    })
    result.show

    // Get the schema ready
    val schema = StructType(
        Seq(
            StructField(DataSources.FIELD_ITEM, StringType, true,
Metadata.empty),
            StructField(DataSources.FIELD_START, TimestampType, true,
Metadata.empty),
            StructField(DataSources.FIELD_TARGET,
DataTypes.createArrayType(DoubleType), true, Metadata.empty)
        )
    )

    var nn = dataset.sqlContext.createDataFrame(result.toDF.rdd, schema)
    nn.show
    nn
}

```

Error is like:

Exception in thread "main" java.lang.StackOverflowError
	at scala.collection.IndexedSeqLike$Elements.next(IndexedSeqLike.scala:65)
	at scala.StringContext.standardInterpolator(StringContext.scala:123)
	at scala.StringContext.s(StringContext.scala:95)
	at org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext.freshName(CodeGenerator.scala:565)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:105)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:104)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:104)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$4.apply(WholeStageCodegenExec.scala:153)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$4.apply(WholeStageCodegenExec.scala:152)
	at scala.collection.immutable.Stream$$anonfun$map$1.apply(Stream.scala:418)
	at scala.collection.immutable.Stream$$anonfun$map$1.apply(Stream.scala:418)
	at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1233)
	at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1223)
	at scala.collection.immutable.Stream.drop(Stream.scala:858)
	at scala.collection.immutable.Stream.drop(Stream.scala:202)
	at scala.collection.LinearSeqOptimized$class.apply(LinearSeqOptimized.scala:64)
	at scala.collection.immutable.Stream.apply(Stream.scala:202)
	at org.apache.spark.sql.catalyst.expressions.BoundReference.doGenCode(BoundAttribute.scala:62)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:107)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:104)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:104)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$4.apply(WholeStageCodegenExec.scala:153)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$4.apply(WholeStageCodegenExec.scala:152)
	at scala.collection.immutable.Stream$$anonfun$map$1.apply(Stream.scala:418)
	at scala.collection.immutable.Stream$$anonfun$map$1.apply(Stream.scala:418)
	at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1233)
	at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1223)
	at scala.collection.immutable.Stream.drop(Stream.scala:858)
	at scala.collection.immutable.Stream.drop(Stream.scala:202)
	at scala.collection.LinearSeqOptimized$class.apply(LinearSeqOptimized.scala:64)
	at scala.collection.immutable.Stream.apply(Stream.scala:202)
	at org.apache.spark.sql.catalyst.expressions.BoundReference.doGenCode(BoundAttribute.scala:62)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:107)
	at org.apache.spark.sql.catalyst.expressions.Expression$$anonfun$genCode$2.apply(Expression.scala:104)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.sql.catalyst.expressions.Expression.genCode(Expression.scala:104)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$4.apply(WholeStageCodegenExec.scala:153)
	at org.apache.spark.sql.execution.CodegenSupport$$anonfun$4.apply(WholeStageCodegenExec.scala:152)
	at scala.collection.immutable.Stream$$anonfun$map$1.apply(Stream.scala:418)
	at scala.collection.immutable.Stream$$anonfun$map$1.apply(Stream.scala:418)

-Chris

Mime
View raw message