spark-user mailing list archives

From: Jerry <jerry.c...@gmail.com>
Subject: Another issue with using lag and lead with data frames
Date: Fri, 14 Aug 2015 16:50:59 GMT
So it seems like DataFrames aren't going to give me a break and just work. The
expression now evaluates, but it blows up when it hits a null value, and when
I supply the default value as a string literal it can't work out the correct
data type. Let me know if anyone has a workaround. PLEASE HELP ME!!! THIS IS
DRIVING ME NUTS! Below is what I used:

JSON:
{"A":"a"},
{"A":"c"},
{"A":"B"},
{"A":"d"},
{"A":"A"},
{"A":null}
Reading the JSON:
df = sqlContext.jsonFile("/home/........./Desktop/trash.json")
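
Side note: jsonFile is deprecated as of Spark 1.4; the DataFrameReader
equivalent in Scala would be something like this, with the path elided the
same way:

val df = sqlContext.read.json("/home/........./Desktop/trash.json")
df.printSchema()  // expect a single nullable string column A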


CASE 1 (no default):

$ dfb = df.selectExpr("lag(A,1)")
$ dfb.show()
java.lang.NullPointerException
    at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:57)
    at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
    at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
    at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
    at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
    at scala.collection.AbstractIterator.to(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
    at java.lang.Thread.run(Thread.java:662)
15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID 19,
localhost): java.lang.NullPointerException
    ... (same stack trace as above)

15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1
times; aborting job
15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose tasks
have all completed, from pool
15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at <console>:24)
failed in 0.069 s
15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24,
took 0.112457 s
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage
15.0 (TID 19, localhost): java.lang.NullPointerException
    ... (same stack trace as above)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
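
For what it's worth, my guess on CASE 1 is that Hive's GenericUDFLeadLag is
choking on the {"A":null} record. A quick way to test that theory (untested
sketch, Scala; "missing" is just a placeholder value) is to fill the nulls
before applying lag:

val dfNoNulls = df.na.fill("missing")        // replaces nulls in string columns (Spark 1.3.1+)
val dfb2 = dfNoNulls.selectExpr("lag(A,1)")  // same expression as above
dfb2.show()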


CASE 2 (with default):

$ dfb = df.selectExpr("lag(A,1,'x')")
$ dfb.show()

java.lang.ClassCastException: java.lang.String cannot be cast to org.apache.hadoop.io.Text
    at org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector.copyObject(WritableStringObjectInspector.java:36)
    at org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:298)
    at org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.copyToStandardObject(ObjectInspectorUtils.java:251)
    at org.apache.hadoop.hive.ql.udf.generic.GenericUDFLeadLag.evaluate(GenericUDFLeadLag.java:53)
    at org.apache.spark.sql.hive.HiveGenericUdf.eval(hiveUdfs.scala:188)
    at org.apache.spark.sql.catalyst.expressions.Alias.eval(namedExpressions.scala:118)
    at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:68)
    at org.apache.spark.sql.catalyst.expressions.InterpretedMutableProjection.apply(Projection.scala:52)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
    at scala.collection.AbstractIterator.to(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$3.apply(SparkPlan.scala:143)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
    at org.apache.spark.scheduler.Task.run(Task.scala:70)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
    at java.util.concurrent.ThreadPoolExecutor$Worker.runTask(ThreadPoolExecutor.java:895)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:918)
    at java.lang.Thread.run(Thread.java:662)
15/08/14 09:16:09 WARN TaskSetManager: Lost task 0.0 in stage 14.0 (TID 18,
localhost): java.lang.ClassCastException: java.lang.String cannot be cast
to org.apache.hadoop.io.Text
    ... (same ClassCastException stack trace as above)

15/08/14 09:16:09 ERROR TaskSetManager: Task 0 in stage 14.0 failed 1
times; aborting job
15/08/14 09:16:09 INFO TaskSchedulerImpl: Removed TaskSet 14.0, whose tasks
have all completed, from pool
15/08/14 09:16:09 INFO TaskSchedulerImpl: Cancelling stage 14
15/08/14 09:16:09 INFO DAGScheduler: ResultStage 14 (show at <console>:24)
failed in 0.082 s
15/08/14 09:16:09 INFO DAGScheduler: Job 14 failed: show at <console>:24,
took 0.137699 s
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_25_piece0 on
localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_27_piece0 on
localhost:33504 in memory (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:16:10 INFO BlockManagerInfo: Removed broadcast_26_piece0 on
localhost:33504 in memory (size: 19.3 KB, free: 264.9 MB)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
in stage 14.0 failed 1 times, most recent failure: Lost task 0.0 in stage
14.0 (TID 18, localhost): java.lang.ClassCastException: java.lang.String
cannot be cast to org.apache.hadoop.io.Text
    ... (same ClassCastException stack trace as above)

Driver stacktrace:
    ... (identical to the driver stack trace in CASE 1)
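
The only workaround I can come up with for CASE 2 (untested sketch, Scala) is
to skip selectExpr and use the DataFrame window-function API added in 1.4,
which takes a typed default instead of pushing a raw string literal into the
Hive UDF. Window functions need a HiveContext here, and ordering by A is only
to make the example run; real data would need a proper ordering column:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.lag

val w = Window.orderBy("A")
val dfb = df.select(lag("A", 1, "x").over(w).as("lag_A"))
dfb.show()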


scala> dfb = df.selectExpr("lag(A,1)")
dfb: org.apache.spark.sql.DataFrame = ['lag(A,1): string]

scala> dfb.show()
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(243712) called with
curMem=645979, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28 stored as values in
memory (estimated size 238.0 KB, free 264.1 MB)
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(19750) called with
curMem=889691, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_28_piece0 stored as
bytes in memory (estimated size 19.3 KB, free 264.1 MB)
15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_28_piece0 in
memory on localhost:33504 (size: 19.3 KB, free: 264.9 MB)
15/08/14 09:17:29 INFO SparkContext: Created broadcast 28 from show at
<console>:24
15/08/14 09:17:29 INFO FileInputFormat: Total input paths to process : 1
15/08/14 09:17:29 INFO SparkContext: Starting job: show at <console>:24
15/08/14 09:17:29 INFO DAGScheduler: Got job 15 (show at <console>:24) with
1 output partitions (allowLocal=false)
15/08/14 09:17:29 INFO DAGScheduler: Final stage: ResultStage 15(show at
<console>:24)
15/08/14 09:17:29 INFO DAGScheduler: Parents of final stage: List()
15/08/14 09:17:29 INFO DAGScheduler: Missing parents: List()
15/08/14 09:17:29 INFO DAGScheduler: Submitting ResultStage 15
(MapPartitionsRDD[61] at show at <console>:24), which has no missing parents
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(5936) called with
curMem=909441, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29 stored as values in
memory (estimated size 5.8 KB, free 264.1 MB)
15/08/14 09:17:29 INFO MemoryStore: ensureFreeSpace(3259) called with
curMem=915377, maxMem=277842493
15/08/14 09:17:29 INFO MemoryStore: Block broadcast_29_piece0 stored as
bytes in memory (estimated size 3.2 KB, free 264.1 MB)
15/08/14 09:17:29 INFO BlockManagerInfo: Added broadcast_29_piece0 in
memory on localhost:33504 (size: 3.2 KB, free: 264.9 MB)
15/08/14 09:17:29 INFO SparkContext: Created broadcast 29 from broadcast at
DAGScheduler.scala:874
15/08/14 09:17:29 INFO DAGScheduler: Submitting 1 missing tasks from
ResultStage 15 (MapPartitionsRDD[61] at show at <console>:24)
15/08/14 09:17:29 INFO TaskSchedulerImpl: Adding task set 15.0 with 1 tasks
15/08/14 09:17:29 INFO TaskSetManager: Starting task 0.0 in stage 15.0 (TID
19, localhost, PROCESS_LOCAL, 1409 bytes)
15/08/14 09:17:29 INFO Executor: Running task 0.0 in stage 15.0 (TID 19)
15/08/14 09:17:29 INFO HadoopRDD: Input split:
file:/home/adminz/Desktop/trash.json:0+33
15/08/14 09:17:29 ERROR Executor: Exception in task 0.0 in stage 15.0 (TID
19)
java.lang.NullPointerException
    ... (same NullPointerException stack trace as in CASE 1)
15/08/14 09:17:29 WARN TaskSetManager: Lost task 0.0 in stage 15.0 (TID 19,
localhost): java.lang.NullPointerException
    ... (same NullPointerException stack trace as in CASE 1)

15/08/14 09:17:29 ERROR TaskSetManager: Task 0 in stage 15.0 failed 1
times; aborting job
15/08/14 09:17:29 INFO TaskSchedulerImpl: Removed TaskSet 15.0, whose tasks
have all completed, from pool
15/08/14 09:17:29 INFO TaskSchedulerImpl: Cancelling stage 15
15/08/14 09:17:29 INFO DAGScheduler: ResultStage 15 (show at <console>:24)
failed in 0.069 s
15/08/14 09:17:29 INFO DAGScheduler: Job 15 failed: show at <console>:24,
took 0.112457 s
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage
15.0 (TID 19, localhost): java.lang.NullPointerException
    ... (same NullPointerException stack trace as in CASE 1)

Driver stacktrace:
    ... (identical to the driver stack trace in CASE 1)
