[ https://issues.apache.org/jira/browse/SPARK-5707?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Cheng Lian updated SPARK-5707:
------------------------------
Description:
Exception thrown:
{noformat}
org.apache.spark.SparkException: Job aborted due to stage failure: Task 13 in stage 133.0 failed 4 times, most recent failure: Lost task 13.3 in stage 133.0 (TID 3066, cdh52-node2): java.io.IOException: com.esotericsoftware.kryo.KryoException: Unable to find class: __wrapper$1$81257352e1c844aebf09cb84fe9e7459.__wrapper$1$81257352e1c844aebf09cb84fe9e7459$SpecificRow$1
Serialization trace:
hashTable (org.apache.spark.sql.execution.joins.UniqueKeyHashedRelation)
at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1011)
at org.apache.spark.broadcast.TorrentBroadcast.readBroadcastBlock(TorrentBroadcast.scala:164)
at org.apache.spark.broadcast.TorrentBroadcast._value$lzycompute(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast._value(TorrentBroadcast.scala:64)
at org.apache.spark.broadcast.TorrentBroadcast.getValue(TorrentBroadcast.scala:87)
at org.apache.spark.broadcast.Broadcast.value(Broadcast.scala:70)
at org.apache.spark.sql.execution.joins.BroadcastHashJoin$$anonfun$3.apply(BroadcastHashJoin.scala:62)
at org.apache.spark.sql.execution.joins.BroadcastHashJoin$$anonfun$3.apply(BroadcastHashJoin.scala:61)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:601)
at org.apache.spark.rdd.RDD$$anonfun$13.apply(RDD.scala:601)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.rdd.CartesianRDD.compute(CartesianRDD.scala:75)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.rdd.MappedRDD.compute(MappedRDD.scala:31)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.rdd.CartesianRDD.compute(CartesianRDD.scala:75)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:263)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:230)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
at org.apache.spark.scheduler.Task.run(Task.scala:56)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:196)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
{noformat}
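The trace suggests that, with spark.sql.codegen enabled, the generated SpecificRow class backing the broadcast hash table exists only in the driver's classloader, so Kryo on the executors cannot resolve it while deserializing the broadcast UniqueKeyHashedRelation. A minimal sketch of the triggering setup (hedged: the table names and context setup below are illustrative placeholders, not taken from the original job):
{code:scala}
// Hypothetical reproduction sketch (Spark 1.2 style). Any query that plans a
// BroadcastHashJoin should exercise the same deserialization path; the table
// names here are placeholders.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

val conf = new SparkConf()
  .setAppName("SPARK-5707-repro")
  .set("spark.sql.codegen", "true") // runtime code generation on
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
val sc = new SparkContext(conf)
val hiveContext = new HiveContext(sc)

// One side small enough to be broadcast: executors must deserialize the
// broadcast hash relation, whose rows are instances of a class generated
// on the driver, and fail with the Kryo ClassNotFound above.
hiveContext.sql(
  "SELECT * FROM big_table b JOIN small_table s ON b.key = s.key"
).collect()
{code}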
SQL:
{code:sql}
INSERT INTO TABLE ${hiveconf:TEMP_TABLE}
SELECT
s_store_name,
pr_review_date,
pr_review_content
FROM (
--select store_name for stores with flat or declining sales in 3 consecutive months.
SELECT s_store_name
FROM store s
JOIN (
-- linear regression part
SELECT
temp.cat AS cat,
--SUM(temp.x)as sumX,
--SUM(temp.y)as sumY,
--SUM(temp.xy)as sumXY,
--SUM(temp.xx)as sumXSquared,
--count(temp.x) as N,
--N * sumXY - sumX * sumY AS numerator,
--N * sumXSquared - sumX*sumX AS denom
--numerator / denom as slope,
--(sumY - slope * sumX) / N as intercept
--(count(temp.x) * SUM(temp.xy) - SUM(temp.x) * SUM(temp.y)) AS numerator,
--(count(temp.x) * SUM(temp.xx) - SUM(temp.x) * SUM(temp.x)) AS denom
--numerator / denom as slope,
--(sumY - slope * sumX) / N as intercept
((count(temp.x) * SUM(temp.xy) - SUM(temp.x) * SUM(temp.y)) / (count(temp.x) * SUM(temp.xx)
- SUM(temp.x) * SUM(temp.x)) ) as slope,
(SUM(temp.y) - ((count(temp.x) * SUM(temp.xy) - SUM(temp.x) * SUM(temp.y)) / (count(temp.x)
* SUM(temp.xx) - SUM(temp.x) * SUM(temp.x)) ) * SUM(temp.x)) / count(temp.x) as intercept
FROM (
SELECT
s.ss_store_sk AS cat,
s.ss_sold_date_sk AS x,
SUM(s.ss_net_paid) AS y,
s.ss_sold_date_sk * SUM(s.ss_net_paid) AS xy,
s.ss_sold_date_sk*s.ss_sold_date_sk AS xx
FROM store_sales s
--select date range
LEFT SEMI JOIN (
SELECT d_date_sk
FROM date_dim d
WHERE d.d_date >= '${hiveconf:q18_startDate}'
AND d.d_date <= '${hiveconf:q18_endDate}'
) dd ON ( s.ss_sold_date_sk=dd.d_date_sk )
WHERE s.ss_store_sk <= 18
GROUP BY s.ss_store_sk, s.ss_sold_date_sk
) temp
GROUP BY temp.cat
) c on s.s_store_sk = c.cat
WHERE c.slope < 0
) tmp
JOIN product_reviews pr on (true)
WHERE instr(pr.pr_review_content, tmp.s_store_name) > 0
{code}
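For reference, the slope and intercept expressions in the subquery above are the standard least-squares formulas over each store's (x, y) points; the commented-out aliases spell out the intermediate sums that the live expressions inline. A standalone Scala sketch of the same arithmetic (variable names mirror the query's aliases; the sample input is made up):
{code:scala}
// Simple linear regression matching the SQL's slope/intercept expressions:
//   slope     = (N * sumXY - sumX * sumY) / (N * sumXX - sumX * sumX)
//   intercept = (sumY - slope * sumX) / N
def fit(points: Seq[(Double, Double)]): (Double, Double) = {
  val n     = points.size.toDouble                    // count(temp.x)
  val sumX  = points.map(_._1).sum                    // SUM(temp.x)
  val sumY  = points.map(_._2).sum                    // SUM(temp.y)
  val sumXY = points.map { case (x, y) => x * y }.sum // SUM(temp.xy)
  val sumXX = points.map { case (x, _) => x * x }.sum // SUM(temp.xx)
  val slope     = (n * sumXY - sumX * sumY) / (n * sumXX - sumX * sumX)
  val intercept = (sumY - slope * sumX) / n
  (slope, intercept)
}

// fit(Seq((1.0, 2.0), (2.0, 4.1), (3.0, 5.9))) ≈ (1.95, 0.1);
// the outer query keeps only stores where the fitted slope is negative.
{code}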
> Enabling spark.sql.codegen throws ClassNotFound exception
> ---------------------------------------------------------
>
> Key: SPARK-5707
> URL: https://issues.apache.org/jira/browse/SPARK-5707
> Project: Spark
> Issue Type: Bug
> Components: SQL
> Affects Versions: 1.2.0
> Environment: yarn-client mode, spark.sql.codegen=true
> Reporter: Yi Yao
>