spark-issues mailing list archives

From "Yi Zhou (JIRA)" <j...@apache.org>
Subject [jira] [Commented] (SPARK-8428) TimSort Comparison method violates its general contract with CLUSTER BY
Date Fri, 06 May 2016 02:17:12 GMT

    [ https://issues.apache.org/jira/browse/SPARK-8428?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15273534#comment-15273534 ]

Yi Zhou commented on SPARK-8428:
--------------------------------

We found a similar issue with Spark 1.6.1 in our larger-data-size test; the details are posted below. We then tried increasing spark.sql.shuffle.partitions to resolve it (a sketch of that configuration change follows the stack trace).

{code}
CREATE TABLE q26_spark_sql_run_query_0_temp (
  cid  BIGINT,
  id1  DOUBLE,
  id2  DOUBLE,
  id3  DOUBLE,
  id4  DOUBLE,
  id5  DOUBLE,
  id6  DOUBLE,
  id7  DOUBLE,
  id8  DOUBLE,
  id9  DOUBLE,
  id10 DOUBLE,
  id11 DOUBLE,
  id12 DOUBLE,
  id13 DOUBLE,
  id14 DOUBLE,
  id15 DOUBLE
);

INSERT INTO TABLE q26_spark_sql_run_query_0_temp
SELECT
  ss.ss_customer_sk AS cid,
  count(CASE WHEN i.i_class_id=1  THEN 1 ELSE NULL END) AS id1,
  count(CASE WHEN i.i_class_id=2  THEN 1 ELSE NULL END) AS id2,
  count(CASE WHEN i.i_class_id=3  THEN 1 ELSE NULL END) AS id3,
  count(CASE WHEN i.i_class_id=4  THEN 1 ELSE NULL END) AS id4,
  count(CASE WHEN i.i_class_id=5  THEN 1 ELSE NULL END) AS id5,
  count(CASE WHEN i.i_class_id=6  THEN 1 ELSE NULL END) AS id6,
  count(CASE WHEN i.i_class_id=7  THEN 1 ELSE NULL END) AS id7,
  count(CASE WHEN i.i_class_id=8  THEN 1 ELSE NULL END) AS id8,
  count(CASE WHEN i.i_class_id=9  THEN 1 ELSE NULL END) AS id9,
  count(CASE WHEN i.i_class_id=10 THEN 1 ELSE NULL END) AS id10,
  count(CASE WHEN i.i_class_id=11 THEN 1 ELSE NULL END) AS id11,
  count(CASE WHEN i.i_class_id=12 THEN 1 ELSE NULL END) AS id12,
  count(CASE WHEN i.i_class_id=13 THEN 1 ELSE NULL END) AS id13,
  count(CASE WHEN i.i_class_id=14 THEN 1 ELSE NULL END) AS id14,
  count(CASE WHEN i.i_class_id=15 THEN 1 ELSE NULL END) AS id15
FROM store_sales ss
INNER JOIN item i
  ON (ss.ss_item_sk = i.i_item_sk
  AND i.i_category IN ('Books')
  AND ss.ss_customer_sk IS NOT NULL
)
GROUP BY ss.ss_customer_sk
HAVING count(ss.ss_item_sk) > 5
ORDER BY cid;
{code}

{code}
16/05/05 14:50:03 WARN scheduler.TaskSetManager: Lost task 12.0 in stage 162.0 (TID 15153, node6): java.lang.IllegalArgumentException: Comparison method violates its general contract!
        at org.apache.spark.util.collection.TimSort$SortState.mergeLo(TimSort.java:794)
        at org.apache.spark.util.collection.TimSort$SortState.mergeAt(TimSort.java:525)
        at org.apache.spark.util.collection.TimSort$SortState.mergeCollapse(TimSort.java:453)
        at org.apache.spark.util.collection.TimSort$SortState.access$200(TimSort.java:325)
        at org.apache.spark.util.collection.TimSort.sort(TimSort.java:153)
        at org.apache.spark.util.collection.Sorter.sort(Sorter.scala:37)
        at org.apache.spark.util.collection.unsafe.sort.UnsafeInMemorySorter.getSortedIterator(UnsafeInMemorySorter.java:228)
        at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.spill(UnsafeExternalSorter.java:186)
        at org.apache.spark.memory.TaskMemoryManager.acquireExecutionMemory(TaskMemoryManager.java:175)
        at org.apache.spark.memory.TaskMemoryManager.allocatePage(TaskMemoryManager.java:249)
        at org.apache.spark.memory.MemoryConsumer.allocateArray(MemoryConsumer.java:83)
        at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.growPointerArrayIfNecessary(UnsafeExternalSorter.java:295)
        at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.insertRecord(UnsafeExternalSorter.java:330)
        at org.apache.spark.sql.execution.UnsafeExternalRowSorter.insertRow(UnsafeExternalRowSorter.java:91)
        at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:168)
        at org.apache.spark.sql.execution.Sort$$anonfun$1.apply(Sort.scala:90)
        at org.apache.spark.sql.execution.Sort$$anonfun$1.apply(Sort.scala:64)
        at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21.apply(RDD.scala:728)
        at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$21.apply(RDD.scala:728)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:88)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
        at org.apache.spark.scheduler.Task.run(Task.scala:89)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)
{code}
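
For reference, here is the workaround we applied, sketched against the Spark 1.6 SQLContext API. The partition count of 2000 is illustrative only (the default is 200; tune the value to your data volume), not a number taken from our actual run:

{code}
// Raise the number of shuffle partitions before running the query so that
// each sort task handles less data and is less likely to hit the large
// in-memory sort/spill path shown in the stack trace above.
// 2000 is an illustrative value; tune it to your data size.
sqlContext.setConf("spark.sql.shuffle.partitions", "2000")

// The same setting can also be passed at submit time:
//   spark-submit --conf spark.sql.shuffle.partitions=2000 ...
{code}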

> TimSort Comparison method violates its general contract with CLUSTER BY
> -----------------------------------------------------------------------
>
>                 Key: SPARK-8428
>                 URL: https://issues.apache.org/jira/browse/SPARK-8428
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 1.4.0
>         Environment: Oracle Java 7 
>            Reporter: Nathan McCarthy
>
> Running a SQL query that has a subquery and multiple left joins fails when there is a CLUSTER BY (which implies a sortBy). This gives the following stack trace:
> {code}
> Job aborted due to stage failure: Task 118 in stage 4.0 failed 4 times, most recent failure: Lost task 118.3 in stage 4.0 (TID 18392, node142): java.lang.IllegalArgumentException: Comparison method violates its general contract!
> 	at org.apache.spark.util.collection.TimSort$SortState.mergeHi(TimSort.java:900)
> 	at org.apache.spark.util.collection.TimSort$SortState.mergeAt(TimSort.java:509)
> 	at org.apache.spark.util.collection.TimSort$SortState.mergeCollapse(TimSort.java:435)
> 	at org.apache.spark.util.collection.TimSort$SortState.access$200(TimSort.java:307)
> 	at org.apache.spark.util.collection.TimSort.sort(TimSort.java:135)
> 	at org.apache.spark.util.collection.Sorter.sort(Sorter.scala:37)
> 	at org.apache.spark.util.collection.PartitionedPairBuffer.partitionedDestructiveSortedIterator(PartitionedPairBuffer.scala:70)
> 	at org.apache.spark.util.collection.ExternalSorter.partitionedIterator(ExternalSorter.scala:690)
> 	at org.apache.spark.util.collection.ExternalSorter.iterator(ExternalSorter.scala:708)
> 	at org.apache.spark.sql.execution.ExternalSort$$anonfun$doExecute$6$$anonfun$apply$7.apply(basicOperators.scala:222)
> 	at org.apache.spark.sql.execution.ExternalSort$$anonfun$doExecute$6$$anonfun$apply$7.apply(basicOperators.scala:218)
> 	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686)
> 	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686)
> 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
> 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
> 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
> 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
> 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
> 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
> 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
> 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
> 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
> 	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
> 	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
> 	at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
> 	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
> 	at org.apache.spark.scheduler.Task.run(Task.scala:70)
> 	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
> 	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
> 	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
> 	at java.lang.Thread.run(Thread.java:745)
> Driver stacktrace:
> {code}
> The query looks like:
> {code}
>  val df = sqlContext.sql("""SELECT CID
> |,	PW_END_DATE
> |,	PROD_NBR_KEY
> |,	SUM(CASE WHEN SUBST_IDX = 1 THEN L13W_SALE END) AS SUB1_L13W_SALE
> |FROM
> |(SELECT	BASE.CID
> |,	BASE.PW_END_DATE
> |,	BASE.PROD_NBR_KEY
> |,	SUBN.SUBST_IDX
> |,	CASE WHEN IDX.PW_END_DATE BETWEEN DATE_SUB(BASE.PW_END_DATE, 13*7 - 1) AND BASE.PW_END_DATE THEN IDX.TOT_AMT_INCLD_GST END AS L13W_SALE
> |FROM TESTBASE BASE
> |LEFT JOIN TABLX SUBN
> |ON BASE.PROD_NBR_KEY = SUBN.PROD_NBR_KEY AND SUBN.SUBST_IDX <= 3
> |LEFT JOIN TABLEF IDX
> |ON BASE.CRN = IDX.CRN
> |AND SUBN.CROSS_PROD_NBR = IDX.PROD_NBR_KEY
> |) SUBSPREM
> | GROUP BY CRN, PW_END_DATE, PROD_NBR_KEY""".stripMargin)
> {code}
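
Note for readers landing here via search: CLUSTER BY key is shorthand for DISTRIBUTE BY key followed by SORT BY key, which is why the query above introduces the per-partition sort that invokes TimSort. A minimal sketch against a hypothetical temp table t(k, v), not taken from the reporter's workload:

{code}
// CLUSTER BY k hash-partitions rows by k across reducers (DISTRIBUTE BY k)
// and then sorts rows within each partition (SORT BY k); the per-partition
// sort is where TimSort runs. Table t(k, v) is hypothetical.
val clustered = sqlContext.sql("SELECT k, v FROM t CLUSTER BY k")

// Equivalent explicit form:
val explicit = sqlContext.sql("SELECT k, v FROM t DISTRIBUTE BY k SORT BY k")
{code}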



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
