spark-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Anwar AliKhan <anwaralikhan...@gmail.com>
Subject Re: Hey good looking toPandas () error stack
Date Sat, 20 Jun 2020 10:17:08 GMT
Two versions of Spark running against the same code:

https://towardsdatascience.com/your-first-apache-spark-ml-model-d2bb82b599dd

Version spark-2.4.6-bin-hadoop2.7 produces an error for toPandas(). See the
error stack below.

Jupyter Notebook

import findspark

findspark.init('/home/spark-3.0.0-bin-hadoop2.7')

cell "spark"

cell output

SparkSession - in-memory

SparkContext

Spark UI

Version

v3.0.0

Master

local[*]

AppName

Titanic Data


import findspark

findspark.init('/home/spark-2.4.6-bin-hadoop2.7')

cell  "spark"



cell output

SparkSession - in-memory

SparkContext

Spark UI

Version

v2.4.6

Master

local[*]

AppName

Titanic Data

cell "df.show(5)"

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+

|PassengerId|Survived|Pclass|                Name|   Sex|Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+

|          1|       0|     3|Braund, Mr. Owen ...|  male| 22|    1|    0|       A/5 21171|   7.25| null|       S|

|          2|       1|     1|Cumings, Mrs. Joh...|female| 38|    1|    0|        PC 17599|71.2833|  C85|       C|

|          3|       1|     3|Heikkinen, Miss. ...|female| 26|    0|    0|STON/O2. 3101282|  7.925| null|       S|

|          4|       1|     1|Futrelle, Mrs. Ja...|female| 35|    1|    0|          113803|   53.1| C123|       S|

|          5|       0|     3|Allen, Mr. Willia...|  male| 35|    0|    0|          373450|   8.05| null|       S|

+-----------+--------+------+--------------------+------+---+-----+-----+----------------+-------+-----+--------+

only showing top 5 rows

cell "df.toPandas()"

cell output

---------------------------------------------------------------------------

Py4JJavaError                             Traceback (most recent call last)

/home/spark-2.4.6-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)

     62         try:

---> 63             return f(*a, **kw)

     64         except py4j.protocol.Py4JJavaError as e:

/home/spark-2.4.6-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py
in get_return_value(answer, gateway_client, target_id, name)

    327                     "An error occurred while calling {0}{1}{2}.\n".

--> 328                     format(target_id, ".", name), value)

    329             else:

Py4JJavaError: An error occurred while calling o33.collectToPython.

: java.lang.IllegalArgumentException: Unsupported class file major version 55

    at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:166)

    at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:148)

    at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:136)

    at org.apache.xbean.asm6.ClassReader.<init>(ClassReader.java:237)

    at
org.apache.spark.util.ClosureCleaner$.getClassReader(ClosureCleaner.scala:50)

    at
org.apache.spark.util.FieldAccessFinder$$anon$4$$anonfun$visitMethodInsn$7.apply(ClosureCleaner.scala:845)

    at
org.apache.spark.util.FieldAccessFinder$$anon$4$$anonfun$visitMethodInsn$7.apply(ClosureCleaner.scala:828)

    at
scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:733)

    at
scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:134)

    at
scala.collection.mutable.HashMap$$anon$1$$anonfun$foreach$2.apply(HashMap.scala:134)

    at
scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:236)

    at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:40)

    at scala.collection.mutable.HashMap$$anon$1.foreach(HashMap.scala:134)

    at
scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:732)

    at
org.apache.spark.util.FieldAccessFinder$$anon$4.visitMethodInsn(ClosureCleaner.scala:828)

    at org.apache.xbean.asm6.ClassReader.readCode(ClassReader.java:2175)

    at org.apache.xbean.asm6.ClassReader.readMethod(ClassReader.java:1238)

    at org.apache.xbean.asm6.ClassReader.accept(ClassReader.java:631)

    at org.apache.xbean.asm6.ClassReader.accept(ClassReader.java:355)

    at
org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:272)

    at
org.apache.spark.util.ClosureCleaner$$anonfun$org$apache$spark$util$ClosureCleaner$$clean$14.apply(ClosureCleaner.scala:271)

    at scala.collection.immutable.List.foreach(List.scala:392)

    at
org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:271)

    at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:163)

    at org.apache.spark.SparkContext.clean(SparkContext.scala:2326)

    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2100)

    at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)

    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)

    at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)

    at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)

    at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)

    at org.apache.spark.rdd.RDD.collect(RDD.scala:989)

    at
org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)

    at
org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)

    at
org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)

    at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)

    at
org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)

    at
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)

    at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)

    at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)

    at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)

    at
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native
Method)

    at
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)

    at
java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)

    at java.base/java.lang.reflect.Method.invoke(Method.java:566)

    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)

    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)

    at py4j.Gateway.invoke(Gateway.java:282)

    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)

    at py4j.commands.CallCommand.execute(CallCommand.java:79)

    at py4j.GatewayConnection.run(GatewayConnection.java:238)

    at java.base/java.lang.Thread.run(Thread.java:834)


During handling of the above exception, another exception occurred:

IllegalArgumentException                  Traceback (most recent call last)

<ipython-input-10-a516097529d7> in <module>

----> 1 df.toPandas()

/home/spark-2.4.6-bin-hadoop2.7/python/pyspark/sql/dataframe.py in
toPandas(self)

   2153

   2154         # Below is toPandas without Arrow optimization.

-> 2155         pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)

   2156         column_counter = Counter(self.columns)

   2157

/home/spark-2.4.6-bin-hadoop2.7/python/pyspark/sql/dataframe.py in
collect(self)

    533         """

    534         with SCCallSiteSync(self._sc) as css:

--> 535             sock_info = self._jdf.collectToPython()

    536         return list(_load_from_socket(sock_info,
BatchedSerializer(PickleSerializer())))

    537

/home/spark-2.4.6-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py
in __call__(self, *args)

   1255         answer = self.gateway_client.send_command(command)

   1256         return_value = get_return_value(

-> 1257             answer, self.gateway_client, self.target_id, self.name)

   1258

   1259         for temp_arg in temp_args:

/home/spark-2.4.6-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a,
**kw)

     77                 raise QueryExecutionException(s.split(': ', 1)[1],
stackTrace)

     78             if s.startswith('java.lang.IllegalArgumentException: '):

---> 79                 raise IllegalArgumentException(s.split(': ', 1)[1],
stackTrace)

     80             raise

     81     return deco

IllegalArgumentException: 'Unsupported class file major version 55'


On Fri, 19 Jun 2020, 08:06 Stephen Boesch, <javadba@gmail.com> wrote:

> AFAIK it has been there since Spark 2.0 in 2015. Not certain about
> Spark 1.5/1.6.
>
> On Thu, 18 Jun 2020 at 23:56, Anwar AliKhan <anwaralikhanuae@gmail.com>
> wrote:
>
>> I first ran the  command
>> df.show()
>>
>> For sanity check of my dataFrame.
>>
>> I wasn't impressed with the display.
>>
>> I then ran
>> df.toPandas() in Jupyter Notebook.
>>
>> Now the display is really good-looking.
>>
>> Is toPandas() a new function which became available in Spark 3.0?
>>
>>
>>
>>
>>
>>

Mime
View raw message