spark-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Marco Mistroni <mmistr...@gmail.com>
Subject Re: createDataFrame causing a strange error.
Date Sun, 27 Nov 2016 20:11:29 GMT
Hi

Pickle errors normally point to a serialisation issue. I suspect
something is wrong with your S3 data, but that is just a wild guess...

Is your s3 object publicly available?

A few suggestions to nail down the problem:

1 - Try to see if you can read your object from S3 using the boto3 library
'offline', meaning outside of any Spark code.

2 - Try to replace your distributedJsonRead: instead of reading from S3,
generate a string out of a snippet of your JSON object.

3 - Spark can read data from S3 as well; just do a
sc.textFile('s3://....') ==>
http://www.sparktutorials.net/reading-and-writing-s3-data-with-apache-spark.
Try to use Spark entirely to read and process the data, rather than going via
boto3. It adds extra complexity which you don't need.

If you send a snippet of your JSON content, then everyone on the list can
run the code and try to reproduce the problem.


hth

 Marco


On 27 Nov 2016 7:33 pm, "Andrew Holway" <andrew.holway@otternetworks.de>
wrote:

> I get a slightly different error when not specifying a schema:
>
> Traceback (most recent call last):
>   File "/home/centos/fun-functions/spark-parrallel-read-from-s3/tick.py",
> line 61, in <module>
>     df = sqlContext.createDataFrame(foo)
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/sql/context.py",
> line 299, in createDataFrame
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/sql/session.py",
> line 520, in createDataFrame
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/sql/session.py",
> line 360, in _createFromRDD
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/sql/session.py",
> line 331, in _inferSchema
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/rdd.py",
> line 1328, in first
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/rdd.py",
> line 1310, in take
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/context.py",
> line 941, in runJob
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/rdd.py",
> line 2403, in _jrdd
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/rdd.py",
> line 2336, in _wrap_function
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/rdd.py",
> line 2315, in _prepare_for_python_RDD
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/serializers.py",
> line 428, in dumps
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
> line 657, in dumps
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
> line 107, in dump
>   File "/usr/lib64/python2.7/pickle.py", line 224, in dump
>     self.save(obj)
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/lib64/python2.7/pickle.py", line 562, in save_tuple
>     save(element)
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
> line 204, in save_function
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
> line 241, in save_function_tuple
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/lib64/python2.7/pickle.py", line 548, in save_tuple
>     save(element)
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/lib64/python2.7/pickle.py", line 600, in save_list
>     self._batch_appends(iter(obj))
>   File "/usr/lib64/python2.7/pickle.py", line 633, in _batch_appends
>     save(x)
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
> line 204, in save_function
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
> line 241, in save_function_tuple
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/lib64/python2.7/pickle.py", line 548, in save_tuple
>     save(element)
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/lib64/python2.7/pickle.py", line 600, in save_list
>     self._batch_appends(iter(obj))
>   File "/usr/lib64/python2.7/pickle.py", line 633, in _batch_appends
>     save(x)
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
> line 204, in save_function
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
> line 241, in save_function_tuple
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/lib64/python2.7/pickle.py", line 548, in save_tuple
>     save(element)
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/lib64/python2.7/pickle.py", line 600, in save_list
>     self._batch_appends(iter(obj))
>   File "/usr/lib64/python2.7/pickle.py", line 636, in _batch_appends
>     save(tmp[0])
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
> line 198, in save_function
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
> line 246, in save_function_tuple
>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>     f(self, obj) # Call unbound method with explicit self
>   File "/usr/lib64/python2.7/pickle.py", line 649, in save_dict
>     self._batch_setitems(obj.iteritems())
>   File "/usr/lib64/python2.7/pickle.py", line 681, in _batch_setitems
>     save(v)
>   File "/usr/lib64/python2.7/pickle.py", line 306, in save
>     rv = reduce(self.proto)
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py",
> line 933, in __call__
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/sql/utils.py",
> line 63, in deco
>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py",
> line 316, in get_return_value
> py4j.protocol.Py4JError: An error occurred while calling
> o33.__getnewargs__. Trace:
> py4j.Py4JException: Method __getnewargs__([]) does not exist
> at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
> at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
> at py4j.Gateway.invoke(Gateway.java:272)
> at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
> at py4j.commands.CallCommand.execute(CallCommand.java:79)
> at py4j.GatewayConnection.run(GatewayConnection.java:211)
> at java.lang.Thread.run(Thread.java:745)
>
>
> On Sun, Nov 27, 2016 at 8:32 PM, Andrew Holway <
> andrew.holway@otternetworks.de> wrote:
>
>> Hi,
>>
>> Can anyone tell me what is causing this error
>> Spark 2.0.0
>> Python 2.7.5
>>
>> df = sqlContext.createDataFrame(foo, schema)
>> https://gist.github.com/mooperd/368e3453c29694c8b2c038d6b7b4413a
>>
>> Traceback (most recent call last):
>>   File "/home/centos/fun-functions/spark-parrallel-read-from-s3/tick.py",
>> line 61, in <module>
>>     df = sqlContext.createDataFrame(foo, schema)
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/sql/context.py",
>> line 299, in createDataFrame
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/sql/session.py",
>> line 523, in createDataFrame
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/rdd.py",
>> line 2220, in _to_java_object_rdd
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/rdd.py",
>> line 2403, in _jrdd
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/rdd.py",
>> line 2336, in _wrap_function
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/rdd.py",
>> line 2315, in _prepare_for_python_RDD
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/serializers.py",
>> line 428, in dumps
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
>> line 657, in dumps
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
>> line 107, in dump
>>   File "/usr/lib64/python2.7/pickle.py", line 224, in dump
>>     self.save(obj)
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/lib64/python2.7/pickle.py", line 562, in save_tuple
>>     save(element)
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
>> line 204, in save_function
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
>> line 241, in save_function_tuple
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/lib64/python2.7/pickle.py", line 548, in save_tuple
>>     save(element)
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/lib64/python2.7/pickle.py", line 600, in save_list
>>     self._batch_appends(iter(obj))
>>   File "/usr/lib64/python2.7/pickle.py", line 633, in _batch_appends
>>     save(x)
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
>> line 204, in save_function
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
>> line 241, in save_function_tuple
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/lib64/python2.7/pickle.py", line 548, in save_tuple
>>     save(element)
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/lib64/python2.7/pickle.py", line 600, in save_list
>>     self._batch_appends(iter(obj))
>>   File "/usr/lib64/python2.7/pickle.py", line 633, in _batch_appends
>>     save(x)
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
>> line 204, in save_function
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
>> line 241, in save_function_tuple
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/lib64/python2.7/pickle.py", line 548, in save_tuple
>>     save(element)
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/lib64/python2.7/pickle.py", line 600, in save_list
>>     self._batch_appends(iter(obj))
>>   File "/usr/lib64/python2.7/pickle.py", line 636, in _batch_appends
>>     save(tmp[0])
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
>> line 198, in save_function
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/cloudpickle.py",
>> line 246, in save_function_tuple
>>   File "/usr/lib64/python2.7/pickle.py", line 286, in save
>>     f(self, obj) # Call unbound method with explicit self
>>   File "/usr/lib64/python2.7/pickle.py", line 649, in save_dict
>>     self._batch_setitems(obj.iteritems())
>>   File "/usr/lib64/python2.7/pickle.py", line 681, in _batch_setitems
>>     save(v)
>>   File "/usr/lib64/python2.7/pickle.py", line 306, in save
>>     rv = reduce(self.proto)
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py",
>> line 933, in __call__
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/pyspark.zip/pyspark/sql/utils.py",
>> line 63, in deco
>>   File "/usr/hdp/2.5.0.0-1245/spark2/python/lib/py4j-0.10.1-src.zip/py4j/protocol.py",
>> line 316, in get_return_value
>> py4j.protocol.Py4JError: An error occurred while calling
>> o33.__getnewargs__. Trace:
>> py4j.Py4JException: Method __getnewargs__([]) does not exist
>> at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
>> at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
>> at py4j.Gateway.invoke(Gateway.java:272)
>> at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:128)
>> at py4j.commands.CallCommand.execute(CallCommand.java:79)
>> at py4j.GatewayConnection.run(GatewayConnection.java:211)
>> at java.lang.Thread.run(Thread.java:745)
>>
>>
>>
>> --
>> Otter Networks UG
>> http://otternetworks.de
>> Gotenstraße 17
>> 10829 Berlin
>>
>
>
>
> --
> Otter Networks UG
> http://otternetworks.de
> Gotenstraße 17
> 10829 Berlin
>

Mime
View raw message