spark-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Chetan Khatri <chetan.opensou...@gmail.com>
Subject Re: Error: at sqlContext.createDataFrame with RDD and Schema
Date Wed, 28 Dec 2016 16:02:12 GMT
Resolved above error by creating SparkSession

val spark = SparkSession.builder().appName("Hbase - Spark
POC").getOrCreate()

Error after:

spark.sql("SELECT * FROM student").show()

But performing the show() action on the DataFrame throws the error below:

scala> sqlContext.sql("select * from student").show()
16/12/28 21:04:23 ERROR executor.Executor: Exception in task 0.0 in stage
2.0 (TID 2)
java.lang.RuntimeException: Error while encoding:
java.lang.RuntimeException: java.lang.Integer is not a valid external type
for schema of string
if (assertnotnull(input[0, org.apache.spark.sql.Row, true], top level row
object).isNullAt) null else staticinvoke(class
org.apache.spark.unsafe.types.UTF8String, StringType, fromString,
validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 0, Rowid),
StringType), true) AS Rowid#35
+- if (assertnotnull(input[0, org.apache.spark.sql.Row, true], top level
row object).isNullAt) null else staticinvoke(class
org.apache.spark.unsafe.types.UTF8String, StringType, fromString,
validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 0, Rowid),
StringType), true)
   :- assertnotnull(input[0, org.apache.spark.sql.Row, true], top level row
object).isNullAt
   :  :- assertnotnull(input[0, org.apache.spark.sql.Row, true], top level
row object)
   :  :  +- input[0, org.apache.spark.sql.Row, true]
   :  +- 0
   :- null
   +- staticinvoke(class org.apache.spark.unsafe.types.UTF8String,
StringType, fromString,
validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 0, Rowid),
StringType), true)
      +- validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 0, Rowid),
StringType)
         +- getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 0, Rowid)
            +- assertnotnull(input[0, org.apache.spark.sql.Row, true], top
level row object)
               +- input[0, org.apache.spark.sql.Row, true]

if (assertnotnull(input[0, org.apache.spark.sql.Row, true], top level row
object).isNullAt) null else staticinvoke(class
org.apache.spark.unsafe.types.UTF8String, StringType, fromString,
validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 1, maths),
StringType), true) AS maths#36
+- if (assertnotnull(input[0, org.apache.spark.sql.Row, true], top level
row object).isNullAt) null else staticinvoke(class
org.apache.spark.unsafe.types.UTF8String, StringType, fromString,
validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 1, maths),
StringType), true)
   :- assertnotnull(input[0, org.apache.spark.sql.Row, true], top level row
object).isNullAt
   :  :- assertnotnull(input[0, org.apache.spark.sql.Row, true], top level
row object)
   :  :  +- input[0, org.apache.spark.sql.Row, true]
   :  +- 1
   :- null
   +- staticinvoke(class org.apache.spark.unsafe.types.UTF8String,
StringType, fromString,
validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 1, maths),
StringType), true)
      +- validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 1, maths),
StringType)
         +- getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 1, maths)
            +- assertnotnull(input[0, org.apache.spark.sql.Row, true], top
level row object)
               +- input[0, org.apache.spark.sql.Row, true]

if (assertnotnull(input[0, org.apache.spark.sql.Row, true], top level row
object).isNullAt) null else staticinvoke(class
org.apache.spark.unsafe.types.UTF8String, StringType, fromString,
validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 2, english),
StringType), true) AS english#37
+- if (assertnotnull(input[0, org.apache.spark.sql.Row, true], top level
row object).isNullAt) null else staticinvoke(class
org.apache.spark.unsafe.types.UTF8String, StringType, fromString,
validateexternaltype(getexternalrowfield(assertnotnull(input[0,
org.apache.spark.sql.Row, true], top level row object), 2, english),
StringType), true)

Kindly help — I am unable to work out from this error what exactly is wrong.

Thanks.,


On Wed, Dec 28, 2016 at 9:00 PM, Chetan Khatri <chetan.opensource@gmail.com>
wrote:

> Hello Spark Community,
>
> I am reading an HBase table from Spark and getting an RDD, but now I want to
> convert that RDD of Spark Rows into a DataFrame.
>
> *Source Code:*
>
> bin/spark-shell --packages it.nerdammer.bigdata:spark-hbase-connector_2.10:1.0.3
> --conf spark.hbase.host=127.0.0.1
>
> import it.nerdammer.spark.hbase._
> import org.apache.spark.{SparkConf, SparkContext}
> import org.apache.spark.sql.Row
> import org.apache.spark.sql.types.StructType
> import org.apache.spark.sql.types.StructField
> import org.apache.spark.sql.types.StringType
>
> val sparkConf = new SparkConf().setAppName("HBase Spark POC")
>
> val sparkContext = new SparkContext(sparkConf)
>
> val sqlContext = new org.apache.spark.sql.SQLContext(sparkContext)
>
> val hBaseRDD = sc.hbaseTable[(Option[String], Option[Int], Option[Int],
> Option[Int], Option[Int], Option[Int])]("university").select("maths",
> "english","science","history","computer").inColumnFamily("school")
>
> val rowRDD = hBaseRDD.map(i => Row(i._1.get,i._2.get,i._3.
> get,i._4.get,i._5.get,i._6.get))
>
> val stdSchemaString= "Rowid,maths,english,science,history,computer"
>
> val stdSchema= StructType(stdSchemaString.split(",").map(fieldName =>
> StructField(fieldName, StringType, true)))
>
> val stdDf = sqlContext.createDataFrame(rowRDD,stdSchema);
>
> // Getting Error
>
> stdDf.registerTempTable("student")
>
> sqlContext.sql("select * from student").show()
>
> *Error*
>
> scala> val stdDf = sqlContext.createDataFrame(rowRDD,stdSchema);
> 16/12/28 20:50:59 ERROR metastore.RetryingHMSHandler:
> AlreadyExistsException(message:Database default already exists)
> at org.apache.hadoop.hive.metastore.HiveMetaStore$
> HMSHandler.create_database(HiveMetaStore.java:891)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(
> NativeMethodAccessorImpl.java:62)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(
> DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498)
> at org.apache.hadoop.hive.metastore.RetryingHMSHandler.
> invoke(RetryingHMSHandler.java:107)
> at com.sun.proxy.$Proxy21.create_database(Unknown Source)
> at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.createDatabase(
> HiveMetaStoreClient.java:644)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(
> NativeMethodAccessorImpl.java:62)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(
> DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498)
> at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(
> RetryingMetaStoreClient.java:156)
> at com.sun.proxy.$Proxy22.createDatabase(Unknown Source)
> at org.apache.hadoop.hive.ql.metadata.Hive.createDatabase(Hive.java:306)
> at org.apache.spark.sql.hive.client.HiveClientImpl$$
> anonfun$createDatabase$1.apply$mcV$sp(HiveClientImpl.scala:309)
> at org.apache.spark.sql.hive.client.HiveClientImpl$$
> anonfun$createDatabase$1.apply(HiveClientImpl.scala:309)
> at org.apache.spark.sql.hive.client.HiveClientImpl$$
> anonfun$createDatabase$1.apply(HiveClientImpl.scala:309)
> at org.apache.spark.sql.hive.client.HiveClientImpl$$
> anonfun$withHiveState$1.apply(HiveClientImpl.scala:280)
> at org.apache.spark.sql.hive.client.HiveClientImpl.
> liftedTree1$1(HiveClientImpl.scala:227)
> at org.apache.spark.sql.hive.client.HiveClientImpl.
> retryLocked(HiveClientImpl.scala:226)
> at org.apache.spark.sql.hive.client.HiveClientImpl.
> withHiveState(HiveClientImpl.scala:269)
> at org.apache.spark.sql.hive.client.HiveClientImpl.
> createDatabase(HiveClientImpl.scala:308)
> at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$
> createDatabase$1.apply$mcV$sp(HiveExternalCatalog.scala:99)
> at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$
> createDatabase$1.apply(HiveExternalCatalog.scala:99)
> at org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$
> createDatabase$1.apply(HiveExternalCatalog.scala:99)
> at org.apache.spark.sql.hive.HiveExternalCatalog.withClient(
> HiveExternalCatalog.scala:72)
> at org.apache.spark.sql.hive.HiveExternalCatalog.createDatabase(
> HiveExternalCatalog.scala:98)
> at org.apache.spark.sql.catalyst.catalog.SessionCatalog.
> createDatabase(SessionCatalog.scala:147)
> at org.apache.spark.sql.catalyst.catalog.SessionCatalog.<init>(
> SessionCatalog.scala:89)
> at org.apache.spark.sql.hive.HiveSessionCatalog.<init>(
> HiveSessionCatalog.scala:51)
> at org.apache.spark.sql.hive.HiveSessionState.catalog$
> lzycompute(HiveSessionState.scala:49)
> at org.apache.spark.sql.hive.HiveSessionState.catalog(
> HiveSessionState.scala:48)
> at org.apache.spark.sql.hive.HiveSessionState$$anon$1.<
> init>(HiveSessionState.scala:63)
> at org.apache.spark.sql.hive.HiveSessionState.analyzer$
> lzycompute(HiveSessionState.scala:63)
> at org.apache.spark.sql.hive.HiveSessionState.analyzer(
> HiveSessionState.scala:62)
> at org.apache.spark.sql.execution.QueryExecution.
> assertAnalyzed(QueryExecution.scala:49)
> at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
> at org.apache.spark.sql.SparkSession.createDataFrame(
> SparkSession.scala:542)
> at org.apache.spark.sql.SparkSession.createDataFrame(
> SparkSession.scala:302)
> at org.apache.spark.sql.SQLContext.createDataFrame(SQLContext.scala:337)
> at $line34.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>
> (<console>:42)
> at $line34.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:47)
> at $line34.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:49)
> at $line34.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:51)
> at $line34.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:53)
> at $line34.$read$$iw$$iw$$iw$$iw$$iw.<init>(<console>:55)
> at $line34.$read$$iw$$iw$$iw$$iw.<init>(<console>:57)
> at $line34.$read$$iw$$iw$$iw.<init>(<console>:59)
> at $line34.$read$$iw$$iw.<init>(<console>:61)
> at $line34.$read$$iw.<init>(<console>:63)
> at $line34.$read.<init>(<console>:65)
> at $line34.$read$.<init>(<console>:69)
> at $line34.$read$.<clinit>(<console>)
> at $line34.$eval$.$print$lzycompute(<console>:7)
> at $line34.$eval$.$print(<console>:6)
> at $line34.$eval.$print(<console>)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(
> NativeMethodAccessorImpl.java:62)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(
> DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498)
> at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)
> at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1047)
> at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$
> loadAndRunReq$1.apply(IMain.scala:638)
> at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$
> loadAndRunReq$1.apply(IMain.scala:637)
> at scala.reflect.internal.util.ScalaClassLoader$class.
> asContext(ScalaClassLoader.scala:31)
> at scala.reflect.internal.util.AbstractFileClassLoader.asContext(
> AbstractFileClassLoader.scala:19)
> at scala.tools.nsc.interpreter.IMain$WrappedRequest.
> loadAndRunReq(IMain.scala:637)
> at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:569)
> at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:565)
> at scala.tools.nsc.interpreter.ILoop.interpretStartingWith(
> ILoop.scala:807)
> at scala.tools.nsc.interpreter.ILoop.command(ILoop.scala:681)
> at scala.tools.nsc.interpreter.ILoop.processLine(ILoop.scala:395)
> at scala.tools.nsc.interpreter.ILoop.loop(ILoop.scala:415)
> at scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.
> apply$mcZ$sp(ILoop.scala:923)
> at scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.
> apply(ILoop.scala:909)
> at scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.
> apply(ILoop.scala:909)
> at scala.reflect.internal.util.ScalaClassLoader$.savingContextLoader(
> ScalaClassLoader.scala:97)
> at scala.tools.nsc.interpreter.ILoop.process(ILoop.scala:909)
> at org.apache.spark.repl.Main$.doMain(Main.scala:68)
> at org.apache.spark.repl.Main$.main(Main.scala:51)
> at org.apache.spark.repl.Main.main(Main.scala)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(
> NativeMethodAccessorImpl.java:62)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(
> DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498)
> at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$
> deploy$SparkSubmit$$runMain(SparkSubmit.scala:736)
> at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:185)
> at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:210)
> at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:124)
> at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
>
> stdDf: org.apache.spark.sql.DataFrame = [Rowid: string, maths: string ...
> 4 more fields]
>
> What would be the resolution?
>
> Thanks,
> Chetan
>
>
>
>
>

Mime
View raw message