spark-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Chetan Khatri <chetan.opensou...@gmail.com>
Subject Re: About saving DataFrame to Hive 1.2.1 with Spark 2.0.1
Date Tue, 17 Jan 2017 05:00:12 GMT
Hello Spark Folks,

Another weird experience I have had with Spark's SqlContext: when I create a
DataFrame, sometimes this exception is thrown and sometimes it is not!

scala> import sqlContext.implicits._
import sqlContext.implicits._

scala> val stdDf = sqlContext.createDataFrame(rowRDD,empSchema.struct);
17/01/17 10:27:15 ERROR metastore.RetryingHMSHandler:
AlreadyExistsException(message:Database default already exists)
at
org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.create_database(HiveMetaStore.java:891)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:107)
at com.sun.proxy.$Proxy21.create_database(Unknown Source)
at
org.apache.hadoop.hive.metastore.HiveMetaStoreClient.createDatabase(HiveMetaStoreClient.java:644)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:156)
at com.sun.proxy.$Proxy22.createDatabase(Unknown Source)
at org.apache.hadoop.hive.ql.metadata.Hive.createDatabase(Hive.java:306)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createDatabase$1.apply$mcV$sp(HiveClientImpl.scala:309)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createDatabase$1.apply(HiveClientImpl.scala:309)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$createDatabase$1.apply(HiveClientImpl.scala:309)
at
org.apache.spark.sql.hive.client.HiveClientImpl$$anonfun$withHiveState$1.apply(HiveClientImpl.scala:280)
at
org.apache.spark.sql.hive.client.HiveClientImpl.liftedTree1$1(HiveClientImpl.scala:227)
at
org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:226)
at
org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:269)
at
org.apache.spark.sql.hive.client.HiveClientImpl.createDatabase(HiveClientImpl.scala:308)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createDatabase$1.apply$mcV$sp(HiveExternalCatalog.scala:99)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createDatabase$1.apply(HiveExternalCatalog.scala:99)
at
org.apache.spark.sql.hive.HiveExternalCatalog$$anonfun$createDatabase$1.apply(HiveExternalCatalog.scala:99)
at
org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:72)
at
org.apache.spark.sql.hive.HiveExternalCatalog.createDatabase(HiveExternalCatalog.scala:98)
at
org.apache.spark.sql.catalyst.catalog.SessionCatalog.createDatabase(SessionCatalog.scala:147)
at
org.apache.spark.sql.catalyst.catalog.SessionCatalog.<init>(SessionCatalog.scala:89)
at
org.apache.spark.sql.hive.HiveSessionCatalog.<init>(HiveSessionCatalog.scala:51)
at
org.apache.spark.sql.hive.HiveSessionState.catalog$lzycompute(HiveSessionState.scala:49)
at
org.apache.spark.sql.hive.HiveSessionState.catalog(HiveSessionState.scala:48)
at
org.apache.spark.sql.hive.HiveSessionState$$anon$1.<init>(HiveSessionState.scala:63)
at
org.apache.spark.sql.hive.HiveSessionState.analyzer$lzycompute(HiveSessionState.scala:63)
at
org.apache.spark.sql.hive.HiveSessionState.analyzer(HiveSessionState.scala:62)
at
org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:49)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:64)
at org.apache.spark.sql.SparkSession.createDataFrame(SparkSession.scala:542)
at org.apache.spark.sql.SparkSession.createDataFrame(SparkSession.scala:302)
at org.apache.spark.sql.SQLContext.createDataFrame(SQLContext.scala:337)
at
$line28.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:43)
at
$line28.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:48)
at
$line28.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:50)
at $line28.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:52)
at $line28.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:54)
at $line28.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:56)
at $line28.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:58)
at $line28.$read$$iw$$iw$$iw$$iw$$iw.<init>(<console>:60)
at $line28.$read$$iw$$iw$$iw$$iw.<init>(<console>:62)
at $line28.$read$$iw$$iw$$iw.<init>(<console>:64)
at $line28.$read$$iw$$iw.<init>(<console>:66)
at $line28.$read$$iw.<init>(<console>:68)
at $line28.$read.<init>(<console>:70)
at $line28.$read$.<init>(<console>:74)
at $line28.$read$.<clinit>(<console>)
at $line28.$eval$.$print$lzycompute(<console>:7)
at $line28.$eval$.$print(<console>:6)
at $line28.$eval.$print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)
at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1047)
at
scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:638)
at
scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:637)
at
scala.reflect.internal.util.ScalaClassLoader$class.asContext(ScalaClassLoader.scala:31)
at
scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:19)
at
scala.tools.nsc.interpreter.IMain$WrappedRequest.loadAndRunReq(IMain.scala:637)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:569)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:565)
at scala.tools.nsc.interpreter.ILoop.interpretStartingWith(ILoop.scala:807)
at scala.tools.nsc.interpreter.ILoop.command(ILoop.scala:681)
at scala.tools.nsc.interpreter.ILoop.processLine(ILoop.scala:395)
at scala.tools.nsc.interpreter.ILoop.loop(ILoop.scala:415)
at
scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.apply$mcZ$sp(ILoop.scala:923)
at
scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.apply(ILoop.scala:909)
at
scala.tools.nsc.interpreter.ILoop$$anonfun$process$1.apply(ILoop.scala:909)
at
scala.reflect.internal.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:97)
at scala.tools.nsc.interpreter.ILoop.process(ILoop.scala:909)
at org.apache.spark.repl.Main$.doMain(Main.scala:68)
at org.apache.spark.repl.Main$.main(Main.scala:51)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:736)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:185)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:210)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:124)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)

stdDf: org.apache.spark.sql.DataFrame = [stid: string, name: string ... 3
more fields]

And again, the same code works without any exception:

scala> import sqlContext.implicits._
import sqlContext.implicits._

scala> val stdDf = sqlContext.createDataFrame(rowRDD,empSchema.struct);
stdDf: org.apache.spark.sql.DataFrame = [stid: string, name: string ... 3
more fields]


Thanks.


On Tue, Jan 17, 2017 at 12:48 AM, Chetan Khatri <chetan.opensource@gmail.com
> wrote:

> Hello Community,
>
> I am struggling to save Dataframe to Hive Table,
>
> Versions:
>
> Hive 1.2.1
> Spark 2.0.1
>
> *Working code:*
>
> /*
> @Author: Chetan Khatri
> /* @Author: Chetan Khatri Description: This Scala script has written for
> HBase to Hive module, which reads table from HBase and dump it out to Hive
> */ import it.nerdammer.spark.hbase._ import org.apache.spark.sql.Row import
> org.apache.spark.sql.types.StructType import org.apache.spark.sql.types.StructField
> import org.apache.spark.sql.types.StringType import org.apache.spark.sql.SparkSession
> // Approach 1: // Read HBase Table val hBaseRDD =
> sc.hbaseTable[(Option[String], Option[String], Option[String],
> Option[String], Option[String])]("university").select("stid",
> "name","subject","grade","city").inColumnFamily("emp") // Iterate
> HBaseRDD and generate RDD[Row] val rowRDD = hBaseRDD.map(i =>
> Row(i._1.get,i._2.get,i._3.get,i._4.get,i._5.get)) // Create sqlContext
> for createDataFrame method val sqlContext = new org.apache.spark.sql.SQLContext(sc)
> // Create Schema Structure object empSchema { val stid =
> StructField("stid", StringType) val name = StructField("name", StringType)
> val subject = StructField("subject", StringType) val grade =
> StructField("grade", StringType) val city = StructField("city", StringType)
> val struct = StructType(Array(stid, name, subject, grade, city)) } import
> sqlContext.implicits._ // Create DataFrame with rowRDD and Schema structure
> val stdDf = sqlContext.createDataFrame(rowRDD,empSchema.struct); //
> Importing Hive import org.apache.spark.sql.hive // Enable Hive with Hive
> warehouse in SparkSession val spark = SparkSession.builder().appName("Spark
> Hive Example").config("spark.sql.warehouse.dir",
> "/usr/local/hive/warehouse/").enableHiveSupport().getOrCreate() // Saving
> Dataframe to Hive Table Successfully. stdDf.write.mode("append").saveAsTable("employee")
> // Approach 2 : Where error comes import spark.implicits._ import spark.sql
> sql("use default") sql("create table employee(stid STRING, name STRING,
> subject STRING, grade STRING, city STRING)") scala> sql("show
> TABLES").show() +---------+-----------+ |tableName|isTemporary|
> +---------+-----------+ | employee| false| +---------+-----------+
> stdDf.write.mode("append").saveAsTable("employee") ERROR Exception:
> org.apache.spark.sql.AnalysisException: Saving data in MetastoreRelation
> default, employee is not supported.; at org.apache.spark.sql.
> execution.command.CreateDataSourceTableAsSelectCommand.run(
> createDataSourceTables.scala:221) at org.apache.spark.sql.
> execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
> at org.apache.spark.sql.execution.command.ExecutedCommandExec.
> sideEffectResult(commands.scala:56) at org.apache.spark.sql.
> execution.command.ExecutedCommandExec.doExecute(commands.scala:74) at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$
> execute$1.apply(SparkPlan.scala:115) at org.apache.spark.sql.
> execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:115) at
> org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:136)
> at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:133)
> at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:114)
> at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:86)
> at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:86)
> at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:378)
> at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:354)
> ... 56 elided
>
> Questions: In Approach 1, it stores data even though the Hive table was
> not previously created — when I call saveAsTable it automatically creates
> the table for me, and on subsequent calls it appends data into it. How can
> I store data into previously created tables?
> It also gives the warning WARN metastore.HiveMetaStore: Location:
> file:/usr/local/spark/spark-warehouse/employee specified for non-external
> table:employee — but I have already provided the HiveMetaStore path, so why
> is it storing the table in Spark's warehouse metastore?
>
> Hive-setup done with reference to: http://mitu.co.in/wp-
> content/uploads/2015/12/Hive-Installation-on-Ubuntu-14.04-
> and-Hadoop-2.6.3.pdf and it's working well. I cannot change the Hive
> version; it must be 1.2.1.
>
> Thank you.
>
>
>
>
>

Mime
View raw message