spark-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Ajay Chander <itsche...@gmail.com>
Subject Code review / sqlContext Scope
Date Sat, 08 Oct 2016 17:17:26 GMT
Hi Everyone,

Can anyone tell me if there is anything wrong with my code flow below?
Based on each element from the text file I would like to run a query
against Hive table and persist results in another Hive table. I want to do
this in parallel for each element in the file. I appreciate any of your
inputs on this.

$ cat /home/ajay/flds.txt
PHARMY_NPI_ID
ALT_SUPPLIER_STORE_NBR
MAIL_SERV_NBR

spark-shell  --name hivePersistTest --master yarn --deploy-mode client

val dataElementsFile = "/home/ajay/flds.txt"
// Read every line eagerly, then close the handle: Source.fromFile leaves the
// underlying file open otherwise (the original never closed it).
val dataElements: Array[String] = {
  val src = Source.fromFile(dataElementsFile)
  try src.getLines.toArray
  finally src.close()
}

/** Builds the per-element aggregation over Hive table SPRINT2_TEST2.
  *
  * @param de name of the column to aggregate by (also emitted as a literal tag)
  * @return the un-executed DataFrame for the aggregation query
  *
  * NOTE(review): `de` is spliced straight into the SQL text. Column names cannot
  * be bound as parameters, so validate `de` against a known whitelist of column
  * names (e.g. the lines of flds.txt checked against the table schema) before
  * calling this, to avoid SQL injection.
  */
def calculateQuery (de: String)  : DataFrame =
  sqlContext.sql(
    s"""select 'UDA' as ds_nm, cyc_dt,
       |supplier_proc_i as supplier_proc_id, '$de' as data_elm, $de as data_elm_val,
       |count(1) as derx_val_cnt, current_timestamp as load_dt
       |from SPRINT2_TEST2
       |group by 'UDA', cyc_dt, supplier_proc_i, '$de', $de""".stripMargin)

/** Writes the computed result set into the Hive table sprint2_stp1_test2. */
def persistResults (calculatedQuery: DataFrame): Unit =
  calculatedQuery.write.insertInto("sprint2_stp1_test2")

dataElements.map(calculateQuery).foreach(persistResults)


Thanks.

Mime
View raw message