spark-issues mailing list archives

From "Dongjoon Hyun (JIRA)" <j...@apache.org>
Subject [jira] [Updated] (SPARK-15466) Make `SparkSession` as the entry point to programming with RDD too
Date Sat, 21 May 2016 21:36:12 GMT

     [ https://issues.apache.org/jira/browse/SPARK-15466?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Dongjoon Hyun updated SPARK-15466:
----------------------------------
    Description: 
`SparkSession` greatly reduces the number of concepts Spark users must know. Currently,
`SparkSession` is defined as the entry point to programming Spark with the Dataset and DataFrame
API, and we can easily get an `RDD` back by calling `Dataset.rdd` or `DataFrame.rdd`.
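For reference, here is a minimal sketch of how an `RDD` is obtained through the Dataset/DataFrame side today (the application name and column name below are placeholders, not part of this proposal):

{code:title=Dataset.rdd sketch|borderStyle=solid}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}

val spark = SparkSession.builder.appName("RddFromDataset").getOrCreate()

val df = spark.range(0, 10).toDF("id")  // DataFrame
val rows: RDD[Row] = df.rdd             // back to the RDD API via DataFrame.rdd

spark.stop()
{code}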

However, many usages (including the bundled examples) still extract `SparkSession.sparkContext`
and keep it in a separate variable just to call `parallelize`.

If `SparkSession` supported RDDs seamlessly as well, it would be great for usability. We can do
this by simply adding a `parallelize` API.
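
The addition could be a thin delegation to the underlying `SparkContext`. Below is an illustrative sketch only (a method that would live inside `SparkSession`; the signature mirrors `SparkContext.parallelize` and is an assumption, not the final implementation):

{code:title=SparkSession.scala (sketch)|borderStyle=solid}
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD

// Sketch: a possible `parallelize` added to class SparkSession,
// simply delegating to the session's SparkContext.
def parallelize[T: ClassTag](
    seq: Seq[T],
    numSlices: Int = sparkContext.defaultParallelism): RDD[T] = {
  sparkContext.parallelize(seq, numSlices)
}
{code}

With such a method, the examples below no longer need to pull out `sparkContext` at all.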

*Example*
{code:title=SparkPi.scala|borderStyle=solid}
 import scala.math.random

 import org.apache.spark.sql.SparkSession

 object SparkPi {
   def main(args: Array[String]) {
     val spark = SparkSession
       .builder
       .appName("Spark Pi")
       .getOrCreate()
-    val sc = spark.sparkContext
     val slices = if (args.length > 0) args(0).toInt else 2
     val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow
-    val count = sc.parallelize(1 until n, slices).map { i =>
+    val count = spark.parallelize(1 until n, slices).map { i =>
       val x = random * 2 - 1
       val y = random * 2 - 1
       if (x*x + y*y < 1) 1 else 0
     }.reduce(_ + _)
     println("Pi is roughly " + 4.0 * count / n)
     spark.stop()
   }
 }
{code}

{code:title=pi.py|borderStyle=solid}
 import sys
 from random import random
 from operator import add

 from pyspark.sql import SparkSession

 spark = SparkSession\
   .builder\
   .appName("PythonPi")\
   .getOrCreate()

-sc = spark._sc
-
 partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
 n = 100000 * partitions

 def f(_):
   x = random() * 2 - 1
   y = random() * 2 - 1
   return 1 if x ** 2 + y ** 2 < 1 else 0

-count = sc.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
+count = spark.parallelize(range(1, n + 1), partitions).map(f).reduce(add)
 print("Pi is roughly %f" % (4.0 * count / n))

 spark.stop()
{code}



> Make `SparkSession` as the entry point to programming with RDD too
> ------------------------------------------------------------------
>
>                 Key: SPARK-15466
>                 URL: https://issues.apache.org/jira/browse/SPARK-15466
>             Project: Spark
>          Issue Type: Improvement
>          Components: Examples, SQL
>            Reporter: Dongjoon Hyun
>



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)


