carbondata-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ravipes...@apache.org
Subject carbondata git commit: [CARBONDATA-2434] Add ExternalTableExample and LuceneDataMapExample
Date Tue, 08 May 2018 18:32:00 GMT
Repository: carbondata
Updated Branches:
  refs/heads/master 09feb9cc2 -> d5da9a194


[CARBONDATA-2434] Add ExternalTableExample and LuceneDataMapExample

For preparing 1.4.0 release.

This closes #2268


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/d5da9a19
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/d5da9a19
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/d5da9a19

Branch: refs/heads/master
Commit: d5da9a19434b9a28912110dac422cc89630ac93e
Parents: 09feb9c
Author: chenliang613 <chenliang613@huawei.com>
Authored: Fri May 4 10:35:52 2018 +0800
Committer: ravipesala <ravi.pesala@gmail.com>
Committed: Wed May 9 00:01:46 2018 +0530

----------------------------------------------------------------------
 .../examples/ExternalTableExample.scala         | 104 +++++++++++++++++
 .../examples/LuceneDataMapExample.scala         | 116 +++++++++++++++++++
 .../examples/PreAggregateDataMapExample.scala   |  12 +-
 .../carbondata/examplesCI/RunExamples.scala     |   8 ++
 4 files changed, 234 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/d5da9a19/examples/spark2/src/main/scala/org/apache/carbondata/examples/ExternalTableExample.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/ExternalTableExample.scala
b/examples/spark2/src/main/scala/org/apache/carbondata/examples/ExternalTableExample.scala
new file mode 100644
index 0000000..9d5ee8e
--- /dev/null
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/ExternalTableExample.scala
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.examples
+
+import java.io.File
+
+import org.apache.spark.sql.{CarbonEnv, SaveMode, SparkSession}
+
+import org.apache.carbondata.core.constants.CarbonCommonConstants
+// NOTE(review): CarbonTableIdentifier and SaveMode are not referenced anywhere in
+// this file — confirm and drop these unused imports.
+import org.apache.carbondata.core.metadata.CarbonTableIdentifier
+import org.apache.carbondata.core.util.CarbonProperties
+import org.apache.carbondata.examples.util.ExampleUtils
+
+/**
+ * This example is for showing how to create external table with location.
+ */
+
+object ExternalTableExample {
+
+  // Entry point for standalone runs: create a Carbon session, run the example
+  // body, then close the session.
+  def main(args: Array[String]) {
+    val spark = ExampleUtils.createCarbonSession("ExternalTableExample")
+    exampleBody(spark)
+    spark.close()
+  }
+
+  /**
+   * Creates origin_table, loads it from the bundled CSV, then creates an
+   * external table pointing at origin_table's storage location and queries it.
+   * Also invoked by the RunExamples CI suite, so it drops its tables on exit.
+   */
+  def exampleBody(spark : SparkSession): Unit = {
+
+    // NOTE(review): mutates a process-global singleton and never restores the
+    // previous date format — later examples in the same JVM inherit this setting.
+    CarbonProperties.getInstance()
+      .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy/MM/dd")
+
+    // Create origin_table
+    spark.sql("DROP TABLE IF EXISTS origin_table")
+    spark.sql(
+      s"""
+         | CREATE TABLE origin_table(
+         | shortField SHORT,
+         | intField INT,
+         | bigintField LONG,
+         | doubleField DOUBLE,
+         | stringField STRING,
+         | timestampField TIMESTAMP,
+         | decimalField DECIMAL(18,2),
+         | dateField DATE,
+         | charField CHAR(5),
+         | floatField FLOAT
+         | )
+         | STORED BY 'carbondata'
+       """.stripMargin)
+
+    // Resolve the repository root relative to the compiled-classes directory,
+    // then locate the bundled sample CSV under examples/spark2.
+    val rootPath = new File(this.getClass.getResource("/").getPath
+                            + "../../../..").getCanonicalPath
+    val path = s"$rootPath/examples/spark2/src/main/resources/data.csv"
+
+    // load 4 times, each load has 10 rows data
+    // scalastyle:off
+    (1 to 4).foreach(_ => spark.sql(
+      s"""
+         | LOAD DATA LOCAL INPATH '$path'
+         | INTO TABLE origin_table
+         | OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#')
+       """.stripMargin))
+    // scalastyle:on
+
+    // 40 rows
+    spark.sql("SELECT count(*) FROM origin_table").show()
+
+    // Physical storage path of origin_table; the external table is anchored here.
+    val origin_table_path = CarbonEnv.getTablePath(Some("default"), "origin_table")(spark)
+
+    // Create external_table
+    spark.sql("DROP TABLE IF EXISTS external_table")
+    spark.sql("CREATE EXTERNAL TABLE external_table STORED BY 'carbondata'" +
+              s" LOCATION '$origin_table_path'")
+    spark.sql("SELECT count(*) FROM external_table").show()
+
+    // Load 2 times again
+    (1 to 2).foreach(_ => spark.sql(
+      s"""
+         | LOAD DATA LOCAL INPATH '$path'
+         | INTO TABLE origin_table
+         | OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#')
+       """.stripMargin))
+
+    // The external table shares origin_table's location, so the two extra loads
+    // above should be visible here as well (presumably 60 rows — TODO confirm).
+    spark.sql("SELECT count(*) FROM external_table").show()
+
+    // Drop tables
+    spark.sql("DROP TABLE IF EXISTS origin_table")
+    spark.sql("DROP TABLE IF EXISTS external_table")
+  }
+}

http://git-wip-us.apache.org/repos/asf/carbondata/blob/d5da9a19/examples/spark2/src/main/scala/org/apache/carbondata/examples/LuceneDataMapExample.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/LuceneDataMapExample.scala
b/examples/spark2/src/main/scala/org/apache/carbondata/examples/LuceneDataMapExample.scala
new file mode 100644
index 0000000..efe2a63
--- /dev/null
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/LuceneDataMapExample.scala
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.carbondata.examples
+
+import org.apache.spark.sql.{SaveMode, SparkSession}
+
+import org.apache.carbondata.examples.util.ExampleUtils
+
+
+/**
+ * This example is for lucene datamap.
+ */
+
+object LuceneDataMapExample {
+
+  // Entry point for standalone runs: create a Carbon session, run the example
+  // body, then close the session.
+  def main(args: Array[String]) {
+    val spark = ExampleUtils.createCarbonSession("LuceneDataMapExample")
+    exampleBody(spark)
+    spark.close()
+  }
+
+  /**
+   * Writes a generated table, creates a lucene datamap on it, compares a LIKE
+   * scan against TEXT_MATCH, and demonstrates TEXT_MATCH / TEXT_MATCH_WITH_LIMIT.
+   * Also invoked by the RunExamples CI suite, so it drops the table on exit.
+   */
+  def exampleBody(spark : SparkSession): Unit = {
+
+    // Build the test data; increase the row count for a more obvious comparison.
+    // If the data set is larger than 100M, it will take 10+ mins.
+    import scala.util.Random
+
+    import spark.implicits._
+    val r = new Random()
+    // 100,000 rows; "id" embeds random search tokens such as "test1234 good5".
+    val df = spark.sparkContext.parallelize(1 to 10 * 10 * 1000)
+      .map(x => ("which test" + r.nextInt(10000) + " good" + r.nextInt(10),
+      "who and name" + x % 8, "city" + x % 50, x % 60))
+      .toDF("id", "name", "city", "age")
+
+    spark.sql("DROP TABLE IF EXISTS personTable")
+    df.write.format("carbondata")
+      .option("tableName", "personTable")
+      .option("compress", "true")
+      .mode(SaveMode.Overwrite).save()
+
+    // create lucene datamap on personTable, indexing the "id" and "name" columns
+    spark.sql(
+      s"""
+         | CREATE DATAMAP IF NOT EXISTS dm ON TABLE personTable
+         | USING 'lucene'
+         | DMProperties('INDEX_COLUMNS'='id , name')
+      """.stripMargin)
+
+    spark.sql("refresh datamap dm ON TABLE personTable")
+
+    // 1. Compare the performance:
+
+    // Runs `code` once and returns its wall-clock duration in seconds.
+    def time(code: => Unit): Double = {
+      val start = System.currentTimeMillis()
+      code
+      // return time in second
+      (System.currentTimeMillis() - start).toDouble / 1000
+    }
+
+    val time_without_lucenedatamap = time {
+
+      spark.sql(
+        s"""
+           | SELECT count(*)
+           | FROM personTable where id like '% test1 %'
+      """.stripMargin).show()
+
+    }
+
+    val time_with_lucenedatamap = time {
+
+      spark.sql(
+        s"""
+           | SELECT count(*)
+           | FROM personTable where TEXT_MATCH('id:test1')
+      """.stripMargin).show()
+
+    }
+
+    // NOTE(review): single unwarmed runs — JVM/cache warm-up can skew this comparison.
+    // scalastyle:off
+    println("time for query on table with lucene datamap table:" + time_with_lucenedatamap.toString)
+    println("time for query on table without lucene datamap table:" + time_without_lucenedatamap.toString)
+    // scalastyle:on
+
+    // 2. Search for word "test1" and not "good1" in the id field
+    spark.sql(
+      s"""
+         | SELECT id,name
+         | FROM personTable where TEXT_MATCH('id:test1 -id:good1')
+      """.stripMargin).show(100)
+
+    // 3. TEXT_MATCH_WITH_LIMIT usage: cap the matched result at 10 rows
+    spark.sql(
+      s"""
+         | SELECT id,name
+         | FROM personTable where TEXT_MATCH_WITH_LIMIT('id:test1',10)
+      """.stripMargin).show()
+
+    spark.sql("DROP TABLE IF EXISTS personTable")
+  }
+}

http://git-wip-us.apache.org/repos/asf/carbondata/blob/d5da9a19/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateDataMapExample.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateDataMapExample.scala
b/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateDataMapExample.scala
index 367c011..b008bd4 100644
--- a/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateDataMapExample.scala
+++ b/examples/spark2/src/main/scala/org/apache/carbondata/examples/PreAggregateDataMapExample.scala
@@ -65,10 +65,6 @@ object PreAggregateDataMapExample {
        LOAD DATA LOCAL INPATH '$testData' into table mainTable
        """)
 
-    spark.sql("""
-       select * from mainTable
-       """)
-
     spark.sql(s"""
        LOAD DATA LOCAL INPATH '$testData' into table mainTable_other
        """)
@@ -152,16 +148,20 @@ object PreAggregateDataMapExample {
 
     // 2.compare the performance : with pre-aggregate VS main table
 
-    // build test data, if set the data is larger than 100M, it will take 10+ mins.
+    // build the test data, please increase the data for more obvious comparison.
+    // if set the data is larger than 100M, it will take 10+ mins.
+
     import spark.implicits._
 
     import scala.util.Random
     val r = new Random()
-    val df = spark.sparkContext.parallelize(1 to 10 * 1000 * 1000)
+    val df = spark.sparkContext.parallelize(1 to 10 * 10 * 1000)
       .map(x => ("No." + r.nextInt(100000), "name" + x % 8, "city" + x % 50, x % 60))
       .toDF("ID", "name", "city", "age")
 
     // Create table with pre-aggregate
+    spark.sql("DROP TABLE IF EXISTS personTable")
+    spark.sql("DROP TABLE IF EXISTS personTableWithoutAgg")
     df.write.format("carbondata")
       .option("tableName", "personTable")
       .option("compress", "true")

http://git-wip-us.apache.org/repos/asf/carbondata/blob/d5da9a19/examples/spark2/src/test/scala/org/apache/carbondata/examplesCI/RunExamples.scala
----------------------------------------------------------------------
diff --git a/examples/spark2/src/test/scala/org/apache/carbondata/examplesCI/RunExamples.scala
b/examples/spark2/src/test/scala/org/apache/carbondata/examplesCI/RunExamples.scala
index ddf8ee5..2b9b999 100644
--- a/examples/spark2/src/test/scala/org/apache/carbondata/examplesCI/RunExamples.scala
+++ b/examples/spark2/src/test/scala/org/apache/carbondata/examplesCI/RunExamples.scala
@@ -105,4 +105,12 @@ class RunExamples extends QueryTest with BeforeAndAfterAll {
   test("TimeSeriesPreAggregateTableExample") {
     TimeSeriesPreAggregateTableExample.exampleBody(spark)
   }
+
+  test("LuceneDataMapExample") {
+    LuceneDataMapExample.exampleBody(spark)
+  }
+
+  test("ExternalTableExample") {
+    ExternalTableExample.exampleBody(spark)
+  }
 }
\ No newline at end of file


Mime
View raw message