carbondata-commits mailing list archives

From jack...@apache.org
Subject carbondata git commit: [CARBONDATA-2650][Datamap] Fix bugs in negative number of skipped blocklets
Date Thu, 05 Jul 2018 01:29:32 GMT
Repository: carbondata
Updated Branches:
  refs/heads/master 5195d7f50 -> aeb2ec4cd


[CARBONDATA-2650][Datamap] Fix bugs in negative number of skipped blocklets

Currently in carbondata, the default blocklet datamap is used first to
prune blocklets, and the other index datamaps are applied afterwards.
But those index datamaps prune at segment scope, so in some scenarios
their pruned result is bigger than that of the default datamap, which
made the number of skipped blocklets in the explain query output go
negative.

Here we intersect each datamap's pruned result with the previously
pruned result, and finish pruning early if the result becomes empty,
as sketched below.
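
A minimal, self-contained sketch of the resulting flow (PruneFlowSketch
and its helper names are illustrative stand-ins, not the actual
CarbonInputFormat API; the real code intersects with
CollectionUtils.intersection rather than retainAll):

import java.util.ArrayList;
import java.util.List;

final class PruneFlowSketch {

  // Keep only the candidates that also survived the previous stage.
  // The result can never be larger than 'survivors', so the skipped
  // count (total - pruned) can no longer go negative.
  static <T> List<T> intersect(List<T> candidates, List<T> survivors) {
    List<T> out = new ArrayList<>(candidates);
    out.retainAll(survivors);
    return out;
  }

  // Each argument stands for the pruned result of one datamap stage.
  static <T> List<T> prune(List<T> defaultPruned,
                           List<T> cgPruned,
                           List<T> fgPruned) {
    List<T> pruned = defaultPruned;   // stage 1: default blocklet datamap
    if (pruned.isEmpty()) {
      return pruned;                  // empty result: finish pruning early
    }
    pruned = intersect(cgPruned, pruned);   // stage 2: CG datamap (segment scope)
    if (pruned.isEmpty()) {
      return pruned;
    }
    return intersect(fgPruned, pruned);     // stage 3: FG datamap (segment scope)
  }
}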

This closes #2410


Project: http://git-wip-us.apache.org/repos/asf/carbondata/repo
Commit: http://git-wip-us.apache.org/repos/asf/carbondata/commit/aeb2ec4c
Tree: http://git-wip-us.apache.org/repos/asf/carbondata/tree/aeb2ec4c
Diff: http://git-wip-us.apache.org/repos/asf/carbondata/diff/aeb2ec4c

Branch: refs/heads/master
Commit: aeb2ec4cde000531806cbec4f726f039c935a495
Parents: 5195d7f
Author: xuchuanyin <xuchuanyin@hust.edu.cn>
Authored: Tue Jun 26 14:11:17 2018 +0800
Committer: Jacky Li <jacky.likun@qq.com>
Committed: Thu Jul 5 09:29:21 2018 +0800

----------------------------------------------------------------------
 .../hadoop/api/CarbonInputFormat.java           | 51 ++++++++++++--------
 .../bloom/BloomCoarseGrainDataMapSuite.scala    |  8 ++-
 2 files changed, 35 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/carbondata/blob/aeb2ec4c/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonInputFormat.java
----------------------------------------------------------------------
diff --git a/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonInputFormat.java b/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonInputFormat.java
index 3688026..4b30ddf 100644
--- a/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonInputFormat.java
+++ b/hadoop/src/main/java/org/apache/carbondata/hadoop/api/CarbonInputFormat.java
@@ -66,6 +66,7 @@ import org.apache.carbondata.hadoop.CarbonRecordReader;
 import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport;
 import org.apache.carbondata.hadoop.readsupport.impl.DictionaryDecodeReadSupport;
 
+import org.apache.commons.collections.CollectionUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -415,6 +416,7 @@ m filterExpression
 
   /**
    * Prune the blocklets using the filter expression with available datamaps.
+   * First prune with the default blocklet datamap, then prune with the CG and FG datamaps.
    */
   private List<ExtendedBlocklet> getPrunedBlocklets(JobContext job, CarbonTable carbonTable,
       FilterResolverIntf resolver, List<Segment> segmentIds) throws IOException {
@@ -431,13 +433,15 @@ m filterExpression
     DataMapJob dataMapJob = DataMapUtil.getDataMapJob(job.getConfiguration());
     List<PartitionSpec> partitionsToPrune = getPartitionsToPrune(job.getConfiguration());
     // First prune using default datamap on driver side.
-    DataMapExprWrapper dataMapExprWrapper = DataMapChooser
-        .getDefaultDataMap(getOrCreateCarbonTable(job.getConfiguration()), resolver);
-    List<ExtendedBlocklet> prunedBlocklets =
-        dataMapExprWrapper.prune(segmentIds, partitionsToPrune);
-
-    ExplainCollector.recordDefaultDataMapPruning(
-        dataMapExprWrapper.getDataMapSchema(), prunedBlocklets.size());
+    DataMapExprWrapper dataMapExprWrapper = DataMapChooser.getDefaultDataMap(
+        getOrCreateCarbonTable(job.getConfiguration()), resolver);
+    List<ExtendedBlocklet> prunedBlocklets = dataMapExprWrapper.prune(segmentIds,
+        partitionsToPrune);
+    ExplainCollector.recordDefaultDataMapPruning(dataMapExprWrapper.getDataMapSchema(),
+        prunedBlocklets.size());
+    if (prunedBlocklets.size() == 0) {
+      return prunedBlocklets;
+    }
 
     DataMapChooser chooser = new DataMapChooser(getOrCreateCarbonTable(job.getConfiguration()));
 
@@ -446,32 +450,41 @@ m filterExpression
     if (cgDataMapExprWrapper != null) {
       // Prune segments from already pruned blocklets
       pruneSegments(segmentIds, prunedBlocklets);
+      List<ExtendedBlocklet> cgPrunedBlocklets;
       // Again prune with CG datamap.
       if (distributedCG && dataMapJob != null) {
-        prunedBlocklets = DataMapUtil
-            .executeDataMapJob(carbonTable, resolver, segmentIds, cgDataMapExprWrapper, dataMapJob,
-                partitionsToPrune);
+        cgPrunedBlocklets = DataMapUtil.executeDataMapJob(carbonTable,
+            resolver, segmentIds, cgDataMapExprWrapper, dataMapJob, partitionsToPrune);
       } else {
-        prunedBlocklets = cgDataMapExprWrapper.prune(segmentIds, partitionsToPrune);
+        cgPrunedBlocklets = cgDataMapExprWrapper.prune(segmentIds, partitionsToPrune);
       }
-
+      // since index datamaps prune at segment scope,
+      // the result needs to be intersected with the previously pruned result
+      prunedBlocklets = (List) CollectionUtils.intersection(
+          cgPrunedBlocklets, prunedBlocklets);
       ExplainCollector.recordCGDataMapPruning(
           cgDataMapExprWrapper.getDataMapSchema(), prunedBlocklets.size());
     }
+
+    if (prunedBlocklets.size() == 0) {
+      return prunedBlocklets;
+    }
     // Now try to prune with FG DataMap.
     if (isFgDataMapPruningEnable(job.getConfiguration()) && dataMapJob != null) {
       DataMapExprWrapper fgDataMapExprWrapper = chooser.chooseFGDataMap(resolver);
       if (fgDataMapExprWrapper != null) {
         // Prune segments from already pruned blocklets
         pruneSegments(segmentIds, prunedBlocklets);
-        prunedBlocklets = DataMapUtil
-            .executeDataMapJob(carbonTable, resolver, segmentIds, fgDataMapExprWrapper, dataMapJob,
-                partitionsToPrune);
-
-        ExplainCollector.recordFGDataMapPruning(
-            fgDataMapExprWrapper.getDataMapSchema(), prunedBlocklets.size());
+        List<ExtendedBlocklet> fgPrunedBlocklets = DataMapUtil.executeDataMapJob(carbonTable,
+            resolver, segmentIds, fgDataMapExprWrapper, dataMapJob, partitionsToPrune);
+        // note that 'fgPrunedBlocklets' carries extra datamap-related info compared with
+        // 'prunedBlocklets', so the intersection should keep the elements from 'fgPrunedBlocklets'
+        prunedBlocklets = (List) CollectionUtils.intersection(fgPrunedBlocklets,
+            prunedBlocklets);
+        ExplainCollector.recordFGDataMapPruning(fgDataMapExprWrapper.getDataMapSchema(),
+            prunedBlocklets.size());
       }
-    } // TODO: add a else branch to push FGDataMap pruning to reader side
+    }
     return prunedBlocklets;
   }
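
Why argument order matters in the two CollectionUtils.intersection calls
above: which elements match is symmetric, but which instances survive is
not. In commons-collections 3.x (the version imported by this patch) the
result is built from a union set seeded with the first argument, so equal
elements keep the first argument's instances. That is an implementation
detail rather than documented API, so treat it as an assumption to verify.
A small standalone check, where Blocklet is a hypothetical stand-in that
compares equal by id only:

import java.util.Arrays;
import java.util.List;
import org.apache.commons.collections.CollectionUtils;

final class IntersectionOrderDemo {

  static final class Blocklet {
    final String id;
    final String dataMapInfo;   // extra info that only the FG result carries

    Blocklet(String id, String dataMapInfo) {
      this.id = id;
      this.dataMapInfo = dataMapInfo;
    }

    // Equality by id only, so FG copies and earlier copies compare equal.
    @Override public boolean equals(Object o) {
      return o instanceof Blocklet && ((Blocklet) o).id.equals(id);
    }
    @Override public int hashCode() { return id.hashCode(); }
  }

  public static void main(String[] args) {
    List fg = Arrays.asList(new Blocklet("b1", "fg-info"));
    List previous = Arrays.asList(new Blocklet("b1", null), new Blocklet("b2", null));

    List result = (List) CollectionUtils.intersection(fg, previous);

    // "b2" is dropped, and the surviving "b1" is the FG instance:
    System.out.println(((Blocklet) result.get(0)).dataMapInfo);   // prints fg-info
  }
}

Passing cgPrunedBlocklets/fgPrunedBlocklets as the first argument therefore
preserves the datamap-related info those instances carry.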
 

http://git-wip-us.apache.org/repos/asf/carbondata/blob/aeb2ec4c/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapSuite.scala
----------------------------------------------------------------------
diff --git a/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapSuite.scala b/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapSuite.scala
index a8e4193..a29f0f2 100644
--- a/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapSuite.scala
+++ b/integration/spark2/src/test/scala/org/apache/carbondata/datamap/bloom/BloomCoarseGrainDataMapSuite.scala
@@ -50,11 +50,9 @@ class BloomCoarseGrainDataMapSuite extends QueryTest with BeforeAndAfterAll with
   }
 
   private def checkSqlHitDataMap(sqlText: String, dataMapName: String, shouldHit: Boolean):
DataFrame = {
-    if (shouldHit) {
-      assert(sqlContext.sparkSession.asInstanceOf[CarbonSession].isDataMapHit(sqlText, dataMapName))
-    } else {
-      assert(!sqlContext.sparkSession.asInstanceOf[CarbonSession].isDataMapHit(sqlText, dataMapName))
-    }
+    // skip the datamap hit check, because the bloom datamap may be skipped if the
+    // default blocklet datamap has already pruned all the blocklets, so we cannot
+    // tell from the query alone whether the index datamap will be hit.
     sql(sqlText)
   }
 

