hive-issues mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "ASF GitHub Bot (Jira)" <j...@apache.org>
Subject [jira] [Work logged] (HIVE-25067) Add more tests to Iceberg partition pruning
Date Mon, 26 Jul 2021 13:22:00 GMT

     [ https://issues.apache.org/jira/browse/HIVE-25067?focusedWorklogId=627738&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-627738
]

ASF GitHub Bot logged work on HIVE-25067:
-----------------------------------------

                Author: ASF GitHub Bot
            Created on: 26/Jul/21 13:21
            Start Date: 26/Jul/21 13:21
    Worklog Time Spent: 10m 
      Work Description: marton-bod commented on a change in pull request #2521:
URL: https://github.com/apache/hive/pull/2521#discussion_r676597621



##########
File path: iceberg/iceberg-handler/src/test/queries/positive/dynamic_partition_pruning.q
##########
@@ -0,0 +1,160 @@
+--! qt:dataset:srcpart
+set hive.compute.query.using.stats=false;
+set hive.mapred.mode=nonstrict;
+set hive.explain.user=false;
+set hive.optimize.ppd=true;
+set hive.ppd.remove.duplicatefilters=true;
+set hive.tez.dynamic.partition.pruning=true;
+set hive.optimize.metadataonly=false;
+set hive.optimize.index.filter=true;
+set hive.tez.min.bloom.filter.entries=1;
+set hive.tez.bigtable.minsize.semijoin.reduction=1;
+
+select distinct ds from srcpart;
+select distinct hr from srcpart;
+
+CREATE TABLE srcpart_iceberg (key STRING, value STRING)
+PARTITIONED BY (ds STRING, hr STRING) STORED BY iceberg;
+INSERT INTO srcpart_iceberg select * from srcpart;
+
+EXPLAIN create table srcpart_date_n2 as select ds as ds, ds as `date`  from srcpart group
by ds;
+create table srcpart_date_n2 as select ds as ds, ds as `date` from srcpart group by ds;
+create table srcpart_hour_n0 as select hr as hr, hr as hour from srcpart group by hr;
+create table srcpart_date_hour_n0 as select ds as ds, ds as `date`, hr as hr, hr as hour
from srcpart group by ds, hr;
+create table srcpart_double_hour_n0 as select (hr*2) as hr, hr as hour from srcpart group
by hr;
+
+-- single column, single key
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) where srcpart_date_n2.`date` = '2008-04-08';
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
where srcpart_date_n2.`date` = '2008-04-08';
+set hive.tez.dynamic.partition.pruning=false;
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) where srcpart_date_n2.`date` = '2008-04-08';
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
where srcpart_date_n2.`date` = '2008-04-08';
+set hive.tez.dynamic.partition.pruning=true;
+select count(*) from srcpart where ds = '2008-04-08';
+
+-- multiple sources, single key
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_hour_n0.hour = 11;
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_hour_n0.hour = 11;
+set hive.tez.dynamic.partition.pruning=false;
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_hour_n0.hour = 11;
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_hour_n0.hour = 11;
+set hive.tez.dynamic.partition.pruning=true;
+select count(*) from srcpart where hr = 11 and ds = '2008-04-08';
+
+-- multiple columns single source
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_hour_n0 on (srcpart_iceberg.ds
= srcpart_date_hour_n0.ds and srcpart_iceberg.hr = srcpart_date_hour_n0.hr) where srcpart_date_hour_n0.`date`
= '2008-04-08' and srcpart_date_hour_n0.hour = 11;
+select count(*) from srcpart_iceberg join srcpart_date_hour_n0 on (srcpart_iceberg.ds = srcpart_date_hour_n0.ds
and srcpart_iceberg.hr = srcpart_date_hour_n0.hr) where srcpart_date_hour_n0.`date` = '2008-04-08'
and srcpart_date_hour_n0.hour = 11;
+set hive.tez.dynamic.partition.pruning=false;
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_hour_n0 on (srcpart_iceberg.ds
= srcpart_date_hour_n0.ds and srcpart_iceberg.hr = srcpart_date_hour_n0.hr) where srcpart_date_hour_n0.`date`
= '2008-04-08' and srcpart_date_hour_n0.hour = 11;
+select count(*) from srcpart_iceberg join srcpart_date_hour_n0 on (srcpart_iceberg.ds = srcpart_date_hour_n0.ds
and srcpart_iceberg.hr = srcpart_date_hour_n0.hr) where srcpart_date_hour_n0.`date` = '2008-04-08'
and srcpart_date_hour_n0.hour = 11;
+set hive.tez.dynamic.partition.pruning=true;
+select count(*) from srcpart where ds = '2008-04-08' and hr = 11;
+
+-- empty set
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) where srcpart_date_n2.`date` = 'I DONT EXIST';
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
where srcpart_date_n2.`date` = 'I DONT EXIST';
+set hive.tez.dynamic.partition.pruning=false;
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) where srcpart_date_n2.`date` = 'I DONT EXIST';
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
where srcpart_date_n2.`date` = 'I DONT EXIST';
+set hive.tez.dynamic.partition.pruning=true;
+select count(*) from srcpart where ds = 'I DONT EXIST';
+
+-- expressions
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_double_hour_n0 on (srcpart_iceberg.hr
= cast(cast(srcpart_double_hour_n0.hr/2 as int) as string)) where srcpart_double_hour_n0.hour
= 11;
+select count(*) from srcpart_iceberg join srcpart_double_hour_n0 on (srcpart_iceberg.hr =
cast(cast(srcpart_double_hour_n0.hr/2 as int) as string)) where srcpart_double_hour_n0.hour
= 11;
+set hive.tez.dynamic.partition.pruning=false;
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_double_hour_n0 on (srcpart_iceberg.hr
= cast(cast(srcpart_double_hour_n0.hr/2 as int) as string)) where srcpart_double_hour_n0.hour
= 11;
+select count(*) from srcpart_iceberg join srcpart_double_hour_n0 on (srcpart_iceberg.hr =
cast(cast(srcpart_double_hour_n0.hr/2 as int) as string)) where srcpart_double_hour_n0.hour
= 11;
+set hive.tez.dynamic.partition.pruning=true;
+
+-- old style join syntax
+EXPLAIN select count(*) from srcpart_iceberg, srcpart_date_hour_n0 where srcpart_date_hour_n0.`date`
= '2008-04-08' and srcpart_date_hour_n0.hour = 11 and srcpart_iceberg.ds = srcpart_date_hour_n0.ds
and srcpart_iceberg.hr = srcpart_date_hour_n0.hr;
+select count(*) from srcpart_iceberg, srcpart_date_hour_n0 where srcpart_date_hour_n0.`date`
= '2008-04-08' and srcpart_date_hour_n0.hour = 11 and srcpart_iceberg.ds = srcpart_date_hour_n0.ds
and srcpart_iceberg.hr = srcpart_date_hour_n0.hr;
+
+-- with static pruning
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_hour_n0.hour = 11 and srcpart_iceberg.hr
= 11;
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_hour_n0.hour = 11 and srcpart_iceberg.hr
= 11;
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_iceberg.hr = 13;
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_iceberg.hr = 13;
+
+-- union + subquery
+EXPLAIN select count(*) from srcpart_iceberg where srcpart_iceberg.ds in (select max(srcpart_iceberg.ds)
from srcpart_iceberg union all select min(srcpart_iceberg.ds) from srcpart_iceberg);
+select count(*) from srcpart_iceberg where srcpart_iceberg.ds in (select max(srcpart_iceberg.ds)
from srcpart_iceberg union all select min(srcpart_iceberg.ds) from srcpart_iceberg);
+EXPLAIN select distinct(ds) from srcpart_iceberg where srcpart_iceberg.ds in (select max(srcpart_iceberg.ds)
from srcpart_iceberg union all select min(srcpart_iceberg.ds) from srcpart_iceberg);
+select distinct(ds) from srcpart_iceberg where srcpart_iceberg.ds in (select max(srcpart_iceberg.ds)
from srcpart_iceberg union all select min(srcpart_iceberg.ds) from srcpart_iceberg);
+EXPLAIN select ds from (select distinct(ds) as ds from srcpart_iceberg union all select distinct(ds)
as ds from srcpart_iceberg) s where s.ds in (select max(srcpart_iceberg.ds) from srcpart_iceberg
union all select min(srcpart_iceberg.ds) from srcpart_iceberg);
+select ds from (select distinct(ds) as ds from srcpart_iceberg union all select distinct(ds)
as ds from srcpart_iceberg) s where s.ds in (select max(srcpart_iceberg.ds) from srcpart_iceberg
union all select min(srcpart_iceberg.ds) from srcpart_iceberg);
+
+set hive.auto.convert.join=true;
+set hive.auto.convert.join.noconditionaltask = true;
+set hive.auto.convert.join.noconditionaltask.size = 10000000;
+
+-- single column, single key
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) where srcpart_date_n2.`date` = '2008-04-08';
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
where srcpart_date_n2.`date` = '2008-04-08';
+select count(*) from srcpart where ds = '2008-04-08';
+
+-- multiple sources, single key
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_hour_n0.hour = 11;
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_hour_n0.hour = 11;
+select count(*) from srcpart where hr = 11 and ds = '2008-04-08';
+
+-- multiple columns single source
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_hour_n0 on (srcpart_iceberg.ds
= srcpart_date_hour_n0.ds and srcpart_iceberg.hr = srcpart_date_hour_n0.hr) where srcpart_date_hour_n0.`date`
= '2008-04-08' and srcpart_date_hour_n0.hour = 11;
+select count(*) from srcpart_iceberg join srcpart_date_hour_n0 on (srcpart_iceberg.ds = srcpart_date_hour_n0.ds
and srcpart_iceberg.hr = srcpart_date_hour_n0.hr) where srcpart_date_hour_n0.`date` = '2008-04-08'
and srcpart_date_hour_n0.hour = 11;
+select count(*) from srcpart where ds = '2008-04-08' and hr = 11;
+
+-- empty set
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) where srcpart_date_n2.`date` = 'I DONT EXIST';
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
where srcpart_date_n2.`date` = 'I DONT EXIST';
+
+-- expressions
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_double_hour_n0 on (srcpart_iceberg.hr
= cast(cast(srcpart_double_hour_n0.hr/2 as int) as string)) where srcpart_double_hour_n0.hour
= 11;
+select count(*) from srcpart_iceberg join srcpart_double_hour_n0 on (srcpart_iceberg.hr =
cast(cast(srcpart_double_hour_n0.hr/2 as int) as string)) where srcpart_double_hour_n0.hour
= 11;
+select count(*) from srcpart where hr = 11;
+
+-- with static pruning
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_hour_n0.hour = 11 and srcpart_iceberg.hr
= 11;
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_hour_n0.hour = 11 and srcpart_iceberg.hr
= 11;
+EXPLAIN select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds
= srcpart_date_n2.ds) join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_iceberg.hr = 13;
+
+select count(*) from srcpart_iceberg join srcpart_date_n2 on (srcpart_iceberg.ds = srcpart_date_n2.ds)
join srcpart_hour_n0 on (srcpart_iceberg.hr = srcpart_hour_n0.hr)
+where srcpart_date_n2.`date` = '2008-04-08' and srcpart_iceberg.hr = 13;
+
+-- union + subquery
+EXPLAIN select distinct(ds) from srcpart_iceberg where srcpart_iceberg.ds in (select max(srcpart_iceberg.ds)
from srcpart_iceberg union all select min(srcpart_iceberg.ds) from srcpart_iceberg);
+select distinct(ds) from srcpart_iceberg where srcpart_iceberg.ds in (select max(srcpart_iceberg.ds)
from srcpart_iceberg union all select min(srcpart_iceberg.ds) from srcpart_iceberg);
+
+
+-- Two iceberg tables
+
+create table srcpart_date_hour_n0_iceberg (ds string, `date` string, hr string, hour string)
+STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler';
+INSERT INTO srcpart_date_hour_n0_iceberg select ds as ds, ds as `date`, hr as hr, hr as hour
from srcpart group by ds, hr;
+
+set hive.vectorized.execution.enabled=false;

Review comment:
       should we remove it from here too?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: gitbox-unsubscribe@hive.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Issue Time Tracking
-------------------

    Worklog Id:     (was: 627738)
    Time Spent: 40m  (was: 0.5h)

> Add more tests to Iceberg partition pruning
> -------------------------------------------
>
>                 Key: HIVE-25067
>                 URL: https://issues.apache.org/jira/browse/HIVE-25067
>             Project: Hive
>          Issue Type: Test
>            Reporter: Peter Vary
>            Assignee: Peter Vary
>            Priority: Major
>              Labels: pull-request-available
>          Time Spent: 40m
>  Remaining Estimate: 0h
>
> As we have qtest for Iceberg now, it would be good to add some partition pruning qtest
to have better coverage



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Mime
View raw message