drill-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Victoria Markman (JIRA)" <j...@apache.org>
Subject [jira] [Created] (DRILL-2369) It takes too long to plan inner join query with 10 join predicates
Date Tue, 03 Mar 2015 23:26:05 GMT
Victoria Markman created DRILL-2369:
---------------------------------------

             Summary: It takes too long to plan inner join query with 10 join predicates
                 Key: DRILL-2369
                 URL: https://issues.apache.org/jira/browse/DRILL-2369
             Project: Apache Drill
          Issue Type: Bug
          Components: Query Planning & Optimization
            Reporter: Victoria Markman
            Assignee: Jinfeng Ni


Inner join with 10 join predicates takes 11 seconds to plan.
If number of predicates goes up to 14, planning time increases to 41 seconds.

(1) ON clause with 10 columns in join condition

-- Total elapsed time : 11.407 seconds
-- Planning time      : 11.03 seconds

{code}
select
        count(*)
from
        alltypes_with_nulls a
        INNER JOIN
        alltypes_with_nulls b ON
        (
        a.c_boolean = b.c_boolean
        AND a.c_timestamp = b.c_timestamp
        AND a.c_time = b.c_time
        AND a.c_date = b.c_date
        AND a.c_float = b.c_float
        AND a.c_bigdecimal = b.c_bigdecimal
        AND a.c_smalldecimal = b.c_smalldecimal
        AND a.c_bigint = b.c_bigint
        AND a.c_integer = b.c_integer
        AND a.c_varchar = b.c_varchar
        )
;
{code}

Explain plan:

{code}
00-01      StreamAgg(group=[{}], EXPR$0=[COUNT($0)])
00-02        HashAgg(group=[{0}])
00-03          Project(c_date=[$3])
00-04            HashJoin(condition=[AND(=($0, $10), =($1, $11), =($2, $12), =($3, $13), =($4,
$14), =($5, $15), =($6, $16), =($7, $17), =($8, $18), =($9, $19))], joinType=[inner])
00-06              Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4],
c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-08                Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]],
selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`,
`c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`,
`c_integer`, `c_varchar`]]])
00-05              Project(c_boolean0=[$0], c_timestamp0=[$1], c_time0=[$2], c_date0=[$3],
c_float0=[$4], c_bigdecimal0=[$5], c_smalldecimal0=[$6], c_bigint0=[$7], c_integer0=[$8],
c_varchar0=[$9])
00-07                Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4],
c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-09                  Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]],
selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`,
`c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`,
`c_integer`, `c_varchar`]]])
{code}

(2) 10 columns in JOIN condition, in the WHERE clause (just to check that nothing funny is
going on with ON clause)

-- Total elapsed time : 11.139 seconds
-- Planning time      : 11.416 seconds

{code}
select
        count(*)
from
        alltypes_with_nulls a,
        alltypes_with_nulls b
where
        a.c_boolean = b.c_boolean
        AND a.c_timestamp = b.c_timestamp
        AND a.c_time = b.c_time
        AND a.c_date = b.c_date
        AND a.c_float = b.c_float
        AND a.c_bigdecimal = b.c_bigdecimal
        AND a.c_smalldecimal = b.c_smalldecimal
        AND a.c_bigint = b.c_bigint
        AND a.c_integer = b.c_integer
        AND a.c_varchar = b.c_varchar
;
{code}

Explain plan:

{code}
00-01      StreamAgg(group=[{}], EXPR$0=[COUNT()])
00-02        Project($f0=[0])
00-03          HashJoin(condition=[AND(=($0, $10), =($1, $11), =($2, $12), =($3, $13), =($4,
$14), =($5, $15), =($6, $16), =($7, $17), =($8, $18), =($9, $19))], joinType=[inner])
00-05            Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4],
c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-07              Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]],
selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`,
`c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`,
`c_integer`, `c_varchar`]]])
00-04            Project(c_boolean0=[$0], c_timestamp0=[$1], c_time0=[$2], c_date0=[$3], c_float0=[$4],
c_bigdecimal0=[$5], c_smalldecimal0=[$6], c_bigint0=[$7], c_integer0=[$8], c_varchar0=[$9])
00-06              Project(c_boolean=[$9], c_timestamp=[$7], c_time=[$0], c_date=[$1], c_float=[$4],
c_bigdecimal=[$6], c_smalldecimal=[$5], c_bigint=[$2], c_integer=[$8], c_varchar=[$3])
00-08                Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]],
selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_boolean`,
`c_timestamp`, `c_time`, `c_date`, `c_float`, `c_bigdecimal`, `c_smalldecimal`, `c_bigint`,
`c_integer`, `c_varchar`]]])
{code}

(3) 14 columns in join condition

-- Total elapsed time : 41.101 seconds
-- Planning time        : 39.771 seconds

{code}
select
        count(distinct a.c_date)
from
        alltypes_with_nulls a
        INNER JOIN
        alltypes_with_nulls b ON
        (
        a.c_varchar = b.c_varchar
        AND a.c_integer = b.c_integer
        AND a.c_bigint = b.c_bigint
        AND a.c_smalldecimal = b.c_smalldecimal
        AND a.c_bigdecimal = b.c_bigdecimal
        AND a.c_float = b.c_float
        AND a.c_date = b.c_date
        AND a.c_time = b.c_time
        AND a.c_timestamp = b.c_timestamp
        AND a.c_boolean = b.c_boolean
        AND a.d9 = b.d9
        AND a.d18 = b.d18
        AND a.d28 = b.d28
        AND a.d38 = b.d38
        )
;
{code}

Explain plan:
{code}
00-01      StreamAgg(group=[{}], EXPR$0=[COUNT($0)])
00-02        HashAgg(group=[{0}])
00-03          Project(c_date=[$6])
00-04            HashJoin(condition=[AND(=($0, $14), =($1, $15), =($2, $16), =($3, $17), =($4,
$18), =($5, $19), =($6, $20), =($7, $21), =($8, $22), =($9, $23), =($10, $24), =($11, $25),
=($12, $26), =($13, $27))], joinType=[inner])
00-06              Project(c_varchar=[$5], c_integer=[$11], c_bigint=[$2], c_smalldecimal=[$6],
c_bigdecimal=[$4], c_float=[$1], c_date=[$10], c_time=[$3], c_timestamp=[$9], c_boolean=[$7],
d9=[$13], d18=[$8], d28=[$0], d38=[$12])
00-08                Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]],
selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_varchar`,
`c_integer`, `c_bigint`, `c_smalldecimal`, `c_bigdecimal`, `c_float`, `c_date`, `c_time`,
`c_timestamp`, `c_boolean`, `d9`, `d18`, `d28`, `d38`]]])
00-05              Project(c_varchar0=[$0], c_integer0=[$1], c_bigint0=[$2], c_smalldecimal0=[$3],
c_bigdecimal0=[$4], c_float0=[$5], c_date0=[$6], c_time0=[$7], c_timestamp0=[$8], c_boolean0=[$9],
d90=[$10], d180=[$11], d280=[$12], d380=[$13])
00-07                Project(c_varchar=[$5], c_integer=[$11], c_bigint=[$2], c_smalldecimal=[$6],
c_bigdecimal=[$4], c_float=[$1], c_date=[$10], c_time=[$3], c_timestamp=[$9], c_boolean=[$7],
d9=[$13], d18=[$8], d28=[$0], d38=[$12])
00-09                  Scan(groupscan=[ParquetGroupScan [entries=[ReadEntryWithPath [path=maprfs:/drill/testdata/aggregation/alltypes_with_nulls]],
selectionRoot=/drill/testdata/aggregation/alltypes_with_nulls, numFiles=1, columns=[`c_varchar`,
`c_integer`, `c_bigint`, `c_smalldecimal`, `c_bigdecimal`, `c_float`, `c_date`, `c_time`,
`c_timestamp`, `c_boolean`, `d9`, `d18`, `d28`, `d38`]]])
{code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Mime
View raw message