hive提前过滤重要性

hive提前过滤

create table sospdm.tmp_yinfei_test_01 
(
    id string
)
partitioned by (statis_date string)
;

create table sospdm.tmp_yinfei_test_02 
(
    id string
)
partitioned by (statis_date string)
;

select t1.* 
from tmp_yinfei_test_01 t1 
left join tmp_yinfei_test_02 t2
on t1.id=t2.id
where t1.statis_date='20190408' and t2.statis_date='20190408'
;
select t1.* 
from tmp_yinfei_test_01 t1 
left join tmp_yinfei_test_02 t2
on t1.id=t2.id and t1.statis_date='20190408' and t2.statis_date='20190408'
;
select t1.*
from 
(
select * from tmp_yinfei_test_01 where statis_date='20190408'
) t1 
left join 
(
select * from tmp_yinfei_test_02 where statis_date='20190408'
) t2 
on t1.id=t2.id
;
=========================test1=====================================

explain select t1.* 
from tmp_yinfei_test_01 t1 
left join tmp_yinfei_test_02 t2
on t1.id=t2.id
where t1.statis_date='20190408' and t2.statis_date='20190408'
;

hive> explain select t1.* 
    > from tmp_yinfei_test_01 t1
    > left join tmp_yinfei_test_02 t2
    > on t1.id=t2.id
    > where t1.statis_date='20190408' and t2.statis_date='20190408'
    > ;
OK
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: t1
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
            Filter Operator
              predicate: (statis_date = '20190408') (type: boolean)
              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
              Reduce Output Operator
                key expressions: id (type: string)
                sort order: +
                Map-reduce partition columns: id (type: string)
                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
          TableScan
            alias: t2
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
            Reduce Output Operator
              key expressions: id (type: string)
              sort order: +
              Map-reduce partition columns: id (type: string)
              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
              value expressions: statis_date (type: string)
      Reduce Operator Tree:
        Join Operator
          condition map:
               Left Outer Join0 to 1
          keys:
            0 id (type: string)
            1 id (type: string)
          outputColumnNames: _col0, _col6
          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
          Filter Operator
            predicate: (_col6 = '20190408') (type: boolean)
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
            Select Operator
              expressions: _col0 (type: string), '20190408' (type: string)
              outputColumnNames: _col0, _col1
              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
              File Output Operator
                compressed: true
                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                table:
                    input format: org.apache.hadoop.mapred.TextInputFormat
                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink

Time taken: 0.399 seconds, Fetched: 58 row(s)

结论:t2表会扫全表
=========================test2=====================================
explain select t1.* 
from tmp_yinfei_test_01 t1 
left join tmp_yinfei_test_02 t2
on t1.id=t2.id and t1.statis_date='20190408' and t2.statis_date='20190408'
;
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: t1
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
            Reduce Output Operator
              key expressions: id (type: string)
              sort order: +
              Map-reduce partition columns: id (type: string)
              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
              value expressions: statis_date (type: string)
          TableScan
            alias: t2
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
            Filter Operator
              predicate: (statis_date = '20190408') (type: boolean)
              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
              Reduce Output Operator
                key expressions: id (type: string)
                sort order: +
                Map-reduce partition columns: id (type: string)
                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
      Reduce Operator Tree:
        Join Operator
          condition map:
               Left Outer Join0 to 1
          filter predicates:
            0 {(VALUE._col0 = '20190408')}
            1 
          keys:
            0 id (type: string)
            1 id (type: string)
          outputColumnNames: _col0, _col1
          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
          File Output Operator
            compressed: true
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
            table:
                input format: org.apache.hadoop.mapred.TextInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink
结论:t1表会扫全表
=========================test3=====================================
explain select t1.*
from 
(
select * from tmp_yinfei_test_01 where statis_date='20190408'
) t1 
left join 
(
select * from tmp_yinfei_test_02 where statis_date='20190408'
) t2 
on t1.id=t2.id
;
STAGE DEPENDENCIES:
  Stage-1 is a root stage
  Stage-0 depends on stages: Stage-1

STAGE PLANS:
  Stage: Stage-1
    Map Reduce
      Map Operator Tree:
          TableScan
            alias: tmp_yinfei_test_01
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
            Filter Operator
              predicate: (statis_date = '20190408') (type: boolean)
              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
              Select Operator
                expressions: id (type: string), '20190408' (type: string)
                outputColumnNames: _col0, _col1
                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                Reduce Output Operator
                  key expressions: _col0 (type: string)
                  sort order: +
                  Map-reduce partition columns: _col0 (type: string)
                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                  value expressions: _col1 (type: string)
          TableScan
            alias: tmp_yinfei_test_02
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
            Filter Operator
              predicate: (statis_date = '20190408') (type: boolean)
              Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
              Select Operator
                expressions: id (type: string)
                outputColumnNames: _col0
                Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
                Reduce Output Operator
                  key expressions: _col0 (type: string)
                  sort order: +
                  Map-reduce partition columns: _col0 (type: string)
                  Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
      Reduce Operator Tree:
        Join Operator
          condition map:
               Left Outer Join0 to 1
          keys:
            0 _col0 (type: string)
            1 _col0 (type: string)
          outputColumnNames: _col0, _col1
          Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
          File Output Operator
            compressed: true
            Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE
            table:
                input format: org.apache.hadoop.mapred.TextInputFormat
                output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe

  Stage: Stage-0
    Fetch Operator
      limit: -1
      Processor Tree:
        ListSink
原文地址:https://www.cnblogs.com/yin-fei/p/10751608.html