Hive SQL Intro With Examples

1.常用函数

    from_unixtime(time_stamp, 'yyyy-MM-dd hh:mm:ss')
    unix_timestamp(time_string)
    to_date(time_string)
    datediff(end_date,start_date)
    date_add(string start_date,int days)
    date_sub(string start_date,int days)
    if(condition, true_result, false_result)
    substr(field, start, length)
    get_json_object(json_string, '$.key_name')
    map_field('key') e.g. people_info: {"":"","":""}  people_info('phone_brand')
    coalesce(expression_1, expression_2,...) 常和full join连用，用以合并两个表两列的非空值,如果expression_1为null，则返回expression_2，以此类推

2.示例

有user_info，user_refund，user_trade三张表，分别是用户信息、用户退款、和用户交易表。用户信息表中包含用户性别、所在城市、用户名、以及保存其它信息的两个json字段（分别为map格式与string）。用户交易表包含用户名、交易额、交易时间。用户退款表包含用户名、退款金额、退款日期。

对函数的应用

例1

用户Alice 2018年的平均支付金额以及2018年最大支付日期和最小支付日期的间隔

select avg(pay_amount) as avg_amount,
           datediff(
                max(
                    from_unixtime(pay_time,'yyyy-MM-dd')
                    ),
                min(
                    from_unixtime(pay_time,'yyyy-MM-dd')
                )
           ) 
    from user_trade
    where user_name = 'Alice'
          and year(dt) = '2018'

例2

2018年购买的商品品类在两个以上的用户数

select count(temp.user_name)
    from (
        select user_name, count(distinct goods_category) as num
        from user_trade
        where year(dt) = '2018'
        group by user_name 
        having count(distinct goods_category) > 2
    )  temp

例3

激活时间在2018年，年龄段20-30岁和30-40岁的用户婚姻状况分布

select     temp.age_type,
            if(temp.marriage_status=1, '已婚', '未婚'),
            count(distinct temp.user_id)
    from
    (
        select case when age < 20 then '20岁以下'  
               case when age >=20 and age < 30 then '20-30岁' 
               case when age >=30 and age < 40 then '30-40岁'
               else '40岁以上' end as age_type,
               get_json_object(extra1, '$.marriage_status') as marriage_status,
               user_id
            from user_info
            where to_date(firstactivetime) between '2018-01-01' and '2018-12-31'
    ) temp

连表查询示例

例1

找出在表1不在表2的用户

--hive sql in不能嵌套子查询
select a.user_id, a.user_name
from user_list_1 a 
left join user_list_2 b on a.user_id = b.user_id
where b.user_id is null

例2

追加表2信息到表1

--字段名和字段顺序必须全都一致
--union 连接时会将前后去重且排序，但速度更慢，建议使用union all在内部去哄
select     user_id,
        user_name,
from user_list_1
union all
select user_id,
     user_name,
from user_list_2

例3

2019年每个用户的支付和退款金额汇总

    --解法1，使用union all
    select     temp.user_name,
            sum(temp.pay_amount),
            sum(temp.refund_amount)
    from
    (
    select    user_name, 
            sum(pay_amount),
            0 as refund_amount -- union all需要列名一致，顺序统一，所以以0占位，不影响求和的计算结果
    from user_trade
    where year(dt) = 2019
    group by user_name
    union all
    select     user_name,
            0 as pay_amount,
            sum(refund_amount)
    from user_refund
    where year(dt) = 2019
    group by user_name
    ) temp
    group by temp.user_name
    
    --解法2，上述问题以full join实现，但是较之union all效率要差很多
    --用户可能只在一张表中出现，用coalese可以确保呈现出所有用户的数据
    select     coalesce(a.user_name, b.user_name),
            if(a.pay_amount is null, 0, a.pay_amount),
            if(b.refund_amoutn is null, 0, b.refund_amount)
    from
    (
    select    user_name, 
            sum(pay_amount) as pay_amount
    from user_trade
    where year(dt) = 2019
    group by user_name
    ) a full join
    (
    select     user_name,
            sum(refund_amount) as refund_amount
    from user_refund
    where year(dt) = 2019
    group by user_name
    ) b on a.user_name = b.user_name

例4

首次激活时间在2017年，但是一直没有支付的用户年龄段分布

    select    a.age_level,
            count(a.user_name)
    from
    (
    select     user_name,
            case when age < 20 then '20岁以下'
                 when age >=20 and age<30 then '20-30岁'
                 when age >=30 and age<40 then '30-40岁'
                 else '40岁以上' end as age_level
    from user_info
    where year(firstactivetime) = 2017
    ) a
    left join
    (
    select distinct user_name
    from user_trade
    where year(dt) >= 2017
    ) b on a.user_name = b.user_name
    where b.user_name is null
    group by a.age_level

例5

2018年、2019年交易的用户，其激活时间的分布

    select hour(firstactivetime),
            count(a.user_name)
    (
    select distinct user_name
    from user_trade
    where year(dt) between 2018 and 2019
    ) a
    left join
    (
    select  user_name, 
            firstactivetime
    from user_info
    ) b on a.user_name = b.user_name
    group by hour(firstactivetime)

窗口函数

例1

2018年、2019年每个月的支付总额和截至当月的本年度累计支付总额

    select  a.year,
            a.month,
            a.pay_amount,
            sum(a.pay_amount) over(partition by a.year order by a.month) 
    from
    (
    select year(dt) year
           month(dt) month,
           sum(pay_amount) as pay_amount    
    from user_trade
    where year(dt) in (2018, 2019)
    group by year(dt)
             month(dt) 
    ) a

例2

2018年每个月的近三个月移动平均支付金额

    select  a.month,
            a.pay_amount,
            --x following表示到当前行下面几行
            avg(a.pay_amount) over(order by a.month rows between 2 preceding and current row)
    from
    (
    select month(dt),
           sum(pay_amount) as pay_amount    
    from user_trade
    where year(dt) = 2018
    group by month(dt) 
    ) a

例3

2019年1月，用户购买商品品类数量排名

    select  user_name,
            count(distinct goods_category),
            row_number() over(order by count(distinct goods_category))
    from user_trade
    where substr(dt,1,7) = '2019-01'
    group by user_name

例4

选出2019年支付金额排名在第10，20，30名的用户

    select  a.user_name
            a.rank_num
    from 
    (
    select    user_name,
            sum(pay_amount),
            rank() over(order by sum(pay_amount) desc) as rank_num
    from user_trade
    where year(dt) = 2019
    group by user_name
    ) a
    where a.rank_num in (10,20,30)

例5

将2019年1月的支付用户，按照支付金额分成5组

    --ntile(n) over()
    select     user_name,
            sum(pay_amount) pay_amount,
            ntile(5) over(order by sum(pay_amount) desc) as level 
    from user_trade
    where substr(dt,1,7) = '2019-01'
    group by user_name

例6

偏移分析窗口函数 lag/lead(expression, offset, default)
lag/lead指的是当前行处于lag/lead的状态，即前者是向前偏移，后者是向后推

支付时间间隔超过100天的用户数

    select count(distinct user_name)
    from
    (
    select user_name,
            dt,
            lead(dt) over(partition by user_name order by dt) next_dt
    from user_trade 
    where dt>'0'
    ) a
    where a.lead_dt is not null and datediff(a.next_dt, a.dt) > 100

例7

每个城市、不同性别、2018年支付金额最高的TOP3用户

--4.筛选分组排序的结果    
    select uit.user_name,
           uit.city,
           uit.sex,
           uit.pay_amount,
           uit,ranking
    (
    select    ut.user_name,
            ui.city,
            ui.sex,
            ut.pay_amount,
            --3.有了支付信息和用户信息就开始分组排序
            row_number() over(partition by ui.city, ui.sex order by ut.pay_amount desc) as ranking
    from
    (
    --21取出2018年每个人的支付总额
    select user_name,
           sum(pay_amount) pay_amount
    from user_trade
    where year(dt) = '2018'
    group by user_name
    ) ut
    --2.既然要按照城市和性别分组，就需要与用户信息连接
    left join user_info ui on ut.user_name = ui.user_name
    ) uit     

    where uit.ranking < 4

例8

每个手机品牌退款金额前25%的用户

select * 
    from
        (
            select     ur.user_name,
                    ui.extra2('phonebrand') as phonebrand,
                    ur.refund_amount,
                    ntile(4) over(partition by ui.extra('phonebrand') order by ur.refund_amount) as tile
        from
        (
            select user_name,
                    sum(refund_amount) as refund_amount
            from user_refund
            where dt > '0'
            group by user_name
        ) ur left join user_info ui 
        on ur.user_name = ui.user_name
        ) uri
    where uri.tile = 1

3.建议

查询的筛选条件中必须包含分区字段，因为Hive中表的数据量很大，通常会以某个字段为基准进行分区，避免每次要筛选整张表。
ORDER BY在select之后执行，所以必须使用重命名后的列名（如果有的话）。执行顺序：FROM - WHERE - GROUP BY - HAVING - SELECT - ORDER BY
聚合函数不可嵌套
先去重，再做表连接，尽量缩小数据集，好的sql语句不在意长短，而在于执行效率