Hive sql

GROUP BY 语句
    合计函数 (比如 SUM) 常常需要添加 GROUP BY 语句。
    GROUP BY 语句用于结合合计函数,根据一个或多个列对结果集进行分组。


HAVING 子句
    在 SQL 中增加 HAVING 子句原因是,WHERE 关键字无法与合计函数一起使用。
WHERE子句:在分组之前使用,表示从所有数据中筛选出部分数据,已完成分组的要求,在WHERE子句中不允许使用统计函数,没有GROUP BY子句也可以使用。

HAVING子句:实在分组之后使用的,表示对分组统计后的数据执行再次过滤,可以使用统计函数,有GROUP BY子句之后才可以出现HAVING子句。
HAVING 使用统计函数后,select 也必须 筛选这个统计函数。

SELECT * FROM table GROUP BY x HAVING * 
desc function extended from_unixtime; 

日期函数

from_unixtime 
    将时间戳变为指定格式的日期
    

unix_timestamp
    将时间转换为时间戳
    
    
datediff(d1,d2) 
    计算日期间隔 d1-d2
    
    
to_date
    将长日期转化为短日期
    将年月日 时分秒 转换为 去掉时分秒只有年月日的格式。
    
    
year(dt)

    dt的格式为 yyyy-MM-dd
    另一种方法 dt BETWEEN '' AND '',很明显,直接用year更简单。
    
date_add


date_sub



条件函数

CASE WHEN
    语法:CASE WHEN 'x1' THEN 'y1'
               WHEN 'x2' THEN 'y2'
               ELSE 'y3'
           END
           
    
    SELECT CASE WHEN age<20 THEN '20岁以下'
    WHEN age BETWEEN 20 AND 30 THEN '20-30岁'
    ELSE '其它岁数' END,count(distinct user_id)
    FROM user_info
    GROUP BY CASE WHEN age<20 THEN '20岁以下'
    WHEN age BETWEEN 20 AND 30 THEN '20-30岁'
    ELSE '其它岁数' END ;

    这个例子说明了CASE WHEN 可以作为group by的条件。作为group by的参数,实际上是将某一列的数据进一步细分了。
    
    
    
if 
    语法: if(条件,条件True的值,条件False的值)
    
    SELECT sex,if(level>5,'高级','低级'),count(user_id)
    FROM user_info
    GROUP BY sex,if(level>5,'高级','低级');
    
    这里的应用也是作为group by的一个条件。level本身就是数据的某一列。
    
    
    
    
    
字符串函数
    
    substr 字符串截取
    
    
    
    
    
特殊类型处理

    extra1  string,
    extra2  map<string,string>)
        从数据库中读出来都是字典样式,但是取值的方法却由于类型的不同而不同。
        
        extra1     
            get_json_object(extra1,'$.phonebrand')
                注:get_json_object(需要解析的json字段,用$.key取出想要获取的value)
        extra2 
            extra2['phonebrand']

练手的HQL

SELECT a.user_name FROM (SELECT DISTINCT user_name FROM user_trade WHERE year(dt)=2017)a 
    INNER JOIN (SELECT DISTINCT user_name FROM user_trade WHERE year(dt)=2018)b
    ON a.user_name=b.user_name;
    1 使用内置函数,对分区字段取值。
    2 表需要重命名
    3 一个等号
    
SELECT a.user_name FROM (SELECT user_name FROM trade_2017)a
    INNER JOIN (SELECT user_name FROM trade_2018)b ON a.user_name=b.user_name
    INNER JOIN (SELECT user_name FROM trade_2019)c ON b.user_name=c.user_name;
    1 多张子表关联,注意格式。

SELECT b.name FROM (SELECT distinct user_name from list1)a left join 
(SELECT distinct user_name from list2)b on a.user_name=b.name where b.name is null;
    1 判断是否为null,用is,不是=


SELECT b. education,count(b.user_name) 
FROM (SELECT DISTINCT user_name FROM user_trade WHERE year(dt)=2019)a 
INNER JOIN (SELECT DISTINCT user_name,extra2[‘education’] as education FROM user_info) b 
ON a.user_name=b.user_name 
GROUP BY b.education;


SELECT c.user_name FROM (SELECT DISTINCT user_name FROM trade_2017)a
INNER JOIN (SELECT DISTINCT user_name FROM trade_2018)b 
    ON a.user_name=b.user_name
    LEFT JOIN (SELECT DISTINCT user_name FROM trade_2019)c
    ON b.user_name=c.user_name 
WHERE c.user_name is null;
    


UNION ALL
    不去重不排序
    1 需要对表重命名。
    2 需要筛选后,才能UNION

    SELECT DISTINCT c.user_name,COUNT(c.user_name)
FROM (SELECT DISTINCT user_name FROM trade_2017
UNION ALL
SELECT DISTINCT user_name FROM trade_2018)c;
UNION 
    去重排序





SELECT a.user_name,IF(a.sum_pay is null,0,a,sum_pay),if(b.sum_refund is null,0,b.sum_refund)
FROM 
 (SELECT user_name,SUM(pay_amount) as sum_pay FROM user_trade WHERE year(dt)=2017 GROUP BY user_name)a 
FULL JOIN 
(SELECT user_name,SUM(refund_amount) as sum_refund FROM user_refund WHERE year(dt)=2017 GROUP BY user_name)b
ON a.user_name=b.user_name
GROUP BY a.user_name 





SELECT a.user_name,a.sum_pay,if(b.sum_refund is null,0,b.sum_refund)
FROM
(SELECT user_name,SUM(pay_amount) as sum_pay FROM user_trade WHERE year(dt)=2017 GROUP BY user_name)a
LEFT JOIN
(SELECT user_name,SUM(refund_amount) as sum_refund FROM user_refund WHERE year(dt)=2017 GROUP BY user_name)b
ON a.user_name=b.user_name



SELECT     CASE
        WHEN age<20 THEN ‘青年’
        WHEN age BETWEEN 20 AND 60 THEN ‘中年’
        ELSE ‘老年’
    END,COUNT(a,user_name)
FROM 
(SELECT DISTINCT user_name,age FROM user_info WHERE year(firstactivetime)=2017) a
JOIN
(SELECT DISTINCT user_name,pay_amount FROM user_trade) b
ON a.user_name=b.user_name 
WHERE b.pay_amount is 0
GROUP BY 
    CASE
        WHEN age<20 THEN ‘青年’
        WHEN age BETWEEN 20 AND 60 THEN ‘中年’
        ELSE ‘老年’
    END
备注:CASE END 也可以作为一个变量名


SELECT     a.age_level,COUNT(a,user_name)
FROM 
(SELECT DISTINCT user_name, 
CASE
        WHEN age<20 THEN ‘青年’
        WHEN age BETWEEN 20 AND 60 THEN ‘中年’
        ELSE ‘老年’
    END AS age_level                          -->  学完Tableau之后,这个就太容易理解了
 FROM user_info WHERE year(firstactivetime)=2017) a
JOIN
(SELECT DISTINCT user_name,pay_amount FROM user_trade) b
ON a.user_name=b.user_name 
WHERE b.pay_amount is 0
GROUP BY a.age_level;



SELECT hour(firstactivetime),COUNT(DISTINCT a.user_name)
FROM 
(SELECT DISTINCT user_name FROM trade_2017 
UNION ALL
SELECT DISTINCT user_name FROM trade_2018) a
LEFT JOIN
(SELECT DISTINCT user_name,firstactivetime FROM user_info)b
ON a.user_name=b.user_name 
GROUP BY hour(firstactivetime);


SELECT b.sex,count(b.user_name) FROM
(SELECT user_name FROM user_trade WHERE year(dt)=2019
UNION
SELECT user_name FROM user_refund WHERE year(dt)=2019)a
JOIN
(SELECT distinct user_name,sex FROM user_info)b ON a.user_name=b.user_name
GROUP BY b.sex;



SELECT d.city,count(d.user_name)
FROM
(SELECT a.user_name FROM
(SELECT distinct user_name FROM user_trade WHERE year(dt)=2018)a
LEFT JOIN
(SELECT distinct user_name,pay_amount FROM user_trade WHERE year(dt)=2019)b
ON a.user_name=b.user_name WHERE b.pay_amount is null)c
JOIN (SELECT distinct user_name,city FROM user_info)d
ON c.user_name=d.user_name GROUP BY d.city;




SELECT d.phone,count(d.user_name)
FROM
(SELECT a.user_name FROM
(SELECT distinct user_name FROM trade_2017
UNION ALL
SELECT distinct user_name FROM trade_2018
UNION ALL
SELECT distinct user_name FROM trade_2019)a
LEFT JOIN
(SELECT distinct user_name,refund_amount FROM user_refund WHERE dt>'0')b
ON a.user_name=b.user_name where b.refund_amount is null)c
JOIN
(SELECT distinct user_name,extra2['phonebrand'] as phone FROM user_info)d
on c.user_name=d.user_name
GROUP BY d.phone;

 
原文地址:https://www.cnblogs.com/654321cc/p/12175562.html