2、函数

时间函数

当前日期

select current_date();
 2018-11-14

当前时间戳

select current_timestamp();
2018-11-14 21:35:16.237

date_format()

select date_format(current_date(),'yyyyMMdd');
20181114
select date_format(current_timestamp(),'yyyyMMdd');
20181114

unix_timestamp()

select unix_timestamp();
+-----------------------------------------------------------+--+
| unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss)  |
+-----------------------------------------------------------+--+
| 1542202845                                                |
+-----------------------------------------------------------+--+

from_unixtime()

select from_unixtime(unix_timestamp(),'yyyyMMdd HH:mm:ss');
+---------------------------------------------------------------------------------------------+--+
| from_unixtime(unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss), yyyyMMdd HH:mm:ss)  |
+---------------------------------------------------------------------------------------------+--+
| 20181114 21:43:14                                                                           |
+---------------------------------------------------------------------------------------------+--+

日期差值

0: jdbc:hive2://s101:10000/lx> select datediff('2019-08-02','2019-08-06');
+---------------------------------------------------------------+--+
| datediff(CAST(2019-08-02 AS DATE), CAST(2019-08-06 AS DATE))  |
+---------------------------------------------------------------+--+
| -4                                                            |
+---------------------------------------------------------------+--+

字符串函数

split

select explode(split('hello',''));

substr

select substr('hello',1,3);

trim去除前后空格

select trim(' hello ');

format_number

select format_number(1234.345,1);

concat

length

条件语句

select if(w1 >2 ,w1,w2) from www;
条件为true时返回w1，为false时返回w2

case when then

select case when w1 > 2 then w1 when w1 <= 2 then w2 end from www;

窗口函数

lead

lead(input[, offset[, default]]) 上提
    input：     上提列
    offset： 上提行数，可选，默认是 1 行
    default：填充值，可选，默认是null
    使用：select id,name,lead(id,2,'qq')over(partition by id order by id) lad from www;
    注意：2可选，不写默认1
          'qq'可选，不写默认null；填充值的类型需要与列类型对应，此处id是int而'qq'是字符串，所以填充结果仍为null
          partition by id：可选，分组后对每个组进行lead
          order by id：必须写

lag

lag(input[, offset[, default]]) 下拉
    input：     下拉列
    offset： 下拉行数，可选，默认是 1 行
    default：填充值，可选，默认是null
    使用：select id,name,lag(id,2,11)over(partition by id order by id) lag from www;
    注意：2可选，不写默认1
          11可选，不写默认null
          partition by id：可选，分组后对每个组进行lag
          order by id：必须写

first_value

first_value(expr[, isIgnoreNull])
    expr：列名或一个表达式
    isIgnoreNull：true或false，如果是true将跳过null值，可选，默认false
    select id,name,first_value(concat(cast(id as string), name),true)over(partition by name order by id) lag from www;

last_value

select id,name,last_value(concat(cast(id as string), name),true)over(order by id ) lag from www;
+-----+-------+--------+--+
| id  | name  |  lag   |
+-----+-------+--------+--+
| 1   | a     | 1a     |
| 2   | b     | 2b     |
| 3   | c     | 3c     |
| 4   | c     | 4c     |
| 5   | c     | 5c     |
| 6   | d     | 6d     |
| 7   | b     | 7b     |
| 8   | a     | 8a     |
| 9   | a     | 9a     |
| 12  | eee   | 12eee  |
+-----+-------+--------+--+
select id,name from www;
+-----+-------+--+
| id  | name  |
+-----+-------+--+
| 12  | eee   |
| 1   | a     |
| 2   | b     |
| 3   | c     |
| 4   | c     |
| 5   | c     |
| 6   | d     |
| 7   | b     |
| 8   | a     |
| 9   | a     |
+-----+-------+--+
select id,name,last_value(concat(cast(id as string), name),true)over() lag from www;
+-----+-------+------+--+
| id  | name  | lag  |
+-----+-------+------+--+
| 12  | eee   | 6d   |
| 7   | b     | 6d   |
| 8   | a     | 6d   |
| 9   | a     | 6d   |
| 1   | a     | 6d   |
| 2   | b     | 6d   |
| 3   | c     | 6d   |
| 4   | c     | 6d   |
| 5   | c     | 6d   |
| 6   | d     | 6d   |
+-----+-------+------+--+

over和标准聚合函数

select distinct name,count(name)over(partition by name) s from www;
求每个分区内的行数并去重
select distinct name,sum(id)over(partition by name) s from www;
求每个分区内id之和并去重
select id,name,max(length(name))over() from www;
总体的最大长度
select name,min(id)over(partition by name) s from www;
每个分区最小id
select name,max(id)over(partition by name) s from www;
每个分区最大id
select name,avg(id)over(partition by name) s from www;
每个分区平均id

over和partition by

over和partition by order by

select first_value(id)over(partition by id,name) from www;
select first_value(id)over(partition by id,name order by id,name) from www;

以行限定窗口范围

select * ,sum(grade)over(order by grade desc rows between current row and 1 following) from sg;

select * ,sum(grade)over(order by grade desc rows between current row and unbounded following) from sg;

select * ,sum(grade)over(order by grade desc rows between unbounded preceding and current row) from sg;

select * ,sum(grade)over(order by grade desc rows between unbounded preceding and unbounded following) from sg;

以值限定窗口范围

select * ,sum(grade)over(order by grade desc range between unbounded preceding and current row) from sg;

排名函数

rank()：并列时名次相同，之后的名次跳跃（如 1,1,3）

dense_rank()：并列连续，不跳跃

row_number()：连续且唯一，并列时也不重复（如 1,2,3）

Assigns a unique, sequential number to each row,
starting with one,
according to the ordering of rows within the window partition

//商家内用户访问次数倒序排列，取前三个
select * from (
    select * ,row_number()over(partition by id order by count desc)b from (
        select id,uu,count(*) as count from shangjia where uu is not null group by id,uu)a)c 
where b <= 3;

cume_dist（）

select cume_dist()over(order by mid ) from t1;
//小于等于当前值的行数/分组内总行数    
select cume_dist()over(order by mid desc) from t1;
//大于等于当前值的行数/分组内总行数

percent_rank

ntile（n）：每个分区按一定顺序分成n份

高级聚合函数

grouping sets

select *,count(1),grouping_id() from pv group by t1,t2,s3 grouping sets(t1,t2,s3);

grouping__id    //分组的组号（可选）
grouping sets(t1,t2,s3,()) //相当于分别对t1,t2,s3,null进行分组并用union all连接

group by ... with cube

select *,count(1),grouping_id() from pv group by t1,t2,s3 with cube order by grouping_id();
共八种分组组合（2^3）

rollup

select *,count(1),grouping_id() from pv group by t1,t2,s3 with rollup order by grouping_id();
null、t1、t1和t2、t1和t2和s3

排序函数

order by age　　

全排序，只有一个reduce；需要加limit，map阶段在每个分区中取出前n个元素，再交给reduce处理

sort by age

部分排序

distribute by age

哈希分区

cluster by

distribute by + sort by

select age from user_order distribute by age sort by age;

渐变 --> 突变