Hive常用函数 傻瓜学习笔记 附完整示例

  • 创建表

drop table if exists mydatabase.test;
create table mydatabase.test
    (id int, name string, timestring string, salary double, bonus double)
    row format delimited
    fields terminated by ' '
    stored as textfile;

  • 插入数据

方式1
vim test.txt
hadoop fs -mkdir test
hadoop fs -put test.txt /user/myname/test
load data inpath '/user/myname/test'
    overwrite into table mydatabase.test;
方式2
insert into mydatabase.test values
    (1,' J ','2018-01-08 10:11:32',128.54,-45.23),
    (2,' J ','2018-02-09 10:51:12',128.54,-78.25),
    (3,' J ','2018-03-05 11:22:21',128.52,null),
    (4,' J ','2018-04-08 15:40:51',256.23,345.23),
    (5,' J ','2018-05-08 10:21:21',128.54,267.12),
    (6,' J ','2018-06-08 10:00:50',256.27,-78.49),
    (7,'Rose','2018-01-08 10:11:32',512.65,-76.44),
    (8,'Rose','2018-02-09 10:51:12',512.54,-45.30),
    (9,'Rose','2018-03-05 11:22:21',512.13,-87.09),
    (10,'Rose','2018-04-08 15:40:51',512.34,19.12),
    (11,'Dickson','2018-01-08 10:21:21',256.87,null),
    (12,'Dickson','2018-02-08 10:00:50',256.52,null),
    (13,'Dickson','2018-04-08 11:00:00',256.12,3.69);

  • 数学函数

四舍五入
select id, round(salary) from mydatabase.test;
四舍五入,小数保留
select id, round(salary, 1) from mydatabase.test;
向下,向上取整
select id, floor(salary), ceil(salary) from mydatabase.test;
随机数(0~1)
select id, salary*(1+rand()*0.1) from mydatabase.test;
指数,对数,取模
select id, pow(e(), salary), log(e(), salary), pmod(id, 3) from mydatabase.test;
绝对值,最大值,最小值
select id, abs(bonus), greatest(salary, bonus), least(salary, bonus) from mydatabase.test;

  • 类型转换函数

select id, cast(salary as int) from mydatabase.test;

  • 日期函数

当前时间
select id, name, unix_timestamp() from mydatabase.test;
时间戳转换为字符串
select id, name, from_unixtime(unix_timestamp(), 'yyyy-MM-dd hh:mm:ss') from mydatabase.test;
字符串转换为时间戳
select id, name, unix_timestamp('2019-02-13 11:22:33') from mydatabase.test;
字符串转换为时间戳
select id, name, unix_timestamp('20190213 11:22:33', 'yyyyMMdd HH:mm:ss') from mydatabase.test;
时间子元素
select id, name, to_date(timestring), year(timestring), month(timestring), day(timestring), hour(timestring), minute(timestring), second(timestring) from mydatabase.test;

  • 条件函数

IF条件
select id, if(bonus > 0, 'yes', 'no') from mydatabase.test;
NULL判断
select id, isnull(bonus) from mydatabase.test;
NULL条件,第二参数为默认值
select id, nvl(bonus, 0) from mydatabase.test;
非空查找函数
select id, coalesce(bonus, 0, null) from mydatabase.test;
CASE匹配条件
select id, name,
    (case name
    when 'Jack' then 'A'
    when 'Rose' then 'B'
    else 'C'
    end)
    from mydatabase.test;
CASE搜索条件
select id, salary, bonus,
    (case
    when salary > 500 then 'A'
    when salary > 100 and bonus > 0 then 'B'
    else 'C'
    end)
    from mydatabase.test;

  • 聚合函数

去重
select distinct(name) from mydatabase.test;
计数
select count(*) from mydatabase.test;
条件计数
select count(bonus > 0) from mydatabase.test;
求和,求平均,最大,最小,方差
select name, sum(salary), avg(salary), min(salary), max(salary), variance(salary) from mydatabase.test group by name;
生成列表
select name, collect_list(salary) from mydatabase.test group by name;
生成非重列表
select name, collect_set(salary) from mydatabase.test group by name;

  • 字符串函数

长度
select name, length(name) from mydatabase.test;
查找
select name, locate('o', name) from mydatabase.test;
左填充,右填充
select name, lpad(name, 4, '_'), rpad(name, 4, '_') from mydatabase.test;
去除左空格,去除右空格,去除左右空格
select name, ltrim(name), rtrim(name), trim(name) from mydatabase.test;
字符距离
select n1, n2, levenshtein(n1, n2) from
  (select distinct(name) as n1 from mydatabase.test)db0
  join
  (select distinct(name) as n2 from mydatabase.test)db1
  on n1 != n2;
分割
select name, split(timestring, '-'), size(split(timestring, '-')) from mydatabase.test;
分列
select name, timesplit from from mydatabase.test lateral view explode(split(timestring, '-')) s as timesplit;
子字符串
select substr(name, 1, -1), substr(timestring, -8) from mydatabase.test;
替换(注意转义替换可能需要四个斜杆)
select regexp_replace(timestring, '\d+-\d+-\d+', '###') from mydatabase.test;
提取(注意转义替换可能需要四个斜杆)
select regexp_extract(timestring, '\d+', 1) from mydatabase.test;
拼接
select name, concat(year(timestring), '|', month(timestring), '|', cast(salary as string)) from mydatabase.test;
拼接列表
select name,
  concat_ws('|',
  collect_list(cast(salary as string))
  ) from mydatabase.test group by name;
拼接有序列表
select name,
  concat_ws('|',
  sort_array(
  collect_list(cast(salary as string))
  )) from mydatabase.test group by name;
拼接有序列表并去除排序因子(注意转义替换可能需要两个/四个斜杆)
select regexp_replace(
  concat_ws('|',
  sort_array(
  collect_list(
  concat(lpad(cast(rank as string), 3, '0'), ':', salary)
  ))), '\d+:', '') as lst
  from (select name, salary, row_number() over(order by salary) as rank from mydatabase.test) db0;

  • 生成函数

EXPLODE
select id, part from mydatabase.test lateral view explode(split(timestring,' ')) t as part;

  • 选择函数

IN
select id, name from mydatabase.test where name in('Dickson', 'Rose');

  • 分组函数1

ROW_NUMBER
select id, name, salary, row_number() over(partition by name order by salary desc) rank from mydatabase.test;
RANK
select id, name, salary, rank() over(partition by name order by salary desc) rank from mydatabase.test;
DENSE_RANK
select id, name, salary, dense_rank() over(partition by name order by salary desc) rank from mydatabase.test;
SUM
select_id, name, sum(salary) over(partition by name order by timestring asc) sum_salary from mydatabase.test;
BEFORE
select id, name, salary, lag(salary, 1) over(partition by name order by timestring asc) before_salary from mydatabase.test;
AFTER
select id, name, salary, lead(salary, 1) over(partition by name order by timestring asc) after_salary from mydatabase.test;

  • 分组函数2

GROUPING SETS
select month, day, sum(salary) from
    (select month(timestring) month, day(timestring) day, salary from mydatabase.test) db
    group by month, day grouping sets(month, (month, day)) order by month asc, day asc;
CUBE
select month, day, sum(salary) from
    (select month(timestring) month, day(timestring) day, salary from mydatabase.test) db
    group by month, day with cube order by month asc, day asc;
ROLLUP
select month, day, sum(salary) from
    (select month(timestring) month, day(timestring) day, salary from mydatabase.test) db
    group by month, day with rollup order by month asc, day asc;

参考文献:

https://www.cnblogs.com/MOBIN/p/5618747.html#7

https://blog.csdn.net/zhanaolu4821/article/details/81871041

https://baijiahao.baidu.com/s?id=1613382585734336695&wfr=spider&for=pc

https://blog.csdn.net/guodong2k/article/details/79459282

https://www.cnblogs.com/zhaohz/p/4672943.html

https://www.cnblogs.com/Allen-rg/p/9268627.html

原文地址:https://www.cnblogs.com/jhc888007/p/11085012.html