Hive Functions

https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF

The focus here is string operations, so an empty helper table is enough:

create table dual(id string);

Load a file dual.dat that contains only a single space:

load data local inpath '/home/hadoop/study/study2/dual.dat' overwrite into table dual;

1: jdbc:hive2://localhost:10000> select 2*33 from dual;
+------+--+
| _c0  |
+------+--+
| 66   |
+------+--+
SHOW FUNCTIONS;
DESCRIBE FUNCTION <function_name>;
DESCRIBE FUNCTION EXTENDED <function_name>;  -- shows extended information about the function
1: jdbc:hive2://localhost:10000> select 22=23 from dual;
+--------+--+
|  _c0   |
+--------+--+
| false  |
+--------+--+
1 row selected (0.717 seconds)
1: jdbc:hive2://localhost:10000> select 22!=23 from dual;
+-------+--+
|  _c0  |
+-------+--+
| true  |
+-------+--+
0: jdbc:hive2://localhost:10000> select 'aaac' rlike  'a'  from dual; 
+-------+--+
|  _c0  |
+-------+--+
| true  |
+-------+--+
1 row selected (1.138 seconds)
0: jdbc:hive2://localhost:10000> select 'aaac' rlike  'b'  from dual; 
+--------+--+
|  _c0   |
+--------+--+
| false  |
+--------+--+

0: jdbc:hive2://localhost:10000> select 'aaac' rlike null from dual;
+-------+--+
|  _c0  |
+-------+--+
| NULL  |
+-------+--+
1 row selected (0.276 seconds)
0: jdbc:hive2://localhost:10000> select null rlike 'ss' from dual;
+-------+--+
|  _c0  |
+-------+--+
| NULL  |
+-------+--+


NULL if A or B is NULL; TRUE if any (possibly empty) substring of A matches the Java regular expression B; otherwise FALSE. For example, 'foobar' RLIKE 'foo' evaluates to TRUE, and so does 'foobar' RLIKE '^f.*r$'.
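The RLIKE behavior above (substring matching plus NULL propagation) can be sanity-checked locally. A minimal Python sketch, using None for NULL and Python's re module in place of Java regexes (they differ in some corner cases):

```python
import re

def rlike(a, b):
    """Mimic Hive's RLIKE: None (NULL) if either side is NULL,
    otherwise True when any substring of a matches the regex b."""
    if a is None or b is None:
        return None
    return re.search(b, a) is not None

print(rlike('aaac', 'a'))         # True
print(rlike('aaac', 'b'))         # False
print(rlike(None, 'ss'))          # None
print(rlike('foobar', '^f.*r$'))  # True
```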
0: jdbc:hive2://localhost:10000> select concat('hello',' kitty') from dual; 
+--------------+--+
|     _c0      |
+--------------+--+
| hello kitty  |
+--------------+--+

0: jdbc:hive2://localhost:10000> select round(2.3333) from dual;
+------+--+
| _c0  |
+------+--+
| 2.0  |
+------+--+

0: jdbc:hive2://localhost:10000> select negative(2.3333) from dual;
+----------+--+
|   _c0    |
+----------+--+
| -2.3333  |
+----------+--+

0: jdbc:hive2://localhost:10000> select ascii('A')  from dual; 
+------+--+
| _c0  |
+------+--+
| 65   |
+------+--+
0: jdbc:hive2://localhost:10000> select find_in_set('ab','abc,s,ab,c,cf')from dual; 
+------+--+
| _c0  |
+------+--+
| 3    |
+------+--+
Returns the position of the first occurrence of str in strList, where strList is a comma-delimited string. Returns NULL if either argument is NULL, and 0 if the first argument contains any comma. For example, find_in_set('ab', 'abc,b,ab,c,def') returns 3.
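The same semantics can be sketched in Python for a quick local check (None stands in for NULL):

```python
def find_in_set(s, str_list):
    """Mimic Hive's find_in_set: 1-based position of s in the
    comma-delimited str_list; 0 if absent or if s contains a comma;
    None (NULL) if either argument is NULL."""
    if s is None or str_list is None:
        return None
    if ',' in s:
        return 0
    parts = str_list.split(',')
    return parts.index(s) + 1 if s in parts else 0

print(find_in_set('ab', 'abc,s,ab,c,cf'))   # 3
print(find_in_set('ab', 'abc,b,ab,c,def'))  # 3
print(find_in_set('zz', 'abc,b,ab'))        # 0
```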
0: jdbc:hive2://localhost:10000> select cast(23.098 as int) from dual; 
+------+--+
| _c0  |
+------+--+
| 23   |
+------+--+

0: jdbc:hive2://localhost:10000> select if(2>1,'yes','no') from dual;
+------+--+
| _c0  |
+------+--+
| yes  |
+------+--+

0: jdbc:hive2://localhost:10000> select get_json_object('{"id":"1","name":"oo"}','$.name');
+------+--+
| _c0  |
+------+--+
| oo   |
+------+--+

Note that the first argument is a JSON string, and the second, $.name, is a JSONPath expression that selects a value from the JSON object. The first argument can also be set in advance as a variable.

For example:

set hivevar:msg={ "message":"2015/12/08 09:14:4", "client": "10.108.24.253", "server": "passport.suning.com", "request": "POST /ids/needVerifyCode HTTP/1.1", "server": "passport.sing.co", "version":"1", "timestamp":"2015-12-08T01:14:43.273Z", "type":"B2C","center":"JSZC", "system":"WAF","clientip":"192.168.61.4", "host":"wafprdweb03", "path":"/usr/local/logs/waf.error.log", "redis":"192.168.24.46"}

select get_json_object('${hivevar:msg}', '$.server');
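The child-access behavior of get_json_object can be mirrored locally with Python's json module. A minimal sketch that supports only the '$.key' and '$.key1.key2' forms (a deliberate simplification of Hive's JSONPath subset):

```python
import json

def get_json_object(json_str, path):
    """Tiny subset of Hive's get_json_object: only '$.key' and
    '$.key1.key2' child access (no arrays, wildcards, etc.)."""
    obj = json.loads(json_str)
    for key in path.lstrip('$').strip('.').split('.'):
        if not isinstance(obj, dict) or key not in obj:
            return None  # Hive returns NULL for missing keys
        obj = obj[key]
    return obj

print(get_json_object('{"id":"1","name":"oo"}', '$.name'))  # oo
```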

0: jdbc:hive2://localhost:10000> select parse_url('http://www.baidu.com/p/wew.jsp?id=2&name=wee','QUERY') from dual limit 1 ;
+----------------+--+
|      _c0       |
+----------------+--+
| id=2&name=wee  |
+----------------+--+
Returns the specified part of the URL. Valid values of partToExtract include HOST, PATH, QUERY, REF, PROTOCOL, AUTHORITY, FILE, and USERINFO. For example, parse_url('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'HOST') returns 'facebook.com'.
Note that parse_url extracts only one partToExtract per call; requesting multiple parts returns NULL.
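The same HOST/PATH/QUERY decomposition can be checked locally with Python's standard urlparse (note that Hive and urlparse may differ on edge cases such as ports and userinfo):

```python
from urllib.parse import urlparse

url = 'http://www.baidu.com/p/wew.jsp?id=2&name=wee'
u = urlparse(url)
print(u.hostname)  # www.baidu.com   (~ HOST)
print(u.path)      # /p/wew.jsp      (~ PATH)
print(u.query)     # id=2&name=wee   (~ QUERY)
```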

0: jdbc:hive2://localhost:10000> select parse_url_tuple('http://www.baidu.com/p/wew.jsp?id=2&name=wee','HOST','QUERY') from dual limit 1 ;
+----------------+----------------+--+
|       c0       |       c1       |
+----------------+----------------+--+
| www.baidu.com  | id=2&name=wee  |
+----------------+----------------+--+
1 row selected (0.56 seconds)

Unlike parse_url, parse_url_tuple can extract multiple parts of the URL in a single call.

0: jdbc:hive2://localhost:10000> select concat_ws('-','2017','05','06') from dual;
+-------------+--+
|     _c0     |
+-------------+--+
| 2017-05-06  |
+-------------+--+
The difference from concat is that concat_ws lets you specify the separator.
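A one-line local analogy: concat simply glues its arguments together, while concat_ws joins them with the separator given as its first argument, much like Python's str.join:

```python
# concat('hello', ' kitty') glues strings directly
print(''.join(['hello', ' kitty']))    # hello kitty

# concat_ws('-', '2017', '05', '06') joins with a separator
print('-'.join(['2017', '05', '06']))  # 2017-05-06
```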
0: jdbc:hive2://localhost:10000> select collect_set(name) from myexternaltable; 
Returns an array of the column's values with duplicates removed.

.......(MapReduce job output omitted)
+------------------------------------------------------------------------------------------------------------------+--+
|                                                       _c0                                                        |
+------------------------------------------------------------------------------------------------------------------+--+
| ["oo","pp","ll","i9i","kkj","ujn","aa","zx","sdfa","4sad","3d3","sadf","gdh","asdf4","asdfsadf","asdf","asddd"]  |
+------------------------------------------------------------------------------------------------------------------+--+
0: jdbc:hive2://localhost:10000> select collect_list(name) from myexternaltable;  -- lists all of the column's values without deduplication
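The collect_list / collect_set distinction can be sketched locally; note Hive does not guarantee element order in collect_set (dict.fromkeys below keeps first-seen order only for readability):

```python
names = ['oo', 'pp', 'oo', 'll', 'pp']

# collect_list: every value, duplicates kept
as_list = list(names)

# collect_set: duplicates removed
as_set = list(dict.fromkeys(names))

print(as_list)  # ['oo', 'pp', 'oo', 'll', 'pp']
print(as_set)   # ['oo', 'pp', 'll']
```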

Window Functions

https://cwiki.apache.org/confluence/display/Hive/LanguageManual+WindowingAndAnalytics


Hive's split function can be combined with a custom UDF to parse complex JSON, including nested structures.

get_json_object can also parse this kind of JSON.

split(string str, string pat)
Splits str around pat (pat is a regular expression).

create table t_rating as
select split(parseJson(line), '\t')[0] as movieid,
       split(parseJson(line), '\t')[1] as rate,
       split(parseJson(line), '\t')[2] as timestring,
       split(parseJson(line), '\t')[3] as uid
from t_json limit 10;
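The splitting step above can be checked locally. The line value below is a hypothetical example of what the custom parseJson UDF might emit (tab-separated fields); re.split is used because Hive's split() also treats its pattern as a regular expression:

```python
import re

# Hypothetical output of the custom parseJson UDF: tab-separated fields
line = '1193\t5\t978300760\t1'
fields = re.split('\t', line)
movieid, rate, timestring, uid = fields
print(fields)  # ['1193', '5', '978300760', '1']
```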
get_json_object
A limited version of JSONPath is supported:
$ : Root object
. : Child operator
[] : Subscript operator for array
* : Wildcard for []
Syntax that is notably not supported:
'' : Zero length string as key
.. : Recursive descent
@ : Current object/element
() : Script expression
?() : Filter (script) expression.
[,] : Union operator
[start:end:step] : array slice operator
Example: src_json is a single-column (json), single-row table:

{"store":
  {"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
   "bicycle":{"price":19.95,"color":"red"}
  },
 "email":"amy@only_for_json_udf_test.net",
 "owner":"amy"
}
The fields of the json object can be extracted using these queries:
hive> SELECT get_json_object(src_json.json, '$.owner') FROM src_json;
amy
 
hive> SELECT get_json_object(src_json.json, '$.store.fruit[0]') FROM src_json;
{"weight":8,"type":"apple"}
 
hive> SELECT get_json_object(src_json.json, '$.non_exist_key') FROM src_json;
NULL
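The three queries above can be mirrored locally with plain Python dict/list access over the same document, including the NULL result for a missing key:

```python
import json

doc = json.loads('''{"store":
  {"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}],
   "bicycle":{"price":19.95,"color":"red"}},
 "email":"amy@only_for_json_udf_test.net",
 "owner":"amy"}''')

print(doc['owner'])              # amy  (~ $.owner)
print(doc['store']['fruit'][0])  # {'weight': 8, 'type': 'apple'}  (~ $.store.fruit[0])
print(doc.get('non_exist_key'))  # None  (~ $.non_exist_key -> NULL)
```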
Built-in Aggregate Functions (UDAF)

transform: invokes an external script to process data.

https://cwiki.apache.org//confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-parse_url_tuple

LATERAL VIEW: used together with a table-generating function (UDTF) to expand one row into multiple rows/columns.
parse_url_tuple: parses a URL.
parse_url_tuple
The parse_url_tuple() UDTF is similar to parse_url(), but can extract multiple parts of a given URL, returning the data in a tuple. Values for a particular key in QUERY can be extracted by appending a colon and the key to the partToExtract argument, for example, parse_url_tuple('http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1', 'QUERY:k1', 'QUERY:k2') returns a tuple with values of 'v1','v2'. This is more efficient than calling parse_url() multiple times. All the input parameters and output column types are string.
SELECT b.*
FROM src LATERAL VIEW parse_url_tuple(fullurl, 'HOST', 'PATH', 'QUERY', 'QUERY:id') b as host, path, query, query_id LIMIT 1;
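The parse_url_tuple output, including the 'QUERY:key' form, can be approximated locally with urlparse plus parse_qs (edge cases such as repeated keys or missing values may differ from Hive):

```python
from urllib.parse import urlparse, parse_qs

url = 'http://facebook.com/path1/p.php?k1=v1&k2=v2#Ref1'
u = urlparse(url)
qs = parse_qs(u.query)
print(u.hostname)   # facebook.com  (~ HOST)
print(u.path)       # /path1/p.php  (~ PATH)
print(u.query)      # k1=v1&k2=v2   (~ QUERY)
print(qs['k1'][0])  # v1            (~ QUERY:k1)
print(qs['k2'][0])  # v2            (~ QUERY:k2)
```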
Original post: https://www.cnblogs.com/rocky-AGE-24/p/6930247.html