-- Hive学习笔记1

-- 1. hive 建立一张表，跟已经存在的结构化的数据文件产生映射关系
-- 映射成功后，就可以通过写sql来分析这个结构化的数据，避免了写mr程序的麻烦。

-- 2.数据库---》默认与/user/hive/warehouse 下的文件夹对应
-- 表 ---》数据库文件夹下面的子文件夹 /user/hive/warehouse/test.db/t_t1
-- 表的数据位置目前不能随便存放，一定要在指定的数据库表文件夹下面
-- 建立表的时候可能还需要指定分隔符，否则有可能映射不成功
-- Simple table: every line of the data file looks like "id,name".
create table t_t1(id int,name string) row format delimited fields terminated by ',';

-- Tables with complex column types: array and map.
-- array sample row: 1 zhangsan beijing,shanghai,guangzhou
create table t_array(id int,name string, work_location array<string>)
row format delimited
fields terminated by ' '
collection items terminated by ',';
-- map sample row: 1,zhangsan,dance:yes-sing:no-shopping:yes
-- The delimiter clauses are separated by whitespace only; a comma between
-- "collection items terminated by" and "map keys terminated by" is a parse error.
create table t_map(id int, name string, hobby map<string,string>)
row format delimited
fields terminated by ','
collection items terminated by '-'
map keys terminated by ':';

-- 3.建表的时候一定要根据结构化数据文件的分隔符类型，指定分隔符
-- 建表的字段格式和字段类型，要根据结构化数据中的个数类型一致
-- 分隔符一般使用内置的来指定，ROW FORMAT DELIMITED 分割字段，还是分割集合等

-- 4.分区表的分区字段不能够在表中已存在
-- 分区字段是一个虚拟的字段，不存放任何数据
-- 分区字段的数据来自于装载分区表数据的时候指定的

-- 分区表的指定，在hdfs上的效果就是在建立表的文件夹下面又创建了子文件夹
-- 这样的目的是把数据的划分更加细致，减少了查询时候全表扫描的成本，只需要按照指定的分区扫描数据并显示结果即可
-- Partitioned tables. `user` is a reserved keyword in Hive 1.2+, so the
-- identifier is quoted with backticks; the table name itself is unchanged.
create table `user`(id int, name string) partitioned by (country string) row format delimited fields terminated by ',';
-- Two-level partitioning: one sub-directory per dt, then one per hour.
create table day_hour_table(id int, content string) partitioned by (dt string, hour string) row format delimited fields terminated by ',';

-- Load data. For a partitioned table the target partition is given with
-- PARTITION (col='value'); the partition value comes from this clause, not from the file.
load data local inpath '/root/hivedata/user.txt' into table `user` partition (country='China');
load data local inpath '/root/hivedata/map.txt' into table t_map;

-- 5. Bucketed tables: turn on bucketing enforcement before creating/filling one.
-- NOTE(review): hive.enforce.bucketing is reported as removed in Hive 2.x
-- (always on) - confirm against the target Hive version.
set hive.enforce.bucketing=true;
set mapreduce.job.reduces=4;
-- The bucketing column must be an existing column of the table, i.e. it names
-- the column the rows are split on.
-- "load data" cannot produce bucketed data: it gives no bucketing effect,
-- because load is essentially Hive running "hadoop fs -put" for us.
create table stu_buck(
    Sno int,
    Sname string,
    Sex string,
    Sage int,
    Sdept string
)
clustered by (Sno)
into 4 buckets
row format delimited
fields terminated by ',';

-- Filling a bucketed table: load the raw file into a staging table (student),
-- then insert from the staging table into the bucketed table.
-- The staging table needs the same field delimiter as the data file; without
-- it Hive falls back to the default ^A separator and the columns do not map.
-- NOTE(review): assumes students.txt is comma-delimited like stu_buck - confirm.
create table student(Sno int,Sname string, Sex string,Sage int,Sdept string)
row format delimited fields terminated by ',';
load data local inpath '/root/hivedata/students.txt' into table student;
-- Bucketed data is written with insert + select: the rows come from a query
-- result (the query runs as an MR job).
-- This corresponds to the partitioner in MR.
-- Default bucketing rule: hash of the cluster-by column modulo the number of
-- buckets (see set mapreduce.job.reduces=?).
-- Bucketing also splits the mapped data files into finer parts, but it is
-- mostly used to speed up joins: bucket both tables on the join column.
insert overwrite table stu_buck select * from student cluster by(Sno);

--------------------------------------------------------------------------------------------------

-- 6. 内部表、外部表
-- Create a managed (internal) table.
-- A table named student is already created in the bucketing section of this
-- file; IF NOT EXISTS keeps this statement from failing with "table already exists".
create table if not exists student(Sno int,Sname string, Sex string,Sage int,Sdept string) row format delimited fields terminated by ',';
-- Create an external table whose data lives under the given location.
create external table ex_student(Sno int,Sname string, Sex string,Sage int,Sdept string) row format delimited fields terminated by ' ' location '/stu';
-- 加载数据:
-- 不需要load data装载，直接把数据文件上传到外部表所在目录，就可以自动映射
-- external关键字可以让用户创建一个外部表，在建表的同时指定一个指向实际数据的路径(location)
-- hive创建内部表时，会将数据移动到数据仓库指向的路径；若创建外部表，仅记录数据所在的路径，不对数据的位置做任何改变。
-- 在删除表的时候，内部表的元数据和数据会被一起删除，而外部表只删除元数据，不会删除数据。

-- 7. Copying a table
-- LIKE copies only the table definition; no data is copied.
create table t_t2_copy
like t_t2;

-- 8. 修改表

-- Add partitions
-- Add one partition. The location is a single string literal, so the
-- partition value inside the path must not carry its own quotes.
alter table table_name add partition(dt='20200101') location '/usr/hive/warehouse/table_name/dt=20200101';
-- Add several partitions in one statement
alter table table_name add
    partition(dt='2008-08-08', country='us') location '/path/to/us/part080808'
    partition(dt='2008-08-09', country='us') location '/path/to/us/part080809';

-- Drop partitions (the keyword is IF EXISTS, not IF EXIST)
alter table table_name drop if exists partition(dt='2008-08-08');
alter table table_name drop if exists partition(dt='2008-08-08', country='us');

-- Rename a partition (terminated with ';' so it does not run into the next statement)
alter table table_name partition(dt='2008-08-08') rename to partition(dt='2008-08-09');

-- Add columns
-- Syntax: alter table table_name add|replace columns (col_name data_type, ...);
--   ADD     appends the new columns to the existing column list.
--   REPLACE replaces the whole existing column list with the given one.
-- A column is written as "name type" with no comma in between.
alter table table_name add columns (col_name string);
-- Change columns. The statements are written to run in this order:
-- rename a -> a1 keeping type INT
ALTER TABLE test_change CHANGE a a1 INT;
-- column a no longer exists after the previous statement, so the follow-up
-- change starts from a1: rename to a2, retype to string, move it after b
ALTER TABLE test_change CHANGE a1 a2 string AFTER b;
-- rename b -> b1 and move it to the first position
ALTER TABLE test_change CHANGE b b1 INT FIRST;

-- Rename a table
ALTER table table_name RENAME TO new_table_name;

-- 9. Show / describe commands
-- "show schemas;" is an equivalent spelling of "show databases;"
show databases;
show tables;
show partitions table_name;
-- lists all functions supported by the current Hive version
show functions;
desc extended table_name;
desc formatted table_name;
describe database database_name;
---------------------------------------------------------------------------------------------------------------------------------------------------------------
-- DML operations
-- Load
-- Syntax (template, not executable):
--   LOAD DATA [LOCAL] INPATH 'filepath' [OVERWRITE] INTO TABLE table_name [PARTITION (partcol1=val1, partcol2=val2 ...)]
-- Insert
-- In Hive, insert is mainly combined with a select query: the query result is
-- inserted into the table.
insert overwrite table stu_buck select * from student cluster by(Sno);
-- Multi-insert: one FROM clause feeding several target tables
create table source_table (
    id int,
    name string
)
row format delimited
fields terminated by ',';

create table test_insert1 (
    id int
)
row format delimited
fields terminated by ',';

create table test_insert2 (
    name string
)
row format delimited
fields terminated by ',';

from source_table
insert overwrite table test_insert1
    select id
insert overwrite table test_insert2
    select name;
-----------------------------------------------------------------------------------------------------------------------------------------------------------------
-- Dynamic partition insert
-- whether dynamic partitioning is enabled (noted as default false)
set hive.exec.dynamic.partition=true;
-- dynamic partition mode: the default "strict" requires at least one static
-- partition; "nonstrict" allows every partition column to be dynamic
set hive.exec.dynamic.partition.mode=nonstrict;

-- Goal: insert the rows of dynamic_partition_table into the matching
-- partitions of the target table d_p_t, based on the date column (day).

-- Source table:
create table dynamic_partition_table(day string,ip string) row format delimited fields terminated by ',';
load data local inpath '/root/hivedata/dynamic_partition_table.txt' into table dynamic_partition_table;
-- Sample content of dynamic_partition_table.txt (data, not statements):
--   2015-05-10,ip1
--   2015-05-13,ip2
--   2015-05-11,ip3
--   2015-08-10,ip4
--   2015-06-10,ip5
--   2015-07-10,ip6
-- Target table
create table d_p_t(ip string) partitioned by (month string,day string);
-- Dynamic insert
insert overwrite table d_p_t partition(month,day)
select ip,substr(day,1,7) as month,day
from dynamic_partition_table;
-- Dynamic partition values are matched by position: the trailing select
-- expressions map to the partition columns in order, regardless of their names.

-----------------------------------------------------------------------------------------------------------------------------------

-- Exporting table data
-- Syntax (templates, not executable):
--   INSERT OVERWRITE [LOCAL] DIRECTORY directory1 SELECT ... FROM ...
-- Multiple inserts:
--   FROM from_statement
--   INSERT OVERWRITE [LOCAL] DIRECTORY directory1 select_statement1
--   [INSERT OVERWRITE [LOCAL] DIRECTORY directory2 select_statement2] ...
-- Data written to the file system is serialized as text; columns are
-- separated by ^A and rows by a newline (\n).

-- Write query results to the file system
-- The result is saved under the given directory (local file system or HDFS).
-- local file system:
insert overwrite local directory '/root/exportdata' select * from d_p_t;
-- HDFS (no LOCAL keyword):
insert overwrite directory '/home/hadoop/test' select * from d_p_t;

------------------------------------------------------------------------------------------------------------------------------------
-- Bucketing / sorting queries: cluster by, sort by, distribute by
select * from student cluster by(Sno);

-- cluster by and sort by cannot be combined: the statement below is rejected
-- by Hive, so it is kept as a comment instead of aborting the script.
-- insert overwrite table stu_buck
-- select * from student cluster by(Sno) sort by(Sage);

-- Distribute on one column while sorting on another
insert overwrite table stu_buck
select * from student distribute by(Sno) sort by(Sage asc);
-- 总结：cluster(分且排序，必须一样)== distribute(分)+sort(排序)(可以不一样)
-- 说明:
-- 1.order by 会对输入做全局排序，因此只有一个reducer,会导致当输入规模较大时，需要较长的计算时间
-- 2.sort by 不是全局排序，其在数据进入reducer前完成排序。因此，如果用sort by进行排序，并且设置
-- mapred.reduce.tasks>1,则sort by只保证每个reducer的输出有序，不保证全局有序
-- 3.distribute by(字段)根据指定字段将数据分到不同的reducer,分发算法是hash散列。
-- 4.cluster by(字段)除了具有distribute by的功能外，还会对该字段进行排序。
-- 如果distribute和sort的字段是同一个时，cluster by=distribute by+sort by
--------------------------------------------------------------------------------------------------------------------------------------

-- Hive join
-- hive中除了支持和传统数据库中一样的内连接，外连接，左右连接，还支持left semi join 和cross join,
-- hive只支持等值连接（a.id=b.id）

-- Join practice: two small tables with the same layout
create table a(
    id int,
    name string
)
row format delimited
fields terminated by ',';

create table b(
    id int,
    name string
)
row format delimited
fields terminated by ',';

load data local inpath '/root/hivedata/a.txt'
into table a;
load data local inpath '/root/hivedata/b.txt'
into table b;

-- inner join
select *
from a
inner join b
    on a.id=b.id;

-- left join
select *
from a
left join b
    on a.id=b.id;

-- right join
select *
from b
right join a
    on a.id=b.id;

-- full outer join
select *
from a
full outer join b
    on a.id=b.id;

-- left semi join: returns rows of a that have at least one match in b;
-- only columns of a are available in the result.
select * from a left semi join b on a.id=b.id;
-- Comparable queries:
-- IN-subquery form with the same result (original note: very inefficient in Hive)
select a.id,a.name from a where a.id in (select b.id from b);
-- Inner-join forms: these only match the semi join when b.id is unique,
-- because an inner join emits one row per matching b row; the "select *"
-- variant additionally returns the columns of b.
select a.id,a.name from a join b on a.id=b.id;
select * from a inner join b on a.id=b.id;
-- cross join (use with care)
-- Returns the Cartesian product of both tables; no join key is needed.
select a.*,b.* from a cross join b;

-----------------------------------------------------------------------------------------------------------------------
-- hive参数配置
-- 输入$HIVE_HOME/bin/hive -H或者-help
-- -i: 初始化HQL文件
-- -e：从命令行执行指定的HQL
-- -f:执行HQL脚本
-- -v:输出执行的HQL语句到控制台
-- -p <port> connect to hive server on port number
-- -hiveconf x=y Use this to set hive/hadoop configuration variables
-- Shell examples (run from a terminal, not inside Hive):
--   $HIVE_HOME/bin/hive -e 'select * from a'
--   $HIVE_HOME/bin/hive -f /home/my/hive-script.sql

--对于一般的参数，有三种设定方式：
-- 配置文件：全局有效 hive-default.xml, hive-site.xml
-- 命令行参数：对hive启动实例有效 $HIVE_HOME/bin/hive -e 'select * from a'
-- 参数声明：对hive连接的session有效 set hive.exec.dynamic.partition=true;

------------------------------------------------------------------------------------------------------------------------
-- Special (multi-character) delimiters: parse "id||name" lines with RegexSerDe.
-- Each capture group of input.regex maps to one column, in order.
create table t_bi_reg(id int,name string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties(
-- Hive unescapes backslashes in string literals, so each literal '|' in the
-- regex is written as \\| to reach the SerDe as \|.
'input.regex'='(.*)\\|\\|(.*)',
-- properties are comma-separated key/value pairs
'output.format.string'='%1$s %2$s'
)stored as textfile;