HIve sql的练习

一、hive sql 的练习

1)数据结构

    字段	            备注                       详细描述
  video id         视频唯一id(String)      11位字符串
  uploader         视频上传者(String)      上传视频的用户名String
  age              视频年龄(int)           视频在平台上的整数天
  category         视频类别(Array<String>) 上传视频指定的视频分类
  length           视频长度(Int)            整形数字标识的视频长度
  views            观看次数(Int)            视频被浏览的次数
  rate             视频评分(Double)         满分5分
  Ratings          流量(Int)               视频的流量,整型数字
  conments         评论数(Int)             一个视频的整数评论数
  related ids      相关视频id(Array<String>) 相关视频的id,最多20个

2)准备工作

 对数据创建外部表,数据存放在hdfs的/gulivideo/video目录下
 create external table video_ori(
videoId string, 
uploader string, 
age int, 
category array<string>, 
length int, 
views int, 
rate float, 
ratings int, 
comments int,
relatedId array<string>)
row format delimited fields terminated by "	"
collection items terminated by "&"
location '/gulivideo/video';
一般不会在文本文件上直接处理数据,一般会把我们建好的外部表的数据导入到内部表中 
创建内部表
create table video_orc stored as orc tblproperties("orc.compress"="SNAPPY") as select * from video_ori;

3)统计视频观看数Top10

  思路:使用order by按照views字段做一个全局排序即可,同时我们设置只显示前10条。
  select
    videoId,
    views
  from
    video_orc
  order by
    views desc
  limit 10;

4)统计视频类别热度Top10

 --1. 将类别拆开  因为 类别是数组类型的
  select
    videoId,
    categories,
    views
  from
    video_orc
  lateral view
    explode(category) tbl as categories;

  --2. 按照类别统计观看数综合,并排序取Top10
  select
    categories,
    sum(views) hot
  from
    t1
  goup by
    categories
  order by
    hot desc
  limit 10;
  方法二:
  select
    categories,
    sum(views) hot
  from
    video_orc
  lateral view
    explode(category) tbl as categories
  group by
    categories
  order by
    hot desc
  limit 10;

5)统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数

 --1. 求Top20视频的类别
  select
    videoId,
    category,
     views
  from
    video_orc
  order by
    views desc
  limit 20;

  --2. 将类别炸开
  select
    videoId,
    categories
  from
    t1
  lateral view explode(category) tbl as categories;

  --3. 统计个数
  select
    categories,
    count(videoId) cnt
  from
    t2
  group by
    categories
  order by cnt desc (可以省略这句);

6) 统计视频观看数Top50所关联视频的所属类别热度排序

 --1. 统计Top50观看的视频以及其关联视频
  select
    videoId,
    views,
    relatedid
  from
    video_orc
  order by
    views desc
  limit 50;

  --2. 将关联视频炸开
  select
    explode(relatedid) videoId
  from
    t1;

  --3. 将t2和video_orc表join,获取其种类
  select
    v.videoId,
    v.category
  from
    video_orc v
  join
    t2
  on v.videoId = t2.videoId;

  --4. 炸开类别
  select
    explode(category) categories
  from
    t3

  --类别热度表
  select
    categories,
    sum(views) hot
  from
    video_orc lateral view explode(category) tbl as categories
  group by
    categories;

  --5. 将炸开的类别和类别热度表关联,并按热度排序
  select
    distinct t4.categories,
    ch.hot
  from
    t4
  join
    ch
  on
    t4.categories = ch.categories
  order by
    hot desc;   

7)统计每个类别中的视频热度Top10,以Music为例

 --1.生成中间表格
  create table video_category stored as orc as
  select
    videoid,
    uploader,
    age,
    categories,
    length,
    views,
    rate,
    ratings,
    comments,
    relatedid
  from
    video_orc
  lateral view explode(category) tbl as categories;

  --2. 统计Music类视频热度Top10
  select
    videoid,
    views
  from
    video_category
  where
    categories="Music"
  order by
    views desc
  limit 10;

8)统计每个类别视频观看数Top10

  --1. 给分类内部视频按照观看数排名
  select
    categories,
    videoid,
    views,
    rank() over(partition by categories order by views desc) rk
  from
    video_category;
  --2. 取前十
  select
    categories,
    videoid,
    views
  from
    t1
  where
    rk<=10;
原文地址:https://www.cnblogs.com/xiao-bu/p/14328016.html