项目实战从0到1之hive(45)大数据项目之电商数仓(用户行为数据)(十三)

第20章 需求九:每个用户累计访问次数

结果如下

用户    日期                小计    总计
mid1    2019-12-14        10        10
mid1    2019-02-11        12        22
mid2    2019-12-14        15        15
mid2    2019-02-11        12        27

20.1 DWS层

20.1.1 建表语句

drop table if exists dws_user_total_count_day;
create external table dws_user_total_count_day(
  `mid_id` string COMMENT '设备id',
`subtotal` bigint COMMENT '每日登录小计'
)
partitioned by(`dt` string)
row format delimited fields terminated by ' '
location '/warehouse/gmall/dws/dws_user_total_count_day';

20.1.2 导入数据

1)导入数据

insert overwrite table dws_user_total_count_day 
partition(dt='2019-12-14')
select
  mid_id,
   count(mid_id) cm
from
dwd_start_log
where
  dt='2019-12-14'
group by
  mid_id;

2)查询结果

select * from dws_user_total_count_day;

20.1.3 数据导入脚本

1)创建脚本dws_user_total_count_day.sh

[kgg@hadoop102 bin]$ vim dws_user_total_count_day.sh
在脚本中填写如下内容
#!/bin/bash

# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop-2.7.2/bin/hadoop

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
  do_date=$1
else
  do_date=`date -d "-1 day" +%F`
fi

echo "===日志日期为 $do_date==="
sql="
insert overwrite table "$APP".dws_user_total_count_day partition(dt='$do_date')
select
  mid_id,
  count(mid_id) cm
from
   "$APP".dwd_start_log
where
   dt='$do_date'
group by
  mid_id,dt;
"

$hive -e "$sql"

2)增加脚本执行权限

chmod 777 ads_user_total_count.sh

3)脚本使用

 ads_user_total_count.sh 2019-02-20

4)查询结果

select * from ads_user_total_count;

5)脚本执行时间

企业开发中一般在每天凌晨30分~1点

20.2 ADS层

20.2.1 建表语句

drop table if exists ads_user_total_count;
create external table ads_user_total_count(
  `mid_id` string COMMENT '设备id',
  `subtotal` bigint COMMENT '每日登录小计',
  `total` bigint COMMENT '登录次数总计'
)
partitioned by(`dt` string)
row format delimited fields terminated by ' '
location '/warehouse/gmall/ads/ads_user_total_count';

20.2.2 导入数据

insert overwrite table ads_user_total_count partition(dt='2019-10-03')
select
if(today.mid_id is null, yesterday.mid_id, today.mid_id) mid_id,
today.subtotal,
if(today.subtotal is null, 0, today.subtotal) + if(yesterday.total is null, 0, yesterday.total) total
from (
 select
  *
 from dws_user_total_count_day
 where dt='2019-10-03'
) today
full join (
 select
  *
 from ads_user_total_count
 where dt=date_add('2019-10-03', -1)
) yesterday
on today.mid_id=yesterday.mid_id

20.2.3 数据导入脚本

1)创建脚本

[kgg@hadoop102 bin]$ vim ads_user_total_count.sh
在脚本中编写如下内容
#!/bin/bash

db=gmall
hive=/opt/module/hive-1.2.1/bin/hive
hadoop=/opt/module/hadoop-2.7.2/bin/hadoop

if [[ -n $1 ]]; then
   do_date=$1
else
   do_date=`date -d '-1 day' +%F`
fi

sql="
use gmall;
insert overwrite table ads_user_total_count partition(dt='$do_date')
select
 if(today.mid_id is null, yesterday.mid_id, today.mid_id) mid_id,
today.subtotal,
 if(today.subtotal is null, 0, today.subtotal) + if(yesterday.total is null, 0, yesterday.total) total
from (
select
  *
from dws_user_total_count_day
where dt='$do_date'
) today
full join (
select
  *
from ads_user_total_count
where dt=date_add('$do_date', -1)
) yesterday
on today.mid_id=yesterday.mid_id
"

$hive -e "$sql"

2)增加脚本执行权限

chmod 777 ads_user_total_count.sh

3)脚本使用

ads_user_total_count.sh 2019-02-20

4)查询结果

select * from ads_user_total_count;

5)脚本执行时间

企业开发中一般在每天凌晨30分~1点

第21章 需求十:新收藏用户数

新收藏用户:指的是在某天首次添加收藏的用户

21.1 DWS层建立用户日志行为宽表

考虑到后面的多个需求会同时用到多张表中的数据, 如果每次都join操作, 则影响查询的效率. 可以先提前做一张宽表, 提高其他查询的执行效率.

每个用户对每个商品的点击次数, 点赞次数, 收藏次数

21.1.1 建表语句

drop table if exists dws_user_action_wide_log;
CREATE EXTERNAL TABLE dws_user_action_wide_log(
  `mid_id` string COMMENT '设备id',
  `goodsid` string COMMENT '商品id',
  `display_count` string COMMENT '点击次数',
  `praise_count` string COMMENT '点赞次数',
  `favorite_count` string COMMENT '收藏次数')
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_action_wide_log/'
TBLPROPERTIES('parquet.compression'='lzo');

21.1.2 导入数据

insert overwrite table dws_user_action_wide_log partition(dt='2019-12-14')
select
  mid_id,
  goodsid,
  sum(display_count) display_count,
  sum(praise_count) praise_count,
  sum(favorite_count) favorite_count
from
( select
      mid_id,
      goodsid,
       count(*) display_count,
       0 praise_count,
       0 favorite_count
   from
      dwd_display_log
   where
      dt='2019-12-14' and action=2
   group by
      mid_id,goodsid

   union all

   select
      mid_id,
      target_id goodsid,
       0,
       count(*) praise_count,
       0
   from
      dwd_praise_log
   where
      dt='2019-12-14'
   group by
      mid_id,target_id

   union all

   select
      mid_id,
      course_id goodsid,
       0,
       0,
       count(*) favorite_count
   from
      dwd_favorites_log
   where
      dt='2019-12-14'
   group by
      mid_id,course_id
)user_action
group by
mid_id,goodsid;

21.1.3 数据导入脚本

[kgg@hadoop102 bin]$ vi dws_user_action_wide_log.sh
[kgg@hadoop102 bin]$ chmod 777 dws_user_action_wide_log.sh

#!/bin/bash
db=gmall
hive=/opt/module/hive-1.2.1/bin/hive
hadoop=/opt/module/hadoop-2.7.2/bin/hadoop

if [[ -n $1 ]]; then
   do_date=$1
else
   do_date=`date -d '-1 day' +%F`
fi

sql="
use gmall;
insert overwrite table dws_user_action_wide_log partition(dt='$do_date')
select
  mid_id,
  goodsid,
  sum(display_count) display_count,
  sum(praise_count) praise_count,
  sum(favorite_count) favorite_count
from
( select
      mid_id,
      goodsid,
      count(*) display_count,
       0 praise_count,
       0 favorite_count
  from
      dwd_display_log
  where
       dt='$do_date' and action=2
  group by
      mid_id,goodsid

  union all

  select
      mid_id,
      target_id goodsid,
       0,
      count(*) praise_count,
       0
  from
      dwd_praise_log
  where
       dt='$do_date'
  group by
      mid_id,target_id

  union all

  select
      mid_id,
      course_id goodsid,
       0,
       0,
      count(*) favorite_count
  from
      dwd_favorites_log
  where
       dt='$do_date'
  group by
      mid_id,course_id
)user_action
group by
mid_id,goodsid;
"

$hive -e "$sql"

21.2 DWS层

使用日志数据用户行为宽表作为DWS层表

21.3 ADS层

21.3.1 建表语句

drop table if exists ads_new_favorites_mid_day;
create external table ads_new_favorites_mid_day(
  `dt` string COMMENT '日期',
  `favorites_users` bigint COMMENT '新收藏用户数'
)
row format delimited fields terminated by ' '
location '/warehouse/gmall/ads/ads_new_favorites_mid_day';

21.3.2 导入数据

insert into table ads_new_favorites_mid_day
select
   '2019-12-14' dt,
   count(*) favorites_users
from
(
   select
      mid_id
   from
      dws_user_action_wide_log
   where
      favorite_count>0
   group by
      mid_id
   having
      min(dt)='2019-12-14'
)user_favorite;

21.3.3 数据导入脚本

1)创建脚本ads_new_favorites_mid_day.sh

[kgg@hadoop102 bin]$ vim ads_new_favorites_mid_day.sh
在脚本中填写如下内容
#!/bin/bash

# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop-2.7.2/bin/hadoop

# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
  do_date=$1
else
  do_date=`date -d "-1 day" +%F`
fi

echo "===日志日期为 $do_date==="
sql="
insert into table "$APP".ads_new_favorites_mid_day
select
   '$do_date' dt,
  count(*) favorites_users
from
(
  select
      mid_id
  from
       "$APP".dws_user_action_wide_log
  where
      favorite_count>0
  group by
      mid_id
  having
      min(dt)='$do_date'
)user_favorite;
"

$hive -e "$sql"

2)增加脚本执行权限

chmod 777 ads_new_favorites_mid_day.sh

3)脚本使用

ads_new_favorites_mid_day.sh 2019-02-20

4)查询结果

select * from ads_new_favorites_mid_day;

5)脚本执行时间

企业开发中一般在每天凌晨30分~1点

第22章 需求十一:各个商品点击次数top3的用户

22.1 DWS层

使用日志数据用户行为宽表作为DWS层表

22.2 ADS层

22.2.1 建表语句

drop table if exists ads_goods_count;
create external table ads_goods_count(
  `dt` string COMMENT '统计日期',
  `goodsid` string COMMENT '商品',
  `user_id` string COMMENT '用户',
  `goodsid_user_count` bigint COMMENT '商品用户点击次数'
)
row format delimited fields terminated by ' '
location '/warehouse/gmall/ads/ads_goods_count';

22.2.2 导入数据

insert into table ads_goods_count
select
   '2019-10-03',
  goodsid,
  mid_id,
  sum_display_count
from(
   select
    goodsid,
    mid_id,
    sum_display_count,
    row_number() over(partition by goodsid order by sum_display_count desc) rk
   from(
     select
      goodsid,
      mid_id,
      sum(display_count) sum_display_count
     from dws_user_action_wide_log
     where display_count>0
     group by goodsid, mid_id
  ) t1
) t2
where rk <= 3

22.2.3 数据导入脚本

1)创建脚本ads_goods_count.sh

[kgg@hadoop102 bin]$ vim ads_goods_count.sh
在脚本中填写如下内容
#!/bin/bash

db=gmall
hive=/opt/module/hive/bin/hive
hadoop=/opt/module/hadoop/bin/hadoop

if [[ -n $1 ]]; then
   do_date=$1
else
   do_date=`date -d '-1 day' +%F`
fi

sql="
use gmall;
insert into table ads_goods_count
select
   '$do_date',
  goodsid,
  mid_id,
  sum_display_count
from(
  select
    goodsid,
    mid_id,
    sum_display_count,
    row_number() over(partition by goodsid order by sum_display_count desc) rk
  from(
    select
      goodsid,
      mid_id,
      sum(display_count) sum_display_count
    from dws_user_action_wide_log
    where display_count>0
    group by goodsid, mid_id
  ) t1
) t2
where rk <= 3
"
$hive -e "$sql"

2)增加脚本执行权限

chmod 777 ads_goods_count.sh

3)脚本使用

ads_goods_count.sh 2019-02-20

4)查询结果

select * from ads_goods_count;

5)脚本执行时间

企业开发中一般在每天凌晨30分~1点

作者:大码王

-------------------------------------------

个性签名:独学而无友,则孤陋而寡闻。做一个灵魂有趣的人!

如果觉得这篇文章对你有小小的帮助的话,记得在右下角点个“推荐”哦,博主在此感谢!

万水千山总是情,打赏一分行不行,所以如果你心情还比较高兴,也是可以扫码打赏博主,哈哈哈(っ•?ω•?)っ???!

原文地址:https://www.cnblogs.com/huanghanyu/p/14308745.html