小波的面试题

被朋友问到一个面试题,实在没有时间,最近忙着投产,太忙了,所以晚上抽出时间,写封博客给他看看。

 思路:

spark ->df->contextsql 用sql做

--可能难度在自增列上
表名:
label_table

select row_number() over() as lid,label,count(1) 
from (
select explode(labels) as label
 from 
   (
    select array(age,region,sex,job) as labels 
     from
     (
      select 
      age,sex,
      case when age>18 and age<35 then '青年'
        when age<18 then '少年'
        when age >35 and age<55 then '中年'
        when age>55 then '老年' end as age
        from userinfo
        )t0 
   )t1
 ) t2 group by label;
 
 
 
 
 表名:
 user_label_table
 select split(label,'+')[1] as uid,split(label,'+')[2] as label
 from 
  (
  select explode(labels) as label
  from 
  (
  select array(age,region,sex,job) as labels 
      from
      (
        select 
         concat(id,'+',age)  as age,
         concat(id,'+',sex) as sex,
         concat(job,'+',job) as job,
         case when age>18 and age<35 then concat(uid,'+青年')
             when age<18 then concat(uid,'少年')
             when age >35 and age<55 then concat(uid,'中年')
             when age>55 then '老年' end as age
               from userinfo
      )t0 
  )t1
 
 uid_lid_table 
 --转成 uid + lids
 select t0.uid,array(collect_set(lid)) as lids
 from 
 (
 select t0.uid ,t1.lid from
 user_label_table t0
 join label_table t1
 on t0.label=t1.label
 )
 group by t0.uid;
 
 
 
 --订单数据表 orders_table


 
 
 输出如下:
 select t0.oid,t1.lids[1],t1.lids[2],t1.lid[3],t1.lid[4],t2.ptype 
 from orders_table t0
 join uid_lid_table t1 on t0.uid=t1.uid
 join product_table t2 on t0.pid=t2.pid
 
原文地址:https://www.cnblogs.com/wqbin/p/10693668.html