Spark double-key aggregation

1. Double-key aggregation to fix data skew: when a handful of keys carry a disproportionate share of the data, the tasks that handle those keys become hotspots. The fix is to prepend a random prefix (a "salt") to each key and run a first, partial aggregation over the salted keys; then strip the prefix and aggregate a second time to get the final result.
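The same two-stage idea, as a minimal self-contained sketch using Spark's Java RDD API (the class name and sample data below are illustrative, not from the original post):

import java.util.Arrays;
import java.util.Random;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class TwoStageAggDemo {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("twoStageAgg");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // "鼓楼区:49" is the hot key here: it occurs far more often than the others.
        JavaPairRDD<String, Long> pairs = sc.parallelize(Arrays.asList(
                "鼓楼区:49", "鼓楼区:49", "鼓楼区:49", "鼓楼区:49", "玄武区:01"))
                .mapToPair(k -> new Tuple2<>(k, 1L));

        final int saltCount = 10;

        // Stage 1: prepend a random prefix, then pre-aggregate by the salted key,
        // which spreads one hot key across up to saltCount reduce groups.
        JavaPairRDD<String, Long> partial = pairs
                .mapToPair(t -> new Tuple2<>(
                        new Random().nextInt(saltCount) + "_" + t._1(), t._2()))
                .reduceByKey(Long::sum);

        // Stage 2: strip the prefix and aggregate again for the final counts.
        JavaPairRDD<String, Long> result = partial
                .mapToPair(t -> new Tuple2<>(t._1().substring(t._1().indexOf('_') + 1), t._2()))
                .reduceByKey(Long::sum);

        result.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
        sc.stop();
    }
}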

// Register the custom functions used by the SQL below
spark.udf().register("concat_String_string", new ConcatStringStringUDF(), DataTypes.StringType);
spark.udf().register("random_prefix", new RandomPrefixUDF(), DataTypes.StringType);
spark.udf().register("remove_random_prefix", new RemoveRandomPrefixUDF(), DataTypes.StringType);

spark.udf().register("group_concat_distinct", new GroupConcatDistinctUDAF());
package com.bjsxt.spark.areaRoadFlow;

import java.util.Random;

import org.apache.spark.sql.api.java.UDF2;

/**
 * random_prefix(key, n): prepends a random integer in [0, n) to the key,
 * e.g. random_prefix("鼓楼区:49", 10) -> "3_鼓楼区:49". This spreads one
 * hot key across up to n groups during the first aggregation.
 */
public class RandomPrefixUDF implements UDF2<String, Integer, String> {

    private static final long serialVersionUID = 1L;

    private final Random random = new Random();

    @Override
    public String call(String area_name_road_id, Integer ranNum) throws Exception {
        int prefix = random.nextInt(ranNum);
        return prefix + "_" + area_name_road_id;
    }
}
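The remaining two string UDFs are one-liners; the originals are not shown in the post, but plausible sketches consistent with how the SQL below uses them look like this:

package com.bjsxt.spark.areaRoadFlow;

import org.apache.spark.sql.api.java.UDF3;

// concat_String_string(area_name, road_id, ':') -> "area_name:road_id"
public class ConcatStringStringUDF implements UDF3<String, String, String, String> {

    private static final long serialVersionUID = 1L;

    @Override
    public String call(String v1, String v2, String split) throws Exception {
        return v1 + split + v2;
    }
}

package com.bjsxt.spark.areaRoadFlow;

import org.apache.spark.sql.api.java.UDF1;

// remove_random_prefix("1_鼓楼区:49") -> "鼓楼区:49"
public class RemoveRandomPrefixUDF implements UDF1<String, String> {

    private static final long serialVersionUID = 1L;

    @Override
    public String call(String val) throws Exception {
        // Drop everything up to and including the first "_" added by random_prefix.
        return val.substring(val.indexOf("_") + 1);
    }
}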
private static void generateTempAreaRoadFlowTable(SparkSession spark) {
        /**
         * Schema of tmp_car_flow_basic:
         *     structFields.add(DataTypes.createStructField("area_id", DataTypes.StringType, true));
         *     structFields.add(DataTypes.createStructField("area_name", DataTypes.StringType, true));
         *     structFields.add(DataTypes.createStructField("road_id", DataTypes.StringType, true));
         *     structFields.add(DataTypes.createStructField("monitor_id", DataTypes.StringType, true));
         *     structFields.add(DataTypes.createStructField("car", DataTypes.StringType, true));
         */
        String sql = 
                "SELECT "
                    + "area_name,"
                    + "road_id,"
                    + "count(*) car_count,"
                    // group_concat_distinct: per road, the car count under each monitor (checkpoint)
                    + "group_concat_distinct(monitor_id) monitor_infos " // e.g. 0001=20|0002=30
                + "FROM tmp_car_flow_basic "
                + "GROUP BY area_name,road_id";
        /**
         * When one road within an area carries far more cars than the rest, the
         * GROUP BY above skews. The query below fixes this with random prefixes.
         */
        String sqlText = ""
                + "SELECT "
                    + "area_name_road_id,"
                    + "sum(car_count) car_count,"
                    + "group_concat_distinct(monitor_infos) monitor_infos "
                + "FROM ("
                    + "SELECT "
                        // Stage 2a: strip the random prefix again
                        + "remove_random_prefix(prefix_area_name_road_id) area_name_road_id,"
                        + "car_count,"
                        + "monitor_infos "
                    + "FROM ("
                        + "SELECT "
                            // Stage 1b: partial aggregation by the salted key
                            + "prefix_area_name_road_id," // e.g. 1_鼓楼区:49
                            + "count(*) car_count,"
                            + "group_concat_distinct(monitor_id) monitor_infos "
                        + "FROM ("
                            + "SELECT "
                            + "monitor_id,"
                            + "car,"
                            // Stage 1a: salt the combined key with a random prefix in [0, 10)
                            + "random_prefix(concat_String_string(area_name,road_id,':'),10) prefix_area_name_road_id "
                            + "FROM tmp_car_flow_basic "
                        + ") t1 "
                        + "GROUP BY prefix_area_name_road_id "
                    + ") t2 "
                + ") t3 "
                // Stage 2b: final aggregation on the de-salted key
                + "GROUP BY area_name_road_id";


        // The post executes the plain (skew-prone) query here; to run the salted
        // version, pass sqlText to spark.sql(...) instead, as sketched below.
        Dataset<Row> df = spark.sql(sql);

        df.createOrReplaceTempView("tmp_area_road_flow_count");
    }
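To materialize the skew-resistant result instead of the plain one, the last two lines of the method would run sqlText, for example (reusing the same view name; a sketch, not part of the original post):

        Dataset<Row> skewSafeDF = spark.sql(sqlText);
        skewSafeDF.createOrReplaceTempView("tmp_area_road_flow_count");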
Original post: https://www.cnblogs.com/guoyu1/p/12305221.html