SparkSQL: merging JSON data
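
Both the Java and Scala versions below read a students.json file of (name, score) records, build a second (name, age) dataset from in-memory JSON strings, and join the two on name to produce (name, age, score) records for students scoring above 80. Spark's JSON reader expects JSON Lines input, i.e. one complete JSON object per line. A hypothetical students.json (names chosen to match the in-memory info records) might look like:

{"name":"Leo","score":85}
{"name":"Marry","score":99}
{"name":"Jack","score":68}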

Java

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

public class Demo {
    private static SparkConf conf = new SparkConf().setAppName("demo").setMaster("local");
    private static JavaSparkContext jsc = new JavaSparkContext(conf);
    private static SparkSession session = new SparkSession(jsc.sc());

    public static void main(String[] args) {

        // Load students.json: name, score
        Dataset<Row> score = session.read().json("./src/main/java/cn/tele/spark_sql/json/students.json");

        score.createOrReplaceTempView("scoreView");

        // name, score rows with score > 80
        JavaRDD<Row> scoreRDD = session.sql("select * from scoreView where score > 80").javaRDD();

        // Build the student-info JSON: name, age
        JavaRDD<String> infoRDD = jsc.parallelize(Arrays.asList("{\"name\":\"Leo\",\"age\":18}",
                "{\"name\":\"Marry\",\"age\":19}", "{\"name\":\"Jack\",\"age\":20}"));

        Dataset<Row> info = session.read().json(infoRDD);
        info.createOrReplaceTempView("infoView");

        // Collect the names of the high scorers and splice them into an IN clause
        List<Row> scoreList = scoreRDD.collect();

        String sql = "select * from infoView where name in (";
        for (int i = 0; i < scoreList.size(); i++) {
            sql += "'" + scoreList.get(i).getAs("name") + "'";
            if (i < scoreList.size() - 1) {
                sql += ",";
            }
        }

        sql += ")";

        // Query the name and age of students whose score > 80,
        // then convert each row to a (name, age) pair
        JavaPairRDD<String, Integer> tempRDD = session.sql(sql).javaRDD()
                .mapToPair(new PairFunction<Row, String, Integer>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, Integer> call(Row t) throws Exception {
                        return new Tuple2<String, Integer>(t.getAs("name"), Integer.valueOf(t.getAs("age").toString()));
                    }
                });

        // Convert the score rows to (name, score) pairs
        JavaPairRDD<String, Integer> scoreRDD2 = scoreRDD.mapToPair(new PairFunction<Row, String, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<String, Integer> call(Row t) throws Exception {
                return new Tuple2<String, Integer>(t.getAs("name"), Integer.valueOf(t.getAs("score").toString()));
            }
        });

        // Join age and score by name
        JavaPairRDD<String, Tuple2<Integer, Integer>> resultRDD = tempRDD.join(scoreRDD2);

        // Print the joined records
        resultRDD.foreach(new VoidFunction<Tuple2<String, Tuple2<Integer, Integer>>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Tuple2<String, Tuple2<Integer, Integer>> t) throws Exception {
                System.out.println("name:" + t._1 + ",age:" + t._2._1 + ",score:" + t._2._2);
            }
        });

        // Save the result as JSON
        StructType schema = DataTypes
                .createStructType(Arrays.asList(DataTypes.createStructField("name", DataTypes.StringType, false),
                        DataTypes.createStructField("age", DataTypes.IntegerType, false),
                        DataTypes.createStructField("score", DataTypes.IntegerType, false)));

        JavaRDD<Row> rowRDD = resultRDD.map(new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Row call(Tuple2<String, Tuple2<Integer, Integer>> v1) throws Exception {
                return RowFactory.create(v1._1, v1._2._1, v1._2._2);
            }
        });

        Dataset<Row> resultDS = session.createDataFrame(rowRDD, schema);

        resultDS.write().format("json").mode(SaveMode.Append).save("./src/main/java/cn/tele/spark_sql/json/result");

        session.stop();
        jsc.close();
    }
}
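
Collecting the filtered rows to the driver and splicing their names into an IN clause works for small result sets, but it round-trips the data through the driver and breaks if a name contains a single quote. Since both datasets are registered as temp views, the same (name, age, score) result could be produced with a single SQL join; a minimal sketch, assuming the session and the two temp views registered above:

// Alternative: let Spark perform the join instead of building the
// IN clause by hand on the driver.
Dataset<Row> joined = session.sql(
        "select i.name, i.age, s.score "
        + "from infoView i join scoreView s on i.name = s.name "
        + "where s.score > 80");

joined.write().format("json").mode(SaveMode.Append)
        .save("./src/main/java/cn/tele/spark_sql/json/result");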

Scala

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext, SaveMode}
import org.apache.spark.sql.types.{DataTypes, StructField}

object Demo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Load the score data
    val scoreDF = sqlContext.read.json("./src/main/scala/cn/tele/spark_sql/json/students.json")

    scoreDF.createOrReplaceTempView("scoreView")

    val arr = sqlContext.sql("select * from scoreView where score > 80").rdd.collect()

    // Build the student-info data
    val infoRDD = sc.parallelize(Array(
      """{"name":"Leo","age":20}""",
      """{"name":"Marry","age":30}""",
      """{"name":"Jack","age":21}"""), 2)

    val infoDS = sqlContext.read.json(infoRDD)

    infoDS.createOrReplaceTempView("infoView")

    // Splice the collected names into an IN clause
    var sql = "select * from infoView where name in ("
    for (i <- arr.indices) {
      sql += "'" + arr(i).getAs[String]("name") + "'"
      if (i < arr.length - 1) {
        sql += ","
      }
    }

    sql += ")"

    val tempRDD = sqlContext.sql(sql).rdd.map(row => {
      (row.getAs[String]("name"), row.getAs[Long]("age").toInt)
    })

    val tempRDD2 = scoreDF.rdd.map(row => {
      (row.getAs[String]("name"), row.getAs[Long]("score").toInt)
    })

    // Join age and score by name
    val resultRDD = tempRDD.join(tempRDD2)

    // Print the joined records
    resultRDD.foreach(t => {
      println("name:" + t._1 + ",age:" + t._2._1 + ",score:" + t._2._2)
    })

    val rowRDD = resultRDD.map(t => Row(t._1, t._2._1, t._2._2))

    // Save the result as a JSON file
    val schema = DataTypes.createStructType(Array(
      StructField("name", DataTypes.StringType, false),
      StructField("age", DataTypes.IntegerType, false),
      StructField("score", DataTypes.IntegerType, false)))

    val df = sqlContext.createDataFrame(rowRDD, schema)

    df.write.format("json").mode(SaveMode.Append).save("./src/main/scala/cn/tele/spark_sql/json/result")

    sc.stop()
  }
}
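
In both versions the write produces a directory of part files (result/part-*.json), one JSON object per line, rather than a single file. With SaveMode.Append, re-running the job adds new part files to the existing directory; use SaveMode.Overwrite if the output should be replaced instead.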
Original post: https://www.cnblogs.com/tele-share/p/10391828.html