Spark SQL在Scala与Java中的代码实现

在编写Spark SQL代码前,需要新建Maven工程,将Hadoop的配置文件core-site.xml和hdfs-site.xml,以及Hive的配置文件hive-site.xml拷贝到工程的src/main/resources目录下,并在pom.xml中配置所需的依赖信息。

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the Spark SQL demo project.
  Declares the Spark (core / sql / hive), Scala, fastjson, MySQL JDBC and JUnit
  dependencies used by the Scala and Java examples.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>begin</groupId>
    <artifactId>myspark</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Single source of truth for versions; every Spark/Scala dependency below refers to these. -->
    <properties>
        <spark.version>2.4.3</spark.version>
        <scala.version>2.11.12</scala.version>
        <!-- Binary-compatibility suffix of the Spark artifacts; must match the major.minor of scala.version. -->
        <scala.binary.version>2.11</scala.binary.version>
    </properties>

    <dependencies>
        <!-- Spark modules: all three must share the same Spark version and Scala binary version. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <!-- Scala runtime and tooling, pinned to one version. -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-reflect</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-compiler</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <!-- NOTE(review): old fastjson 1.2.x releases have published deserialization
             vulnerabilities; consider upgrading if this library is actually used. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.28</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
        </dependency>

        <!-- Test-only dependency: keep it off the compile/runtime classpath. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

</project>
Scala实现:

import org.apache.spark.sql.SparkSession

/**
 * Word-count example that accesses Spark SQL from Scala.
 *
 * Reads a text file into an RDD, splits each line on single spaces, exposes the
 * resulting words as a temporary view named `_doc`, and prints the per-word
 * counts computed by a SQL query.
 */
object SparkSqlDemoScala {

  /**
   * Runs the word-count query and prints up to 1000 result rows without truncation.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("SparkSql")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    try {
      // The session's implicits provide the RDD-to-DataFrame conversion (toDF).
      import spark.implicits._

      // One RDD element per space-separated token of every input line.
      val words = spark.sparkContext
        .textFile("/user/hadoop/data2/wc.txt")
        .flatMap(_.split(" "))

      // Wrap the tokens in a single-column DataFrame and register it as a temporary view.
      val wordsDf = words.toDF("word")
      wordsDf.createOrReplaceTempView("_doc")

      spark.sql("select word,count(*) from _doc group by word").show(1000, truncate = false)
    } finally {
      // Always release the session (and its SparkContext), even when the job fails.
      spark.stop()
    }
  }
}
Java实现:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.Arrays;
import java.util.Iterator;

/**
* 使用java实现spark sql访问
*/
public class SparkSQLDemoJava {
public static void main(String[] args) {
SparkSession spark= SparkSession.builder().appName("sparkSQL").master("local").enableHiveSupport().getOrCreate();
//创建javaSpark上下文
JavaSparkContext sc=new JavaSparkContext(spark.sparkContext());
//加载文件
JavaRDD<String> rdd1=sc.textFile("/user/hadoop/data2/wc.txt");
JavaRDD<String> rdd2=rdd1.flatMap(new FlatMapFunction<String,String>(){
public Iterator<String> call(String s) throws Exception{
return Arrays.asList(s.split(" ")).iterator();
}
});
//将string 变换成 row
JavaRDD<Row> rdd3=rdd2.map(new Function<String,Row>(){
public Row call(String word) throws Exception{
return RowFactory.create(word);
}
});
//构造表结构
StructField[] fields=new StructField[1];
fields[0]=new StructField("word", DataTypes.StringType,true, Metadata.empty());
//表结构类型
StructType type=new StructType(fields);
//将RDD转换成DataFrame
Dataset<Row> df=spark.createDataFrame(rdd3,type);
//注册临时视图
df.createOrReplaceTempView("_doc");

spark.sql("select word,count(*) from _doc group by word").show(1000,false);

}

————————————————
版权声明:本文为CSDN博主「赵厚雄」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/nengyu/article/details/95870479

原文地址:https://www.cnblogs.com/javalinux/p/15069270.html