Multi-file wc (word count) program [Java version]

package sanjin;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.Arrays;

/**
 * Description: Java version of the multi-file word count (wc) program.
 *
 * Build and run:
 *   mvn clean install
 *   rz  (upload the jar to the server)
 *   ./bin/spark-submit --class sanjin.JavaWordCount /home/hadoop/jar/test1.0.jar /spark-test/input /spark-test/javaoutput
 *        command             main class                 jar location               input path       output path
 * @Author: 留歌36
 * @Date: 2019/3/6 16:42
 */
public class JavaWordCount {

    public static void main(String[] args) {
        // Input and output paths are passed as arguments to spark-submit
        String inputPath = args[0];
        String outputPath = args[1];

        // The app name is derived from this class's name via the stack trace;
        // the master is supplied by spark-submit (uncomment .master to run locally)
        SparkSession sparkSession = SparkSession.builder()
                .appName(Thread.currentThread().getStackTrace()[1].getClassName())
//                .master("local[2]")
                .getOrCreate();

        JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());

        // wholeTextFiles returns one (path, fileContent) pair per input file
        JavaPairRDD<String, String> textFiles = sc.wholeTextFiles(inputPath);
        // Split each file's content on whitespace; note the regex must be
        // escaped as "\\s+" in Java source, otherwise it will not compile
        JavaPairRDD<String, Integer> counts = textFiles
                .flatMap(s -> Arrays.asList(s._2.split("\\s+")).iterator())
                .mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey((x, y) -> x + y);

        // Write the (word, count) pairs; the output directory must not already exist
        counts.saveAsTextFile(outputPath);

        sparkSession.stop();
    }

}

Original post: https://www.cnblogs.com/liuge36/p/12614752.html