Hadoop WordCount Example

1. Import the dependencies

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>org.apache.logging.log4j</groupId>
      <artifactId>log4j-core</artifactId>
      <version>2.9.1</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.2</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.2</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.7.2</version>
    </dependency>

    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.8</version>
      <scope>system</scope>
      <systemPath>E:/jdk1.8.0_73/jdk1.8.0_73/lib/tools.jar</systemPath>
    </dependency>
  </dependencies>
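
Note that the jdk.tools entry uses a machine-specific systemPath; adjust it to point at the tools.jar inside your own JDK installation.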

2. Write WcMapper

package com.wn.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WcMapper extends Mapper<LongWritable, Text,Text, IntWritable> {

    private Text word=new Text();
    private IntWritable one=new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get the current line of text
        String line = value.toString();
        // Split the line on spaces
        String[] words = line.split(" ");
        // Iterate over the words and emit <word, 1> for each
        for (String word:words){
            this.word.set(word);
            context.write(this.word,this.one);
        }
    }
}
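
One caveat: line.split(" ") assumes words are separated by single spaces; splitting on the regex "\\s+" would also tolerate tabs and repeated spaces.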

3. Write WcReducer

package com.wn.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WcReducer extends Reducer<Text, IntWritable,Text,IntWritable> {

    private IntWritable total=new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulate the counts for this key
        int sum=0;
        for (IntWritable value : values) {
            sum+=value.get();
        }
        // Wrap the sum and emit the result
        total.set(sum);
        context.write(key,total);
    }
}

4. Write WcDriver

package com.wn.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WcDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Get a Job instance
        Job job = Job.getInstance(new Configuration());
        // Set the jar by the driver class
        job.setJarByClass(WcDriver.class);
        // Set the mapper and reducer classes
        job.setMapperClass(WcMapper.class);
        job.setReducerClass(WcReducer.class);
        // Set the mapper and reducer output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Set the input and output paths
        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));
        // Submit the job and wait for completion
        boolean b = job.waitForCompletion(true);
        System.exit(b?0:1);
    }
}

5. Package the project

  After packaging completes, a .jar file is generated under the project's target directory.
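
  A typical packaging command, assuming a standard Maven layout, run from the project root:

mvn clean package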

6. Upload and run

  6.1 Upload the .jar file to the Hadoop server with the rz command
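
    rz requires the lrzsz package on the server; scp is a common alternative (user, host, and target path below are placeholders):

scp target/hadoop-1.0-SNAPSHOT.jar user@hadoop-node:~/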

  6.2 Run the WordCount program

hadoop jar hadoop-1.0-SNAPSHOT.jar com.wn.wordcount.WcDriver /input.txt /output
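
    Note: the output directory (/output here) must not already exist on HDFS; FileOutputFormat fails the job if it does. Remove a stale directory first if necessary:

hadoop fs -rm -r /output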

    After the job finishes, the corresponding output directory is created on HDFS.

   6.3 View the results

hadoop fs -cat /output/*

7. WordCount data flow

  7.1 The input file is divided into splits; since the test file is small, each file forms a single split. Each split is then broken up line by line into <key,value> pairs, where the key is the byte offset of the line within the file and the value is the line of text. This step is performed automatically by the MapReduce framework.
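
    For example, a two-line input file (contents assumed purely for illustration):

hello world
hello hadoop

    becomes the pairs <0, "hello world"> and <12, "hello hadoop">, since the second line starts at byte offset 12.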

   7.2 The split <key,value> pairs are handed to the user-defined map method, which processes them and produces new <key,value> pairs.
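
    For the sample input above, the map phase emits:

<hello,1> <world,1>
<hello,1> <hadoop,1>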

  7.3 Once the map output <key,value> pairs are produced, the mapper sorts them by key and, if a combiner is configured, runs the Combine step, which sums the values sharing a key to produce the mapper's final output.
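
    The driver above does not actually register a combiner. Because summing is associative, WcReducer can double as one; a minimal sketch of the extra line to add in WcDriver:

job.setCombinerClass(WcReducer.class);

    With the combiner enabled, the mapper's final output for the sample becomes <hadoop,1> <hello,2> <world,1>.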

  7.4 The reducer first sorts the data received from the mappers, then hands it to the user-defined reduce method, which produces new <key,value> pairs that form the final WordCount output.
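
    For the sample input, the final result written to /output is:

hadoop	1
hello	2
world	1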

Original post: https://www.cnblogs.com/wnwn/p/12599167.html