Word Count Example

I. Code

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new WordCount(), args);
        System.exit(exitCode);
    }

    // Reused across map() calls so a new object is not allocated for every record.
    private final static IntWritable one = new IntWritable(1);
    private static Text word = new Text();

    // Mapper (old org.apache.hadoop.mapred API): emits (word, 1) for every whitespace-separated token.
    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }

    }

    // Reducer: sums the counts emitted for each word by the mappers (and by the combiner).
    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }

    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s <input> <output>%n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        JobConf job = new JobConf(getConf());
        job.setJarByClass(getClass());
        job.setJobName("wordcount");
        
        job.setInputFormat(TextInputFormat.class);    // InputFormat implementation class for the job
        job.setOutputFormat(TextOutputFormat.class);
        
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setOutputKeyClass(Text.class);            // key class of the job's output
        job.setOutputValueClass(IntWritable.class);   // value class of the job's output

        job.setMapperClass(WordCount.Map.class);
        // Reduce also serves as the combiner: summing counts is associative and commutative,
        // so partial sums can safely be computed on the map side.
        job.setCombinerClass(WordCount.Reduce.class);
        job.setReducerClass(WordCount.Reduce.class);
        JobClient.runJob(job);
        return 0;
    }

}
II. Execution

1. Local execution
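
Before the job can be submitted, the class has to be compiled against the Hadoop client libraries and packaged into a jar. A minimal sketch, assuming a Hadoop 3.x installation with hadoop on the PATH (the source file and jar names here are illustrative):

javac -classpath "$(hadoop classpath)" WordCount.java
jar cf wordcount.jar WordCount*.class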

Point HADOOP_CONF_DIR at a configuration directory set up for local execution, then pass the input and output paths as the two positional arguments that run() expects:

export HADOOP_CONF_DIR=/root/soft/hdp312/localconf/hadoop
hadoop jar wordcount.jar WordCount testinput testoutput
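
A quick end-to-end check (the input file name and contents are illustrative; the old mapred API writes its result to part-00000):

mkdir testinput
echo "hello world hello hadoop" > testinput/file1.txt
hadoop jar wordcount.jar WordCount testinput testoutput
cat testoutput/part-00000    # hadoop 1, hello 2, world 1 (tab-separated)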

III. Streaming mode

-mapper

-reducer

Either option can be given a script/shell command or a Java class, as the examples below show.

A script mapper or reducer reads its records from standard input and writes its results to standard output.
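
For the Java-class case, one runnable sketch uses a class that ships with Hadoop, IdentityMapper, which simply passes each record through unchanged (the output directory name is illustrative):

hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.1.2.jar \
    -input testinput/ -output identityoutput/ \
    -mapper org.apache.hadoop.mapred.lib.IdentityMapper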

hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.1.2.jar \
    -input testinput/ -output testoutput/ \
    -mapper /bin/cat
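
The same word count can also be sketched with standard Unix tools as the mapper and reducer. This assumes tr and uniq are available on every task node: the mapper turns each run of spaces into a newline so it emits one word per line, the framework sorts the mapper output by key, and uniq -c then counts each run of identical words (printing the count before the word):

hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.1.2.jar \
    -input testinput/ -output streamoutput/ \
    -mapper "tr -s ' ' '\n'" \
    -reducer "uniq -c"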

Original article: https://www.cnblogs.com/justart/p/11631235.html