Word Count Example

I. Code

import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new WordCount(), args);
        System.exit(exitCode);
    }

    // Reused across map() calls so a new object is not allocated for every record.
    private final static IntWritable one = new IntWritable(1);
    private static Text word = new Text();

    // Mapper (old org.apache.hadoop.mapred API): emits (word, 1) for every whitespace-separated token.
    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {

        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            String line = value.toString();
            StringTokenizer tokenizer = new StringTokenizer(line);
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                output.collect(word, one);
            }
        }

    }

    // Reducer: sums the counts emitted for each word by the mappers (and by the combiner).
    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output,
                Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }

    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s <input> <output>%n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        JobConf job = new JobConf(getConf());
        job.setJarByClass(getClass());
        job.setJobName("wordcount");
        
        job.setInputFormat(TextInputFormat.class);    // InputFormat implementation class for the job
        job.setOutputFormat(TextOutputFormat.class);
        
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setOutputKeyClass(Text.class);            // key class of the job's output
        job.setOutputValueClass(IntWritable.class);   // value class of the job's output

        job.setMapperClass(WordCount.Map.class);
        // Reduce also serves as the combiner: summing counts is associative and commutative,
        // so partial sums can safely be computed on the map side.
        job.setCombinerClass(WordCount.Reduce.class);
        job.setReducerClass(WordCount.Reduce.class);
        JobClient.runJob(job);
        return 0;
    }

}
II. Execution

1. Local execution
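
Before the job can be submitted, the class has to be compiled against the Hadoop client libraries and packaged into a jar. A minimal sketch, assuming a Hadoop 3.x installation with hadoop on the PATH (the source file and jar names here are illustrative):

javac -classpath "$(hadoop classpath)" WordCount.java
jar cf wordcount.jar WordCount*.class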

Point HADOOP_CONF_DIR at a configuration directory set up for local execution, then pass the input and output paths as the two positional arguments that run() expects:

export HADOOP_CONF_DIR=/root/soft/hdp312/localconf/hadoop
hadoop jar wordcount.jar WordCount testinput testoutput
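
A quick end-to-end check (the input file name and contents are illustrative; the old mapred API writes its result to part-00000):

mkdir testinput
echo "hello world hello hadoop" > testinput/file1.txt
hadoop jar wordcount.jar WordCount testinput testoutput
cat testoutput/part-00000    # hadoop 1, hello 2, world 1 (tab-separated)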

III. Streaming mode

-mapper

-reducer

Either option can be given a script/shell command or a Java class, as the examples below show.

A script mapper or reducer reads its records from standard input and writes its results to standard output.
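
For the Java-class case, one runnable sketch uses a class that ships with Hadoop, IdentityMapper, which simply passes each record through unchanged (the output directory name is illustrative):

hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.1.2.jar \
    -input testinput/ -output identityoutput/ \
    -mapper org.apache.hadoop.mapred.lib.IdentityMapper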

hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.1.2.jar \
    -input testinput/ -output testoutput/ \
    -mapper /bin/cat
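
The same word count can also be sketched with standard Unix tools as the mapper and reducer. This assumes tr and uniq are available on every task node: the mapper turns each run of spaces into a newline so it emits one word per line, the framework sorts the mapper output by key, and uniq -c then counts each run of identical words (printing the count before the word):

hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.1.2.jar \
    -input testinput/ -output streamoutput/ \
    -mapper "tr -s ' ' '\n'" \
    -reducer "uniq -c"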

Original article: https://www.cnblogs.com/justart/p/11631235.html