Using Hadoop to count the occurrences of each word across multiple text files

Program source code

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCount {
    public static class WordCountMap extends
            Mapper<LongWritable, Text, Text, IntWritable> {
        private final IntWritable one = new IntWritable(1); // the value emitted for every word: 1
        private Text word = new Text(); // the key emitted: the word itself

        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException { // consumes the <k1,v1> pairs produced by TextInputFormat and emits <k2,v2>
            String line = value.toString(); // the current line of text
            StringTokenizer token = new StringTokenizer(line); // split the line into words on whitespace
            while (token.hasMoreTokens()) {
                word.set(token.nextToken()); // use the word as the key
                context.write(word, one); // emit the intermediate pair <word, 1> for the reducer
            }
        }
    }

    public static class WordCountReduce extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get(); // add up the 1s emitted for this word
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf);
        job.setJarByClass(WordCount.class);
        job.setJobName("wordcount");
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);
        job.setInputFormatClass(TextInputFormat.class); // turns the raw input into key/value pairs the mapper can consume
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
    }
}
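
To make the data flow concrete, here is a small worked illustration (my own example, not from the original post). Suppose one input line is "Hello Hadoop Hello"; TextInputFormat hands the mapper the byte offset as the key and the line as the value:

map input:      <0, "Hello Hadoop Hello">
map output:     <"Hello", 1>, <"Hadoop", 1>, <"Hello", 1>
after shuffle:  <"Hadoop", [1]>, <"Hello", [1, 1]>
reduce output:  <"Hadoop", 1>, <"Hello", 2>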

1 Compile the source code

javac -classpath /opt/hadoop-1.2.1/hadoop-core-1.2.1.jar:/opt/hadoop-1.2.1/lib/commons-cli-1.2.jar -d ./word_count_class/ WordCount.java
This compiles the source into .class files under the word_count_class directory in the current folder; note that javac does not create that directory for you.
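
Create it beforehand:

mkdir word_count_class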

2 Package the classes into a jar

Change into the word_count_class directory (the jar needs to contain the compiled .class files, and the run command in step 4 expects the jar there) and run:

jar -cvf wordcount.jar  *
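
To sanity-check the archive, you can list its contents; WordCount.class and its two inner classes should appear:

jar -tf wordcount.jar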

3 Upload the input files

First create a directory in HDFS to hold this job's input files:

hadoop fs -mkdir input_wordcount

Upload every text file under the local input directory to the input_wordcount directory in HDFS:

hadoop fs -put input/* input_wordcount/
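
You can confirm the upload with:

hadoop fs -ls input_wordcount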

Note: do not create the output directory before running the job; Hadoop creates it itself and the job fails if it already exists.
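
If a stale output directory is left over from an earlier attempt, remove it first (Hadoop 1.x shell syntax):

hadoop fs -rmr output_wordcount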

4 Submit the jar and run the job

hadoop jar word_count_class/wordcount.jar input_wordcount output_wordcount

5 Inspect the results

List the job's output directory:

 hadoop fs -ls output_wordcount

Print the job's output:

hadoop fs -cat output_wordcount/part-r-00000
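
Each output line is a word and its count separated by a tab (TextOutputFormat's default key/value separator); for the hypothetical line used in the illustration above, part-r-00000 would read:

Hadoop	1
Hello	2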



Version 2: the program from my own hands-on run (written against the old org.apache.hadoop.mapred API)

Map program

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountMap extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1); // +1 for every word occurrence
    private Text word = new Text();

    @Override
    public void map(LongWritable longWritable, Text text, OutputCollector<Text, IntWritable> outputCollector, Reporter reporter) throws IOException {
        String line = text.toString();
        StringTokenizer tokenizer = new StringTokenizer(line); // split the line into words
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            outputCollector.collect(word, one);
        }
    }
}

Reduce program

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

import java.io.IOException;
import java.util.Iterator;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountReduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text text, Iterator<IntWritable> iterator, OutputCollector<Text, IntWritable> outputCollector, Reporter reporter) throws IOException {
        int sum = 0;
        while (iterator.hasNext()) {
            sum += iterator.next().get(); // add up the 1s emitted for this word
        }
        outputCollector.collect(text, new IntWritable(sum));
    }
}


Main function

package com.zln.chapter03;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

import java.io.IOException;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCount {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(WordCount.class);
        conf.setJobName("wordCount");

        // set the output key/value classes
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // set the Mapper and Reducer classes
        conf.setMapperClass(WordCountMap.class);
        conf.setReducerClass(WordCountReduce.class);

        // set the class that parses the input
        conf.setInputFormat(TextInputFormat.class);
        // set the class that writes the output
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}
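
One optional tweak that is not in the original post: because the reduce step is a plain sum, the same WordCountReduce class can double as a combiner, pre-aggregating counts on the map side to shrink the shuffle. In the old API that is a single extra line in the driver:

conf.setCombinerClass(WordCountReduce.class);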

Prepare the input files

file1

Hello Word By Word
Hello Word By zln

file2

Hello Hadoop
Hello GoodBye

Put both files in the same directory: /home/sherry/IdeaProjects/Hadoop/WordCount/输入文件准备

Compile the classes and package them into a jar

I build with IDEA; when defining the jar artifact, don't forget to specify the main class.

Upload the input files

root@sherry:/opt/hadoop-1.2.1# hadoop fs -mkdir /user/root/zln/WordCount/InputFiles
root@sherry:/opt/hadoop-1.2.1# hadoop fs -put /home/sherry/IdeaProjects/Hadoop/WordCount/输入文件准备/* /user/root/zln/WordCount/InputFiles

Submit the jar and run the job

root@sherry:/opt/hadoop-1.2.1# hadoop jar /home/sherry/IdeaProjects/Hadoop/out/artifacts/WordCount_jar/WordCount.jar /user/root/zln/WordCount/InputFiles /user/root/zln/WordCount/OutputFiles

View the results

root@sherry:/opt/hadoop-1.2.1# hadoop fs -ls /user/root/zln/WordCount/OutputFiles
root@sherry:/opt/hadoop-1.2.1# hadoop fs -text /user/root/zln/WordCount/OutputFiles/part-00000
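
Given the two input files above, part-00000 should contain (one tab-separated word and count per line, sorted by key):

By	2
GoodBye	1
Hadoop	1
Hello	4
Word	3
zln	1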


Version 3: rewriting Map, Reduce, and the main function with the new API

Map

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountMap extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1); // +1 for every word occurrence
    private Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        StringTokenizer tokenizer = new StringTokenizer(line); // split the line into words
        while (tokenizer.hasMoreTokens()) {
            word.set(tokenizer.nextToken());
            context.write(word, one);
        }
    }
}

Reduce

package com.zln.chapter03;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable intWritable : values) {
            sum += intWritable.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

Main

package com.zln.chapter03;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Created by sherry on 15-7-12.
 */
public class WordCount extends Configured implements Tool {

    public int run(String[] args) throws Exception {
        Job job = new Job(getConf());
        job.setJarByClass(WordCount.class);
        job.setJobName("WordCount");

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean success = job.waitForCompletion(true);
        return success ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int ret = ToolRunner.run(new WordCount(), args);
        System.exit(ret);
    }
}
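
A side benefit of going through ToolRunner, worth noting here: it parses Hadoop's generic options before run() sees the remaining arguments, so job settings can be overridden on the command line without recompiling, for example (Hadoop 1.x property name):

root@sherry:/opt/hadoop-1.2.1# hadoop jar /home/sherry/IdeaProjects/Hadoop/out/artifacts/WordCount_jar/WordCount.jar -D mapred.reduce.tasks=2 /user/root/zln/WordCount/InputFiles /user/root/zln/WordCount/OutputFiles
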
Original post: https://www.cnblogs.com/sherrykid/p/4604717.html