Several MapReduce Implementations

1. Inverted index

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class InvertedIndex {

    public static class InvertedIndexMap extends Mapper<Object, Text, Text, Text> {

        private Text valueInfo = new Text();
        private Text keyInfo = new Text();
        private FileSplit split;

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Get the FileSplit this <key, value> pair belongs to
            split = (FileSplit) context.getInputSplit();
            StringTokenizer stk = new StringTokenizer(value.toString());
            while (stk.hasMoreElements()) {
                // The key is (word:URI)
                keyInfo.set(stk.nextToken() + ":" + split.getPath().toString());
                // Term frequency
                valueInfo.set("1");
                context.write(keyInfo, valueInfo);
            }
        }
    }

    public static class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {

        private Text info = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (Text value : values) {
                sum += Integer.parseInt(value.toString());
            }
            int splitIndex = key.toString().indexOf(":");
            // Reset the value to (URI:term frequency)
            info.set(key.toString().substring(splitIndex + 1) + ":" + sum);
            // Reset the key to the word alone
            key.set(key.toString().substring(0, splitIndex));
            context.write(key, info);
        }
    }

    public static class InvertedIndexReduce extends Reducer<Text, Text, Text, Text> {

        private Text result = new Text();

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Build the document list for this word
            StringBuilder fileList = new StringBuilder();
            for (Text value : values) {
                fileList.append(value.toString()).append(";");
            }
            result.set(fileList.toString());
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "InvertedIndex");

        job.setJarByClass(InvertedIndex.class);

        job.setMapperClass(InvertedIndexMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setCombinerClass(InvertedIndexCombiner.class);

        job.setReducerClass(InvertedIndexReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path("./in/invertedindex/"));
        FileOutputFormat.setOutputPath(job, new Path("./out/"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
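
To make the key/value reshaping concrete, here is an illustrative trace for a hypothetical corpus of two files a.txt and b.txt in which the word "hadoop" appears twice in a.txt and once in b.txt (the paths are placeholders, not real output of the code above):

    map output:      (hadoop:hdfs://.../a.txt, 1), (hadoop:hdfs://.../a.txt, 1), (hadoop:hdfs://.../b.txt, 1)
    combiner output: (hadoop, hdfs://.../a.txt:2), (hadoop, hdfs://.../b.txt:1)
    reducer output:  (hadoop, hdfs://.../a.txt:2;hdfs://.../b.txt:1;)

The combiner does the key rewriting: it sums the per-document term frequency while the key still carries the file URI, then strips the URI so the reducer groups records by word alone. Note that the partition is computed on the original word:URI key before the combiner runs, so with the default partitioner this pattern only gathers all postings of a word in one place when the job runs a single reducer.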

2. Word count

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {
    public static class WordMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable one = new IntWritable(1);
        private Text word = new Text();
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            // Emit (word, 1) for every whitespace-separated token in the line
            String[] words = value.toString().split(" ");
            for (String w : words) {
                word.set(w);
                context.write(word, one);
            }
        }
    }
    public static class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private static IntWritable ans = new IntWritable();
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable count : values) {
                sum += count.get();
            }
            ans.set(sum);
            context.write(key, ans);
        }
    }
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordMapper.class);
        job.setCombinerClass(WordReducer.class);
        job.setReducerClass(WordReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
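
For a hypothetical input containing just the two lines "hello world" and "hello hadoop", the job would produce output along these lines (key and count separated by a tab):

    hadoop  1
    hello   2
    world   1

WordReducer can double as the combiner only because its input and output value types are both IntWritable and summation is associative, so partial sums computed on the map side still add up to the same final counts; a reducer that, say, averaged its values could not be reused this way.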

3. Finding the most frequently accessed table

import java.io.IOException;
import java.util.TreeMap;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Use Hadoop to analyze large log files. Each log line records the fields:
 * TableName, Time, User, TimeSpan (time spent).
 * Write a MapReduce program that finds which table is accessed most frequently
 * during the peak period (e.g. 9-10 o'clock), as well as the user who accessed
 * that table most during that period and that user's total time spent on it.
 * @author drguo
 *
 * t003 6:00 u002 180
 * t003 7:00 u002 180
 * t003 7:08 u002 180
 * t003 7:25 u002 180
 * t002 8:00 u002 180
 * t001 8:00 u001 240
 * t001 9:00 u002 300
 * t001 9:11 u001 240
 * t003 9:26 u001 180
 * t001 9:39 u001 300
 *
 * Step one: find the table with the most accesses between 9 and 10 o'clock.
 */
//club.drguo.xx.mapreduce.tablecount.TableCount
public class TableCount {
    public static class TableCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private Text k = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] strings = StringUtils.split(line, " ");
            String tabName = strings[0];
            String time = strings[1];
            String[] times = time.split(":");
            int hour = Integer.parseInt(times[0]);
            k.set(tabName);
            // Only count accesses that fall in the 9-10 o'clock peak period
            if (hour == 9) {
                context.write(k, new LongWritable(1));
            }
        }
    }
    public static class TableCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        // Collect per-table counts; assumes a single reducer so all tables end up in one map
        private TreeMap<Text, Long> map = new TreeMap<Text, Long>();
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            // Copy the key; don't write "Text tabName = key" because the framework reuses the key object
            Text tabName = new Text(key.toString());
            long count = 0;
            for (LongWritable value : values) {
                count += value.get();
            }
            map.put(tabName, count);
        }
        @Override
        protected void cleanup(Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // Emit only the table with the highest count
            Text tableName = null;
            long maxCount = 0L;
            for (Text key : map.keySet()) {
                if (map.get(key) > maxCount) {
                    maxCount = map.get(key);
                    tableName = key;
                }
            }
            context.write(tableName, new LongWritable(maxCount));
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "tablejob");
        job.setJarByClass(TableCount.class);

        job.setMapperClass(TableCountMapper.class);
        job.setReducerClass(TableCountReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, "hdfs://localhost:9000/log");
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/tablecount"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
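
The problem statement above also asks for the user who accessed the winning table most during the peak hour and that user's total TimeSpan, which the job above does not compute. Below is a minimal sketch, not part of the original post, of a follow-up job that could cover that step. It assumes the winning table name is passed in through a made-up configuration property ("top.table"), e.g. taken from the output of the first job, and that the log lines have the four space-separated fields described above.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Step two (sketch): for the table found by TableCount, find the user who accessed it
// most between 9 and 10 o'clock, and that user's total TimeSpan on the table.
public class TopUserForTable {
    public static class TopUserMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private String topTable;
        private Text user = new Text();

        @Override
        protected void setup(Context context) {
            // "top.table" is a hypothetical property; set it to the first job's result
            topTable = context.getConfiguration().get("top.table");
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each line: TableName Time User TimeSpan
            String[] fields = value.toString().split(" ");
            int hour = Integer.parseInt(fields[1].split(":")[0]);
            if (hour == 9 && fields[0].equals(topTable)) {
                user.set(fields[2]);
                // The value carries the TimeSpan of this single access
                context.write(user, new LongWritable(Long.parseLong(fields[3])));
            }
        }
    }

    public static class TopUserReducer extends Reducer<Text, LongWritable, Text, Text> {
        private String topUser = null;
        private long topAccesses = 0;
        private long topTime = 0;

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) {
            long accesses = 0;
            long totalTime = 0;
            for (LongWritable v : values) {
                accesses++;
                totalTime += v.get();
            }
            // Keep only the user with the most accesses (assumes a single reducer)
            if (accesses > topAccesses) {
                topAccesses = accesses;
                topTime = totalTime;
                topUser = key.toString();
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            if (topUser != null) {
                context.write(new Text(topUser), new Text(topAccesses + " accesses, total time " + topTime));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("top.table", args[0]); // e.g. "t001", the winner from the first job
        Job job = Job.getInstance(conf, "topuserjob");
        job.setJarByClass(TopUserForTable.class);
        job.setMapperClass(TopUserMapper.class);
        job.setReducerClass(TopUserReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setNumReduceTasks(1); // the in-memory maximum only works with one reducer
        FileInputFormat.setInputPaths(job, args[1]);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

With the sample log above and "t001" as the winning table, this sketch would report u001 with 2 accesses and a total TimeSpan of 540.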
Original post: https://www.cnblogs.com/fisherinbox/p/6702664.html