hadoop map端join

   map端的联结比reduce端的联结实现起来更复杂,限制也更多。一般我们将小表加载到内存中,对于大表的每一条记录,只需在内存中查找与之匹配的小表记录即可。

   该例子摘自《Hadoop基础教程》。我们实现sales和accounts的联结,其中sales记录的是顾客的销售信息,accounts记录的是用户的账户信息,我们的目的是统计每个用户消费的次数和消费总额。

  数据如下:

  sales.txt

  

002 12.29   2004-07-02
004 13.42   2005-12-20
003 499.99  2010-12-20
001 78.95   2012-04-02
002 21.99   2006-11-30
002 93.45   2008-09-10
001 9.99    2012-05-17

  accounts.txt

002	Abigail Smith	Premium	2004-07-13
003	April Stevens	Standard	2010-12-20
004	Nasser Hafez	Premium	2001-04-23

代码如下:  

import java.io.*;
import java.util.*;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MapJoin {
	public static class MapJoinMapper extends Mapper<Object, Text, Text, Text> {
		public Map<String, String> joinData = new HashMap();
		//执行连接操作
		public void map(Object key, Text value, Context context) throws IOException, InterruptedException{
			String[] values = value.toString().split("	");
			context.write(new Text(joinData.get(values[0])), value);
		}
		//加载小表
		public void setup(Context context) throws IOException, InterruptedException{
			Path[] path = DistributedCache.getLocalCacheFiles(context.getConfiguration());
			BufferedReader reader = new BufferedReader(new FileReader(path[0].toString()));
			String str = null;
			while((str = reader.readLine()) != null) {
				String[] s = str.split("	");
				joinData.put(s[0], s[1]);
			}	
		}
	}
	
	public static class MapJoinReducer extends Reducer<Text, Text, Text, Text> {
		public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException{
			int ci = 0;
			double total = 0.0;
			for(Text val : values) {
				ci ++;
				String[] v = val.toString().split("	");
				total += Float.parseFloat(v[1]);
			}
			String str = String.format("%d	%f", ci, total);
			context.write(key, new Text(str));
		}
	}	

	public static void main(String[] args) throws Exception{
		Configuration conf = new Configuration();
		DistributedCache.addCacheFile(new Path(args[1]).toUri(), conf);
		
		Job job = new Job(conf, "MapJoin");
		//设置相关类
		job.setJarByClass(MapJoin.class);
		job.setMapperClass(MapJoinMapper.class);
		job.setReducerClass(MapJoinReducer.class);

		//设置map输出格式
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		
		//设置输入输出文件
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[2]));

		//等待作业执行完毕
		System.exit(job.waitForCompletion(true)?0:1);
	}
}

  

原文地址:https://www.cnblogs.com/xingxing1024/p/7466262.html