Map Join Example

一、Requirements Analysis

1. Requirement

Same as the Reduce join example: for each order record (id, pid, amount), look up the product name (pname) by pid and output (id, pname, amount).
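The original post does not repeat the input data; a minimal sample, reconstructed from the field comments in the code below (both files are tab-separated):

pd.txt (product table: pid, pname):

01	小米

order.txt (order table: id, pid, amount):

1001	01	1

Expected output (id, pname, amount):

1001	小米	1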

2. Analysis

a. In the Mapper's setup(), load the cached product file into a key-value map (pid → pname).

b. In map(), look up the pname for each record's pid in that map.

c. In the Driver, set the number of reduce tasks to 0 and register the product file as a cache file.

二、Code

1. Driver

package com.wt.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class TableJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        // 0: Adjust these paths for your own machine (backslashes must be escaped in Java string literals)
        args = new String[]{"E:\\a\\input1\\order.txt", "E:\\a\\output2"};
        // 1: Get the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // 2: Set the jar by the driver class
        job.setJarByClass(TableJoinDriver.class);
        // 3: Associate the Mapper
        job.setMapperClass(TableJoinMapper.class);
        // 4: Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 5: Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 6: Add the product table (pd.txt) to the distributed cache
        job.addCacheFile(new URI("file:///E:/a/inputmap/pd.txt"));
        // 7: A map-side join needs no reduce phase, so set the number of reduce tasks to 0
        job.setNumReduceTasks(0);
        // 8: Submit the job and exit with its status
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
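Note that the file:/// cache URI above only works when the job runs locally. On a real cluster, the product file would first be uploaded to HDFS and the cache URI pointed there; a minimal sketch, assuming a hypothetical NameNode address and path:

        // Hypothetical HDFS location; upload the file first, e.g. hadoop fs -put pd.txt /cache/pd.txt
        job.addCacheFile(new URI("hdfs://namenode:8020/cache/pd.txt"));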

2. Mapper

package com.wt.mapjoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class TableJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    Map<String, String> pdMap = new HashMap<String, String>();
    Text k = new Text();
    String line;
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        // 1: Open the cached product file
        URI[] cacheFiles = context.getCacheFiles();
        String path = cacheFiles[0].getPath();
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
        while ((line = reader.readLine()) != null) {
            // 2: Split each line on tabs
            String[] fields = line.split("\t");
            // 3: Cache pid -> pname into the map
            pdMap.put(fields[0], fields[1]);
        }
        reader.close();
//        pid    pname
//        01    小米
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split("\t");
//        id    pid    amount
//        1001    01    1
//  target: id    pname    amount
        String id = fields[0];
        String pId = fields[1];
        String amount = fields[2];
        String pName = pdMap.get(pId);
        String newLine = id + "\t" + pName + "\t" + amount;
        k.set(newLine);
        context.write(k, NullWritable.get());
    }
}
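One caveat: pdMap.get(pId) returns null for any pid missing from pd.txt, so the joined line would contain the literal string "null". A defensive variant (a sketch, not part of the original post) substitutes a placeholder instead:

        // Fall back to "NULL" when the product table has no entry for this pid
        String pName = pdMap.getOrDefault(pId, "NULL");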
Original post: https://www.cnblogs.com/wt7018/p/13643360.html