Reducejoin sample

The sample files are the same as those used in the earlier "sample join analysis" post.

The previous example performed the join on the map side; this time we do it on the reduce side.

Write a separate mapper for each kind of source so that each one handles its own file. Every mapper emits studentno as the key; the value is the rest of the record plus a tag identifying which source it came from.

Then use MultipleInputs to register a different mapper for each input path, as sketched below.
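For reference, here is a minimal sketch of that MultipleInputs approach (it is not the code used in this post). StudentFileMapper, ScoreFileMapper and the studentPath/scorePath variables are illustrative names only, and the sketch assumes a Hadoop version that ships the new-API org.apache.hadoop.mapreduce.lib.input.MultipleInputs:

// Hypothetical mapper for the student file; a ScoreFileMapper would be symmetric,
// tagging its values with "score" instead of "student".
public static class StudentFileMapper extends Mapper<LongWritable, Text, Text, Text> {
    public void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        String s = line.toString();
        int comma = s.indexOf(",");
        // key = studentno, value = tag + the rest of the record
        context.write(new Text(s.substring(0, comma)), new Text("student" + s.substring(comma)));
    }
}

// In the driver, register each mapper against its own input path instead of
// calling FileInputFormat.addInputPath once for everything:
MultipleInputs.addInputPath(job, new Path(studentPath), TextInputFormat.class, StudentFileMapper.class);
MultipleInputs.addInputPath(job, new Path(scorePath), TextInputFormat.class, ScoreFileMapper.class);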

On the reduce side, the student record and the score records that share the same studentno are delivered to the same reduce call, and the values carry both kinds of information along with the tag,

so we simply pull the two groups apart and take their Cartesian product.

In the sample code below, however, I did not use MultipleInputs. Instead I modified parts of TextInputFormat so that it returns the file name together with the current line.

In the mapper I use the file name to tell the two files apart, tag each record with its source, and send it on.

There is still plenty of room for optimization in the code below; I will update it later.

package myexamples;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.LineReader;

public class reducejoin {

    // Custom input format: the key is the file name and the value is the raw line,
    // so a single mapper can tell which source file each record came from.
    public static class MyTextInputFormat extends FileInputFormat<Text, Text> {

        @Override
        public MyLineRecordReader createRecordReader(InputSplit split,
                TaskAttemptContext context) {
            return new MyLineRecordReader();
        }

        @Override
        protected boolean isSplitable(JobContext context, Path file) {
            CompressionCodec codec = new CompressionCodecFactory(
                    context.getConfiguration()).getCodec(file);
            return codec == null;
        }

    }

    // Mostly a copy of Hadoop's LineRecordReader; the difference is that the key
    // is the file name instead of the byte offset of the line.
    public static class MyLineRecordReader extends RecordReader<Text, Text> {
        private static final Log LOG = LogFactory
                .getLog(LineRecordReader.class);

        private CompressionCodecFactory compressionCodecs = null;
        private long start;
        private long pos;
        private long end;
        private LineReader in;
        private int maxLineLength;
        private Text key = null;
        private Text value = null;

        Text filename = null;

        public void initialize(InputSplit genericSplit,
                TaskAttemptContext context) throws IOException {
            FileSplit split = (FileSplit) genericSplit;
            Configuration job = context.getConfiguration();
            this.maxLineLength = job.getInt(
                    "mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
            start = split.getStart();
            end = start + split.getLength();
            final Path file = split.getPath();
            // use the file name, not the byte offset, as the key of every record
            filename = new Text(file.getName());
            key = filename;
            compressionCodecs = new CompressionCodecFactory(job);
            final CompressionCodec codec = compressionCodecs.getCodec(file);

            // open the file and seek to the start of the split
            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream fileIn = fs.open(split.getPath());
            boolean skipFirstLine = false;
            if (codec != null) {
                in = new LineReader(codec.createInputStream(fileIn), job);
                end = Long.MAX_VALUE;
            } else {
                if (start != 0) {
                    skipFirstLine = true;
                    --start;
                    fileIn.seek(start);
                }
                in = new LineReader(fileIn, job);
            }
            if (skipFirstLine) { // skip first line and re-establish "start".
                start += in.readLine(new Text(), 0,
                        (int) Math.min((long) Integer.MAX_VALUE, end - start));
            }
            this.pos = start;
        }

        public boolean nextKeyValue() throws IOException {
            if (key == null) {
                key = filename;
            }

            if (value == null) {
                value = new Text();
            }
            int newSize = 0;
            while (pos < end) {
                newSize = in.readLine(value, maxLineLength, Math.max(
                        (int) Math.min(Integer.MAX_VALUE, end - pos),
                        maxLineLength));
                if (newSize == 0) {
                    break;
                }
                pos += newSize;
                if (newSize < maxLineLength) {
                    break;
                }

                // line too long. try again
                LOG.info("Skipped line of size " + newSize + " at pos "
                        + (pos - newSize));
            }
            if (newSize == 0) {
                key = null;
                value = null;
                return false;
            } else {
                return true;
            }
        }

        @Override
        public Text getCurrentKey() {
            return key;
        }

        @Override
        public Text getCurrentValue() {
            return value;
        }

        /**
         * Get the progress within the split
         */
        public float getProgress() {
            if (start == end) {
                return 0.0f;
            } else {
                return Math.min(1.0f, (pos - start) / (float) (end - start));
            }
        }

        public synchronized void close() throws IOException {
            if (in != null) {
                in.close();
            }
        }
    }

    // Single mapper for both files: uses the file name (the key supplied by
    // MyTextInputFormat) to tag each record as "student" or "score", and emits
    // studentno as the output key.
    public static class studentMapper extends Mapper<Text, Text, Text, Text> {
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            Text newvalue = null;
            String strv = value.toString().substring(
                    value.toString().indexOf(","));
            if (key.toString().contains("student")) // student file
                newvalue = new Text("student" + strv);
            else
                newvalue = new Text("score" + strv);
            Text newkey = new Text(value.toString().substring(0,
                    value.toString().indexOf(",")));
            context.write(newkey, newvalue);
        }
    }

    // Reducer: splits the tagged values back into student records and score
    // records, then writes their Cartesian product.
    public static class studentReducer extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            List<String> students = new ArrayList<String>();
            List<String> scores = new ArrayList<String>();
            for (Text value : values)
                if (value.toString().startsWith("student"))
                    students.add(value.toString().substring(8));
                else
                    scores.add(value.toString().substring(6));
            // split real results
            for (String student : students)
                for (String score : scores)
                    context.write(key, new Text(student + "," + score));
        }
    }

    public static void main(String[] args) throws Exception {
        args = "hdfs://namenode:9000/user/hadoop/student/ hdfs://namenode:9000/user/hadoop/reducejoinout"
                .split(" ");

        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        // remove the output directory if it already exists (helper from my own myUtils class)
        myUtils.myUtils.DeleteFolder(conf, otherArgs[1]);
        conf.set("io.sort.mb", "10");
        Job job = new Job(conf, "reduce join");
        job.setInputFormatClass(MyTextInputFormat.class);
        // job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setJarByClass(reducejoin.class);
        job.setMapperClass(studentMapper.class);
        job.setReducerClass(studentReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
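For reference, the mapper's indexOf(",") logic expects comma-separated lines that start with the student number, and the file names must let the contains("student") check tell the two sources apart. With hypothetical files student.txt and score.txt like the following under the input directory, the job would produce output of roughly this shape (the data is illustrative only; the real files are the ones from the earlier post):

student.txt:
2012001,Tom
2012002,Jerry

score.txt:
2012001,math,90
2012001,english,80
2012002,math,85

part-r-00000 (key and value separated by a tab):
2012001    Tom,math,90
2012001    Tom,english,80
2012002    Jerry,math,85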
Original post: https://www.cnblogs.com/huaxiaoyao/p/4305895.html