Hadoop: Implementing Multiple File Output

For example, suppose the input file word.txt contains:

aaa bbb aba abc

bba bbd bbbc

cc ccd cce

The requirement is to separate the words by their first letter and write each group to its own output file; the expected result is sketched below.
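With the "," key/value separator chosen in the code below, the output of the word-count job should end up split roughly as follows (a sketch of the expected result, not copied from an actual run):

a.txt:
aaa,1
aba,1
abc,1

b.txt:
bba,1
bbb,1
bbbc,1
bbd,1

c.txt:
cc,1
ccd,1
cce,1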

The code is as follows:

LineRecordWriter

package com.hadoop.multi;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class LineRecordWriter<K, V> extends RecordWriter<K, V> {

	private static final String utf8 = "UTF-8";

	private static final byte[] newline;

	static {
		try {
			newline = "\n".getBytes(utf8);
		} catch (UnsupportedEncodingException uee) {
			throw new IllegalArgumentException("can't find " + utf8
					+ " encoding");
		}
	}

	protected DataOutputStream out;
	private final byte[] keyValueSeparator;

	public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
		this.out = out;
		try {
			this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
		} catch (UnsupportedEncodingException uee) {
			throw new IllegalArgumentException("can't find " + utf8
					+ " encoding");
		}
	}

	public LineRecordWriter(DataOutputStream out) {
		this(out, "\t");
	}

	private void writeObject(Object o) throws IOException {
		if (o instanceof Text) {
			Text to = (Text) o;
			out.write(to.getBytes(), 0, to.getLength());
		} else {
			out.write(o.toString().getBytes(utf8));
		}
	}

	// Write key, separator, value and a trailing newline, skipping null/NullWritable parts.
	public synchronized void write(K key, V value) throws IOException {
		boolean nullKey = key == null || key instanceof NullWritable;
		boolean nullValue = value == null || value instanceof NullWritable;
		if (nullKey && nullValue) {
			return;
		}
		if (!nullKey) {
			writeObject(key);
		}
		if (!(nullKey || nullValue)) {
			out.write(keyValueSeparator);
		}
		if (!nullValue) {
			writeObject(value);
		}
		out.write(newline);
	}

	public synchronized void close(TaskAttemptContext context)
			throws IOException {
		out.close();
	}

}
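LineRecordWriter above is essentially the line writer used by Hadoop's TextOutputFormat: it writes the key, a separator, the value, and a newline. As a quick illustration, here is a minimal standalone sketch (not part of the original job; the local file name demo.txt is arbitrary) of the format it produces:

package com.hadoop.multi;

import java.io.DataOutputStream;
import java.io.FileOutputStream;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class LineRecordWriterDemo {
	public static void main(String[] args) throws Exception {
		DataOutputStream out = new DataOutputStream(new FileOutputStream("demo.txt"));
		// "," matches the separator chosen in getBaseRecordWriter below.
		LineRecordWriter<Text, IntWritable> writer =
				new LineRecordWriter<Text, IntWritable>(out, ",");
		writer.write(new Text("aaa"), new IntWritable(1)); // writes the line "aaa,1"
		writer.close(null); // the context is unused, so null is fine here
	}
}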


MultipleOutputFormat

package com.hadoop.multi;
   
import java.io.DataOutputStream;   
import java.io.IOException;   
import java.util.HashMap;   
import java.util.Iterator;   
import org.apache.hadoop.conf.Configuration;   
import org.apache.hadoop.fs.FSDataOutputStream;   
import org.apache.hadoop.fs.Path;   
import org.apache.hadoop.io.Writable;   
import org.apache.hadoop.io.WritableComparable;   
import org.apache.hadoop.io.compress.CompressionCodec;   
import org.apache.hadoop.io.compress.GzipCodec;   
import org.apache.hadoop.mapreduce.OutputCommitter;   
import org.apache.hadoop.mapreduce.RecordWriter;   
import org.apache.hadoop.mapreduce.TaskAttemptContext;   
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;   
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;   
import org.apache.hadoop.util.ReflectionUtils; 

public abstract class MultipleOutputFormat<K extends WritableComparable<?>, V extends Writable>
        extends FileOutputFormat<K, V> {
	
	private MultiRecordWriter writer = null; 
	
	public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException,   
		   InterruptedException {   
		if (writer == null) {   
		    writer = new MultiRecordWriter(job, getTaskOutputPath(job));   
		}   
		return writer;   
	}
	
	private Path getTaskOutputPath(TaskAttemptContext conf) throws IOException {   
        Path workPath = null;   
        OutputCommitter committer = super.getOutputCommitter(conf);   
        if (committer instanceof FileOutputCommitter) {   
            workPath = ((FileOutputCommitter) committer).getWorkPath();   
        } else {   
            Path outputPath = super.getOutputPath(conf);   
            if (outputPath == null) {   
                throw new IOException("Undefined job output-path");   
            }   
            workPath = outputPath;   
        }   
        return workPath;   
    } 
	
	// Implemented by subclasses: choose the output file name for a given key/value pair.
	protected abstract String generateFileNameForKeyValue(K key, V value, Configuration conf);

	// A RecordWriter that lazily creates and caches one LineRecordWriter per output file name.
	public class MultiRecordWriter extends RecordWriter<K, V> {
        
        private HashMap<String, RecordWriter<K, V>> recordWriters = null;   
        private TaskAttemptContext job = null;   
           
        private Path workPath = null;   
        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {   
            super();   
            this.job = job;   
            this.workPath = workPath;   
            recordWriters = new HashMap<String, RecordWriter<K, V>>();   
        }   
        @Override   
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {   
            Iterator<RecordWriter<K, V>> values = this.recordWriters.values().iterator();   
            while (values.hasNext()) {   
                values.next().close(context);   
            }   
            this.recordWriters.clear();   
        }   
        @Override   
        public void write(K key, V value) throws IOException, InterruptedException {   
            // Look up the output file name for this key/value pair.
            String baseName = generateFileNameForKeyValue(key, value, job.getConfiguration());   
            RecordWriter<K, V> rw = this.recordWriters.get(baseName);   
            if (rw == null) {   
                rw = getBaseRecordWriter(job, baseName);   
                this.recordWriters.put(baseName, rw);   
            }   
            rw.write(key, value);   
        }   
          
        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName)   
                throws IOException, InterruptedException {   
            Configuration conf = job.getConfiguration();   
            boolean isCompressed = getCompressOutput(job);   
            String keyValueSeparator = ",";   
            RecordWriter<K, V> recordWriter = null;   
            if (isCompressed) {   
                Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job,   
                        GzipCodec.class);   
                CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);   
                Path file = new Path(workPath, baseName + codec.getDefaultExtension());   
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);   
                recordWriter = new LineRecordWriter<K, V>(new DataOutputStream(codec   
                        .createOutputStream(fileOut)), keyValueSeparator);   
            } else {   
                Path file = new Path(workPath, baseName);   
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);   
                recordWriter = new LineRecordWriter<K, V>(fileOut, keyValueSeparator);   
            }   
            return recordWriter;   
        }   
    }   
	
}


MultiFileOutPut

package com.hadoop.multi;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MultiFileOutPut {

  public static class TokenizerMapper 
       extends Mapper<Object, Text, Text, IntWritable>{
    
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();
      
    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }
  
  public static class IntSumReducer 
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values, 
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }
  
  public static class AlphabetOutputFormat extends MultipleOutputFormat<Text, IntWritable> {   
      @Override   
      protected String generateFileNameForKeyValue(Text key, IntWritable value, Configuration conf) {   
          char c = key.toString().toLowerCase().charAt(0);   
          if (c >= 'a' && c <= 'z') {   
              return c + ".txt";   
          }   
          return "other.txt";   
      }   
  }  

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: MultiFileOutPut <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(MultiFileOutPut.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(AlphabetOutputFormat.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
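To run the job, package the three classes into a jar and submit it in the usual way (the jar name and HDFS paths here are placeholders):

hadoop jar multi-output.jar com.hadoop.multi.MultiFileOutPut /user/hadoop/word.txt /user/hadoop/multi-out

After the job completes, the output directory should contain one file per starting letter (a.txt, b.txt, c.txt, and other.txt for words that do not start with a letter) instead of the single part-r-00000 file produced by the default TextOutputFormat.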


Original article (in Chinese): https://www.cnblogs.com/jiangu66/p/3187108.html