MapReduce如何实现自己的InputFormat

在mapreduce程序运行的开始阶段，hadoop需要将待处理的输入文件进行分割，按预定义的格式对文件读取等操作，这些操作都在InputFormat中进行。主要工作有以下3个：

1. Validate the input-specification of the job.

2. Split-up the input file(s) into logical InputSplits, each of which is then assigned to an individual Mapper.

3. Provide the RecordReader implementation to be used to glean input records from the logical InputSplit for processing by the Mapper.

InputFormat是一个抽象类，它含有getSplits()和createRecordReader()两个抽象方法，在子类中必须被实现，这两个就是InputFormat的基本方法。getSplits()确定输入对象的切分原则，而createRecordReader()则可以按一定格式读取相应数据。通常情况下，我们不是直接继承InputFormat类，而是继承FileInputFormat类，这个类提供了很多对文件操作的方法，其中比较常用的就是isSplitable()方法，该方法决定该文件是否进行分片操作。另外还有createRecordReader()方法，该方法为文件的分片定制一个RecordReader，可以根据自己的需求来进行定制，只需要重写该方法即可。

下面我就以http://developer.yahoo.com/hadoop/tutorial/module5.html#types中的例子来实现自己的MyInputFormat，根据自己的需求定制自己的InputFormat。

比如数据格式如下：

ball  3.5,12.7,9.0
car   15,23.76,42.23
device 0.0,12.4,-67.1

下面我们以这样的一种形式读取数据，分割每一行数据，前面的比如ball作为key，后面的3个浮点数读入到Point3D对象中，那么该如何实现呢？以下是我在学习过程中的实现。

首先是对Point3D数据类型的定制。自定义数据类型为了能够在网络中以流的形式进行传输，必须实现Writable接口；同时在mapreduce编程的过程中，需要根据key来对数据进行排序与分区，所以还必须实现Comparable接口。因此这里实现了一个更加高级的接口WritableComparable，可以同时满足上面两个要求。

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;


/**
 * A point in three-dimensional space that Hadoop can serialize and compare.
 *
 * <p>The point is written to and read from a stream as three consecutive
 * {@code float} values in the order x, y, z.
 *
 * <p>Ordering ({@link #compareTo(Point3D)}) is by Euclidean distance from the
 * origin only, so two different points at the same distance compare as 0.
 * This ordering is therefore <em>not</em> consistent with
 * {@link #equals(Object)}, which compares the three coordinates.
 */
public class Point3D implements WritableComparable<Point3D> {
    public float x;
    public float y;
    public float z;

    /**
     * Creates a point with the given coordinates.
     *
     * @param x the x coordinate
     * @param y the y coordinate
     * @param z the z coordinate
     */
    public Point3D(float x, float y, float z) {
        this.x = x;
        this.y = y;
        this.z = z;
    }

    /** Creates the point (0, 0, 0). */
    public Point3D() {
        this(0.0f, 0.0f, 0.0f);
    }

    /**
     * Replaces all three coordinates of this point.
     *
     * @param x the new x coordinate
     * @param y the new y coordinate
     * @param z the new z coordinate
     */
    public void set(float x, float y, float z) {
        this.x = x;
        this.y = y;
        this.z = z;
    }

    /**
     * Reads x, y and z (in that order) from the given input.
     *
     * @param in the input to read three floats from
     * @throws IOException if reading from {@code in} fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        x = in.readFloat();
        y = in.readFloat();
        z = in.readFloat();
    }

    /**
     * Writes x, y and z (in that order) to the given output.
     *
     * @param out the output to write three floats to
     * @throws IOException if writing to {@code out} fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeFloat(x);
        out.writeFloat(y);
        out.writeFloat(z);
    }

    /**
     * Returns the Euclidean distance of this point from (0, 0, 0).
     *
     * @return {@code sqrt(x*x + y*y + z*z)} narrowed to {@code float}
     */
    public float distanceFromOrigin() {
        return (float) Math.sqrt(x * x + y * y + z * z);
    }

    /**
     * Two points are equal when all three coordinates have the same bit
     * representation as given by {@link Float#floatToIntBits(float)}.
     *
     * <p>Comparing the bits rather than using {@code ==} keeps this method
     * consistent with {@link #hashCode()}: {@code 0.0f} and {@code -0.0f} are
     * different, and a NaN coordinate equals another NaN coordinate.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Point3D)) {
            return false;
        }
        Point3D other = (Point3D) obj;
        return Float.floatToIntBits(x) == Float.floatToIntBits(other.x)
                && Float.floatToIntBits(y) == Float.floatToIntBits(other.y)
                && Float.floatToIntBits(z) == Float.floatToIntBits(other.z);
    }

    /** Returns the XOR of the bit representations of x, y and z. */
    @Override
    public int hashCode() {
        return Float.floatToIntBits(x)
                ^ Float.floatToIntBits(y)
                ^ Float.floatToIntBits(z);
    }

    /** Returns the coordinates as {@code "x,y,z"}. */
    @Override
    public String toString() {
        return Float.toString(x) + "," + Float.toString(y) + "," + Float.toString(z);
    }

    /**
     * Orders points by their distance from the origin.
     *
     * @param other the point to compare with
     * @return a negative value, zero, or a positive value when this point is
     *         closer to, as far from, or farther from the origin than
     *         {@code other}
     */
    @Override
    public int compareTo(Point3D other) {
        return Float.compare(this.distanceFromOrigin(), other.distanceFromOrigin());
    }
}

其次就是定制自己的InputFormat了

import java.io.IOException;

import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;


/**
 * Input format for text files whose lines look like
 * {@code name x,y,z}: a whitespace-separated name followed by three
 * comma-separated floats.
 *
 * <p>Each line becomes one record whose key is the name and whose value is a
 * {@link Point3D} holding the three floats.
 */
public class MyInputFormat extends FileInputFormat<Text, Point3D> {

    /**
     * Disables splitting, so each input file is handed to a reader as a whole.
     *
     * @return always {@code false}
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /**
     * Creates the reader that turns lines into (name, point) records.
     *
     * @return a new, not yet initialized {@link objPosRecordReader}
     */
    @Override
    public RecordReader<Text, Point3D> createRecordReader(InputSplit inputsplit,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new objPosRecordReader();
    }

    /**
     * Reads one line at a time and parses it into a {@link Text} key and a
     * {@link Point3D} value. The same key and value objects are reused for
     * every record.
     */
    public static class objPosRecordReader extends RecordReader<Text, Point3D> {

        public LineReader in;
        public Text lineKey;
        public Point3D lineValue;
        public StringTokenizer token = null;

        public Text line;

        /** Number of bytes consumed from the file so far. */
        private long bytesRead;
        /** Length of the split in bytes, used only to report progress. */
        private long splitLength;

        /**
         * Closes the line reader and, through it, the underlying file stream.
         * Calling this more than once is harmless.
         */
        @Override
        public void close() throws IOException {
            if (in != null) {
                in.close();
                in = null;
            }
        }

        /** Returns the key parsed by the last successful {@link #nextKeyValue()}. */
        @Override
        public Text getCurrentKey() throws IOException, InterruptedException {
            return lineKey;
        }

        /** Returns the value parsed by the last successful {@link #nextKeyValue()}. */
        @Override
        public Point3D getCurrentValue() throws IOException,
                InterruptedException {
            return lineValue;
        }

        /**
         * Reports the fraction of the split consumed so far.
         *
         * @return a value between 0 and 1; 0 when the split is empty
         */
        @Override
        public float getProgress() throws IOException, InterruptedException {
            if (splitLength <= 0) {
                return 0.0f;
            }
            return Math.min(1.0f, bytesRead / (float) splitLength);
        }

        /**
         * Opens the file behind the split and prepares the reusable key,
         * value and line buffers.
         *
         * <p>The file is read from its beginning; the split's start offset is
         * not used, which matches {@code isSplitable} returning {@code false}.
         */
        @Override
        public void initialize(InputSplit input, TaskAttemptContext context)
                throws IOException, InterruptedException {
            FileSplit split = (FileSplit) input;
            Configuration job = context.getConfiguration();
            Path file = split.getPath();
            FileSystem fs = file.getFileSystem(job);

            FSDataInputStream filein = fs.open(file);
            in = new LineReader(filein, job);

            splitLength = split.getLength();
            bytesRead = 0L;

            line = new Text();
            lineKey = new Text();
            lineValue = new Point3D();
        }

        /**
         * Advances to the next non-blank line and parses it.
         *
         * <p>Blank lines are skipped. Tokens after the second one, and
         * coordinates after the third one, are ignored.
         *
         * @return {@code true} if a record was read, {@code false} at end of
         *         input
         * @throws IOException if reading fails, or if a line has no
         *         coordinate token, fewer than three coordinates, or a
         *         coordinate that is not a valid float
         */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            while (true) {
                int consumed = in.readLine(line);
                if (consumed == 0) {
                    return false;
                }
                bytesRead += consumed;

                token = new StringTokenizer(line.toString());
                if (!token.hasMoreTokens()) {
                    // Blank line (e.g. a trailing newline at end of file).
                    continue;
                }
                String name = token.nextToken();
                if (!token.hasMoreTokens()) {
                    throw new IOException(
                            "Malformed record, missing coordinates: \"" + line + "\"");
                }

                String[] points = token.nextToken().split(",");
                if (points.length < 3) {
                    throw new IOException(
                            "Malformed record, expected 3 coordinates: \"" + line + "\"");
                }

                try {
                    lineValue.set(Float.parseFloat(points[0]),
                            Float.parseFloat(points[1]),
                            Float.parseFloat(points[2]));
                } catch (NumberFormatException e) {
                    throw new IOException(
                            "Malformed record, bad coordinate: \"" + line + "\"", e);
                }
                lineKey.set(name);
                return true;
            }
        }

    }

}

测试的时候只写了map函数，没有reduce，因此必须在后面设置job的时候把reduce的个数设置为0，即job.setNumReduceTasks(0)。

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


/**
 * Identity mapper used to test the custom input format: every
 * (name, point) record it receives is written out unchanged.
 */
public class TestMapper extends Mapper<Text, Point3D, Text, Point3D> {

    /**
     * Emits the incoming key and value as they are.
     *
     * @param key the record key
     * @param value the record value
     * @param context the context the pair is written to
     * @throws IOException if writing the pair fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(Text key, Point3D value, Context context)
            throws IOException, InterruptedException {
        // Context here is the inherited, fully parameterized inner type, so
        // the write below is type-checked instead of being an unchecked call.
        context.write(key, value);
    }

}
import java.io.IOException;
import java.net.URI;

import javax.xml.soap.Text;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class TestMyInputFormat {

    /**
     * @param args
     * @throws IOException 
     * @throws ClassNotFoundException 
     * @throws InterruptedException 
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // TODO Auto-generated method stub
        System.out.println("nihao");
        Job job=new Job();
        Configuration conf=new Configuration();
        FileSystem fs=FileSystem.get(URI.create(args[1]), conf);
        fs.delete(new Path(args[1]));
        job.setJobName("测试MyInputFormat程序。。。。。");
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setInputFormatClass(MyInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Point3D.class);
        job.setMapperClass(TestMapper.class);
        job.setNumReduceTasks(0);
        job.waitForCompletion(false);
        
        
        
    }

}