Hadoop下进行反向索引(Inverted Index)操作

0.参考资料：

代码参考1：http://www.pudn.com/downloads212/sourcecode/unix_linux/detail999273.html

理论参考2：http://zhangyu8374.javaeye.com/blog/86307，http://nything.javaeye.com/blog/411787

1.分析

假如有file0，file1，file2三个文件，这些文件中都保存了一些文本内容，比如在file0中只有一个句子，内容为"we are happy"。一般的索引都是记录在这个文件中每一个单词的索引号。比如file0的索引可以是(we,0),(are,1),(happy,2)。这样的键值对中key是单词，value是这个单词在这个文件中的位置。但是，反向索引刚好相反，对应于多个文件，我们要求出某一个单词在所有这些文件中出现的位置。我们可以按如下操作进行实验：

在本地创建文件夹IndexTest并在里面创建3个文件，每个文件中的内容如下。

    * T0 = "it is what it is"
    * T1 = "what is it"
    * T2 = "it is a banana"

其中T0，T1，T2分别是文件名，后面为文件内容。将IndexTest文件夹上传到DFS中。然后运行反向索引程序。反向索引程序见代码示例。

最后输出结果为：

a     (T2, 3)
banana     (T2, 4)
is     (T2, 2) (T0, 2) (T0, 5) (T1, 2)
it     (T1, 3) (T2, 1) (T0, 1) (T0, 4)
what     (T0, 3) (T1, 1)

2.代码示例

InvertedIndex.java

View Code

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package pa4;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * MapReduce driver that builds an inverted index: for every word found in the
 * input files it writes one output record whose value is the list of
 * {@code (fileID, wordPosition)} pairs at which the word occurs.
 *
 * @author Ming
 */
public class InvertedIndex {

    /**
     * Mapper that lower-cases each incoming word and forwards the record.
     * {@link TokenInputFormat} has already produced
     * {@code (word, (fileID, wordPosition))} records, so no tokenizing is done here.
     */
    public static class TokenizerMapper
            extends Mapper<Text, ValuePair, Text, ValuePair> {

        /**
         * Normalizes the word to lower case and emits it with its posting.
         *
         * @param key     the word; it is overwritten in place with its lower-case form
         * @param value   the (fileID, wordPosition) posting for this occurrence
         * @param context task context used to emit the record
         * @throws IOException          if the record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(Text key, ValuePair value, Context context)
                throws IOException, InterruptedException {
            // Locale.ROOT keeps index keys identical whatever the JVM's default locale is
            // (the default-locale overload lower-cases "I" differently under e.g. Turkish).
            key.set(key.toString().toLowerCase(java.util.Locale.ROOT));
            context.write(key, value);
        }
    }

    /**
     * Reducer that joins all postings of one word into a single text value.
     */
    public static class IndexReducer
            extends Reducer<Text, ValuePair, Text, Text> {

        /** Output value object, reused across reduce() calls. */
        private final Text postings = new Text();

        /**
         * Concatenates the string form of every posting, each preceded by one space.
         *
         * @param key     the word
         * @param values  all postings emitted for the word
         * @param context task context used to emit the record
         * @throws IOException          if the record cannot be written
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<ValuePair> values, Context context)
                throws IOException, InterruptedException {
            // StringBuilder avoids re-copying the whole list for every posting.
            StringBuilder list = new StringBuilder();
            for (ValuePair val : values) {
                list.append(' ').append(val.toString());
            }
            postings.set(list.toString());
            context.write(key, postings);
        }
    }

    /**
     * Configures and runs the job. Any existing output directory is deleted first.
     * Exits with status 2 on bad arguments, 0 on job success and 1 on job failure.
     *
     * @param args generic Hadoop options followed by {@code <in-dir> <out-dir>}
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: InvertedIndex <in-dir> <out-dir>");
            System.exit(2);
        }
        Path inputDir = new Path(otherArgs[0]);
        Path outputDir = new Path(otherArgs[1]);

        // Remove the old output dir. The filesystem is resolved from the path itself so
        // that an output path on a non-default filesystem is handled as well.
        outputDir.getFileSystem(conf).delete(outputDir, true);

        Job job = new Job(conf, "Inverted Indexer");
        job.setJarByClass(InvertedIndex.class);
        job.setInputFormatClass(TokenInputFormat.class);
        job.setMapperClass(InvertedIndex.TokenizerMapper.class);
        // No combiner: IndexReducer emits Text values, while the map output value
        // class is ValuePair, so it cannot be reused as a combiner.
        job.setReducerClass(InvertedIndex.IndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ValuePair.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, inputDir);
        FileOutputFormat.setOutputPath(job, outputDir);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

TokenInputFormat.java

View Code

package pa4;
import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.util.LineReader;
import java.util.StringTokenizer;
public class TokenInputFormat extends FileInputFormat<Text, ValuePair> {
    /**
     * Don't allow the files to be split!
     */
    @Override
    protected boolean isSplitable(JobContext ctx, Path filename) {
    // ensure the input files are not splittable!
    return false;
    }
    /**
     * Just return the record reader
     * key is the docno
     */
    public RecordReader<Text, ValuePair> createRecordReader(InputSplit split,
        TaskAttemptContext ctx)
        throws IOException, InterruptedException {
    return new TokenRecordReader();
    }
    public static class TokenRecordReader extends RecordReader<Text, ValuePair> {
    private long start;
    private long pos;
    private long end;
    private LineReader in;
    private int maxLineLength;
    private Text line;
    private Text key = null;
    private ValuePair value = null;
    private StringTokenizer tokens = null;
    private int tokenPos = 0;
    private String fileID = "0";    // input file id that appears in inverted index
    public void initialize(InputSplit genericSplit,
        TaskAttemptContext context) throws IOException {
        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
            Integer.MAX_VALUE);
        start = split.getStart();
        end = start + split.getLength();
        final Path file = split.getPath();
        // Assume file name is an integer of file ID
        fileID = file.getName();
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(split.getPath());
        in = new LineReader(fileIn, job);
        this.pos = start;
        line = new Text();
        key = new Text();
        value = new ValuePair();
    }
    public boolean nextKeyValue() throws IOException {
        boolean splitEnds = false;
        while (tokens == null || !tokens.hasMoreTokens()) {
        int lineSize = in.readLine(line, maxLineLength,
            Math.max((int) Math.min(Integer.MAX_VALUE, end - pos),
            maxLineLength));
        if (lineSize == 0) {
            splitEnds = true;
            break;
        }
        pos += lineSize;
        tokens = new StringTokenizer(line.toString(), " /t/n/r/f,.;<>-?///!'/":=*{}()$[]");
        }
        if (splitEnds) {
        key = null;
        value = null;
        line = null;
        tokens = null;
        return false;
        } else
        return true;
    }
    @Override
    public Text getCurrentKey() {
        key.set(tokens.nextToken());
        tokenPos ++;
        return key;
    }
    @Override
    public ValuePair getCurrentValue() {
        value.set(fileID, tokenPos);
        return value;
    }
    /**
     * Get the progress within the split
     */
    public float getProgress() {
        if (start == end) {
        return 0.0f;
        } else {
        return Math.min(1.0f, (pos - start) / (float) (end - start));
        }
    }
    public synchronized void close() throws IOException {
        if (in != null) {
        in.close();
        }
    }
    }
    public static void main(String[] args)
        throws IOException {
    String fn = args[0];
    Configuration conf = new Configuration();
    FileSplit split = new FileSplit(new Path(fn), 0, 10000000, null);
    TokenRecordReader irr = new TokenRecordReader();
    TaskAttemptContext ctx = new TaskAttemptContext(conf,
        new TaskAttemptID("hello", 12, true, 12, 12));
    irr.initialize(split, ctx);
    while (irr.nextKeyValue()) {
        System.out.println(irr.getCurrentKey() + ": " + irr.getCurrentValue());
    }
    }
}

ValuePair.java

View Code

package pa4;
/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
import java.io.*;
import org.apache.hadoop.io.*;
/**
 * A writable pair of a {@link Text} and an {@link IntWritable}, ordered by the
 * text first and the integer second. The inverted index uses it to carry a
 * (fileID, wordPosition) posting.
 *
 * @author Ming
 */
public class ValuePair implements WritableComparable<ValuePair> {
    private Text one;
    private IntWritable two;

    /** Creates a pair holding an empty text and a zero integer. */
    public ValuePair() {
        this.one = new Text();
        this.two = new IntWritable();
    }

    /**
     * Creates a pair that wraps the given objects (they are not copied).
     *
     * @param first  the text component
     * @param second the integer component
     */
    public ValuePair(Text first, IntWritable second) {
        this.one = first;
        this.two = second;
    }

    /**
     * Creates a pair from plain values.
     *
     * @param first  the text component
     * @param second the integer component
     */
    public ValuePair(String first, int second) {
        // Allocate the components here: set(String, int) writes into existing
        // objects and would fail on the still-null fields.
        this.one = new Text(first);
        this.two = new IntWritable(second);
    }

    /**
     * Replaces both component objects with the given ones (they are not copied).
     *
     * @param first  the new text component
     * @param second the new integer component
     */
    public void set(Text first, IntWritable second) {
        one = first;
        two = second;
    }

    /**
     * Overwrites the contents of the existing component objects.
     *
     * @param first  the new text value
     * @param second the new integer value
     */
    public void set(String first, int second) {
        one.set(first);
        two.set(second);
    }

    /** @return the text component */
    public Text getFirst() {
        return one;
    }

    /** @return the integer component */
    public IntWritable getSecond() {
        return two;
    }

    /** Serializes the text component followed by the integer component. */
    @Override
    public void write(DataOutput out) throws IOException {
        one.write(out);
        two.write(out);
    }

    /** Deserializes the text component followed by the integer component. */
    @Override
    public void readFields(DataInput in) throws IOException {
        one.readFields(in);
        two.readFields(in);
    }

    /**
     * Hash of the text component only. This is deliberately left unchanged so any
     * hash-based distribution of pairs stays as before; it is still consistent
     * with {@link #equals(Object)} because equal pairs have equal text components.
     */
    @Override
    public int hashCode() {
        return one.hashCode();
    }

    /**
     * Two pairs are equal when both components are equal, which keeps
     * {@code equals} consistent with {@link #compareTo(ValuePair)}.
     */
    @Override
    public boolean equals(Object o) {
        if (o instanceof ValuePair) {
            ValuePair tp = (ValuePair) o;
            return one.equals(tp.one) && two.equals(tp.two);
        }
        return false;
    }

    /** @return the pair formatted as {@code (text, int)} */
    @Override
    public String toString() {
        return "(" + one + ", " + two + ")";
    }

    /** Orders by the text component, then by the integer component. */
    @Override
    public int compareTo(ValuePair tp) {
        int cmp = one.compareTo(tp.one);
        if (cmp != 0) {
            return cmp;
        }
        return two.compareTo(tp.two);
    }

    /**
     * Comparator that can order serialized pairs directly on their bytes,
     * using the same text-then-integer ordering as {@link #compareTo(ValuePair)}.
     */
    public static class Comparator extends WritableComparator {
        private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
        private static final IntWritable.Comparator INT_COMPARATOR = new IntWritable.Comparator();

        public Comparator() {
            super(ValuePair.class);
        }

        /**
         * Compares two serialized pairs without deserializing them.
         *
         * @throws IllegalArgumentException if the length prefix cannot be decoded
         */
        @Override
        public int compare(byte[] b1, int s1, int l1,
                           byte[] b2, int s2, int l2) {
            try {
                // Byte length of the serialized text component: the size of its
                // variable-length length prefix plus the length that prefix encodes.
                int oneL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
                int oneL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
                int cmp = TEXT_COMPARATOR.compare(b1, s1, oneL1, b2, s2, oneL2);
                if (cmp != 0) {
                    return cmp;
                }
                // The integer component occupies the bytes after the text component.
                return INT_COMPARATOR.compare(b1, s1 + oneL1, l1 - oneL1,
                                              b2, s2 + oneL2, l2 - oneL2);
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        /** Compares deserialized pairs; non-ValuePair arguments go to the superclass. */
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            if (a instanceof ValuePair && b instanceof ValuePair) {
                return ((ValuePair) a).compareTo((ValuePair) b);
            }
            return super.compare(a, b);
        }
    }

    static {
        // Register the byte-level comparator for this type.
        WritableComparator.define(ValuePair.class, new Comparator());
    }
}

ps：2012-5-20

这里键值对valuepair的运用让我想到了前几天写的Hashmap实现原理。在hashmap的实现过程中，也运用了键值对类Entry。两者之间有共通之处，有空可以再改进Hashmap实现原理。

作者：xwdreamer

出处：http://www.cnblogs.com/xwdreamer

欢迎任何形式的转载，但请务必注明出处。

分享到：