《Hadoop实战》之专利统计

数据
需求一
- 思路
需求二
- 思路
需求三
- 思路
Hadoop的新API
- 基于新api重写的Hadoop基础程序模板

数据

专利数据的特性

专利引用数据所构成的关系图与网页链接以及社会网络图大同小异
专利发布以时间为序，特性类似于时间序列
专利关联到一个人和一个位置，可视为个人信息或地理数据

首先拿到专利数据：http://data.nber.org/patents/

本文使用是的cite75-99.txt，该文件涵盖了自1975年到1999年间对美国专利的引用，包含超过1600万条数据，前几行如下图：

其中第一列为专利号、第二列为被第一列引用的专利号

CITING	CITED
3858241	956203

需求一

读取专利引用数据，对于每一个专利找到哪些专列对他进行了引用并进行合并。
进行倒序排序

思路

针对1，将Mapper的键值互换输出即可
针对2，简单方法是设置reduce=1，直接输出全局排序文件

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;


// 读取专利引用数据，对于每一个专利找到哪些专列对他进行了引用并进行合并。
public class FindCitedPatentsAndOrder extends Configured implements Tool {

    public static class MapClass extends MapReduceBase implements Mapper<Text, Text, Text, Text> {

        @Override
        public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            output.collect(value, key);  // 关键点
        }
    }

    public static class ReduceClass extends MapReduceBase implements Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output,
                           Reporter reporter) throws IOException {
            String csv = "";
            while (values.hasNext()) {
                if (csv.length() > 0) csv += ",";
                csv += values.next().toString();
            }
            output.collect(key, new Text(csv));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("FindCitedPatentsAndOrder");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.set("key.value.separator.in.input.line", ",");

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new FindCitedPatentsAndOrder(), args);
        System.exit(exitCode);
    }
}

需求二

计算不同引用次数专利的数目

思路

在FindCitedPatentsAndOrder的基础上修改Reduce函数，统计数目

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

// 计算不同引用次数专利的数目
public class CitedPatentsNumberCounter extends Configured implements Tool {
    public static class MapClass extends MapReduceBase implements Mapper<Text, Text, Text, Text> {

        @Override
        public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            output.collect(value, key);  // 关键点
        }
    }

    public static class ReduceClass extends MapReduceBase implements Reducer<Text, Text, Text, IntWritable> {

        @Override
        public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, IntWritable> output,
                           Reporter reporter) throws IOException {
            int count = 0;
            while (values.hasNext()){
                values.next();
                count++;
            }
            output.collect(key, new IntWritable(count));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("CitedPatentsNumberCounter");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);  // 同时设置K2,V2和K3,V3的类型

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new CitedPatentsNumberCounter(), args);
        System.exit(exitCode);
    }
}

需求三

在需求二输出的被引用数量数据的基础上，统计被引用数量的频次

思路

将输入数据替换成需求二的输出数据即可
为了重用Mapper里面的OutputKey和OutputValue，将它们在类中初始化

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;
import java.util.Iterator;

public class CitationFrequencyStatistics extends Configured implements Tool {
    public static class MapClass extends MapReduceBase implements Mapper<Text, Text, IntWritable, IntWritable> {
        private final  static IntWritable UNO = new IntWritable(1);  // 单位1
        private IntWritable citationCount = new IntWritable();


        @Override
        public void map(Text key, Text value, OutputCollector<IntWritable, IntWritable> output, Reporter reporter)
                throws IOException {
            citationCount.set(Integer.parseInt(value.toString()));
            output.collect(citationCount, UNO);  // 关键点
        }
    }

    public static class ReduceClass extends MapReduceBase implements Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

        @Override
        public void reduce(IntWritable key, Iterator<IntWritable> values, OutputCollector<IntWritable, IntWritable> output,
                           Reporter reporter) throws IOException {
            int count = 0;
            while (values.hasNext()){
                values.next();
                count++;
            }
            output.collect(key, new IntWritable(count));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        JobConf job = new JobConf(getConf(), getClass());

        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);

        job.setJobName("CitationFrequencyStatistics");
        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);  // 同时设置K2,V2和K3,V3的类型

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new CitationFrequencyStatistics(), args);
        System.exit(exitCode);
    }
}

Hadoop的新API

Hadoop最新版本的MapReduce Release 0.20.0的API包括了一个全新的Mapreduce JAVA API，有时候也称为上下文对象。

　　新的API类型上不兼容以前的API，所以，以前的应用程序需要重写才能使新的API发挥其作用。

　　新的API和旧的API之间有下面几个明显的区别。

新的API倾向于使用抽象类，而不是接口，因为这更容易扩展。例如，你可以添加一个方法(用默认的实现)到一个抽象类而不需修改类之前的实现方法。在新的API中，Mapper和Reducer是抽象类。
新的API是在org.apache.hadoop.mapreduce包(和子包)中的。之前版本的API则是放在org.apache.hadoop.mapred中的。
新的API广泛使用context object(上下文对象)，并允许用户代码与MapReduce系统进行通信。例如，MapContext基本上充当着JobConf的OutputCollector和Reporter的角色。
新的API同时支持"推"和"拉"式的迭代。在这两个新老API中，键/值记录对被推mapper中，但除此之外，新的API允许把记录从map()方法中拉出，这也适用于reducer。"拉"式的一个有用的例子是分批处理记录，而不是一个接一个。
新的API统一了配置。旧的API有一个特殊的JobConf对象用于作业配置，这是一个对于Hadoop通常的Configuration对象的扩展。在新的API中，这种区别没有了，所以作业配置通过Configuration来完成。作业控制的执行由Job类来负责，而不是JobClient，它在新的API中已经荡然无存。

基于新api重写的Hadoop基础程序模板

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;

import java.io.IOException;

public class MyJob extends Configured implements Tool {

    public static class MapClass extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] citation = value.toString().split(",");
            context.write(new Text(citation[1]), new Text(citation[0]));
        }
    }

    public static class ReduceClass extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            String csv = "";
            for (Text val: values) {
                if (csv.length() > 0) csv += ",";
                csv += val.toString();
            }
            context.write(key, new Text(csv));
        }
    }

    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = getConf();

        Job job = new Job(conf, "Myjob");
        job.setJarByClass(MyJob.class);

        job.setMapperClass(MapClass.class);
        job.setReducerClass(ReduceClass.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        return 0;
    }
}