12作业MapReduce

1.mapreduce定义和优缺点？
MapReduce是一个分布式运算程序的编程框架，是用户开发“基于Hadoop的数据分析应用”的核心框架
MapReduce核心功能是将用户编写的业务逻辑代码和自带默认组件整合成一个完整的分布式运算程序，并发运行在一个Hadoop集群上。
优点：
1）.MapReduce易于编程
用户只需简单地实现一些接口，就可以完成一个分布式程序，这个分布式程序可以分布到大量廉价的PC机器上运行。
2）.良好的扩展性
当你的计算资源不能得到满足的时候，你可以通过简单的增加机器来扩展它的计算能力。
3）.高容错性
其中一台机器挂了，它可以把上面的计算任务转移到另一个节点上运行，不至于这个任务运行失败，而且这个过程不需要人工参与，而完全是由Hadoop内部完成的。
4）.适合PB级以上海量数据的离线处理
可以实现上千台服务器集群并发工作，提供数据处理能力。
缺点
1）不擅长实时计算
MapReduce无法像MySQL一样，在毫秒或秒级返回结果。
2）不擅长流式计算
流式计算的输入数据是动态的，而MapReduce的输入数据是静态的，不能动态变化。这是因为MapReduce自身的设计特点决定了数据源必须是静态的。
3）不擅长DAG计算
多个应用程序存在依赖关系，后一个应用程序的输入为前一个的输出。在这种情况下，MapReduce并不是不能做，而是使用后，每一个MapReduce作业的输出结果都会写入磁盘，会造成大量的磁盘IO，导致性能非常低下。
2.mapreduce的数据类型
Java类型    Hadoop Writable类型
boolean     BooleanWritable
byte        ByteWritable
int         IntWritable
float       FloatWritable
long        LongWritable
double      DoubleWritable
String      Text
Map         MapWritable
Array       ArrayWritable
3.查看官方的wordcount代码样例
    package com.huawei.hdfs;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    import java.io.IOException;

    /**
     * Word-count mapper: tokenizes each input value and emits a {@code (word, 1)}
     * pair for every word found.
     *
     * <p>Mapper type parameters: input key {@code LongWritable}, input value
     * {@code Text} (the text to tokenize), output key {@code Text} (a single word),
     * output value {@code IntWritable} (always 1).
     */
    public class HWMapper extends Mapper<LongWritable,Text,Text,IntWritable>{

        /** Reused output key; overwritten for each emitted word to avoid per-record allocation. */
        private final Text keyout = new Text();

        /** Constant output value: every emitted word counts exactly once. */
        private final IntWritable valueout = new IntWritable(1);

        /**
         * Splits {@code value} on whitespace and writes {@code (word, 1)} for each
         * non-empty token.
         *
         * @param key     input key; not used by this mapper
         * @param value   the text to split into words
         * @param context sink that receives the emitted pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split on any run of whitespace (spaces, tabs, ...) rather than a single
            // space, so repeated separators do not produce empty "words".
            String[] words = value.toString().split("\\s+");
            for (String word : words) {
                // A blank line or leading whitespace still yields one leading "" token.
                if (word.isEmpty()) {
                    continue;
                }
                keyout.set(word);
                context.write(keyout, valueout);
            }
        }
    }

重写reducer

    package com.huawei.hdfs;

    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Reducer;

    import java.io.IOException;


    /**
     * Word-count reducer: adds up all counts received for one word and emits
     * {@code (word, total)}.
     */
    public class HWReducer extends org.apache.hadoop.mapreduce.Reducer<Text,IntWritable,Text,IntWritable> {

        /**
         * Sums every value grouped under {@code key} and writes the total.
         *
         * @param key     the word being reduced
         * @param values  the partial counts collected for that word
         * @param context sink that receives the {@code (word, total)} pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable partial : values) {
                total = total + partial.get();
            }

            IntWritable result = new IntWritable(total);
            context.write(key, result);
        }
    }

编写main函数

    package com.huawei.hdfs;


    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


    import java.io.IOException;

    /**
     * Driver for the word-count job: configures a {@code Job} with {@code HWMapper}
     * and {@code HWReducer}, runs it, and terminates the JVM with an exit status
     * that reflects whether the job succeeded.
     */
    public class HWAPP {
        /**
         * Configures and runs the job.
         *
         * <p>Exits with status 0 when the job succeeds, 1 when it fails, and 2 when
         * the required arguments are missing.
         *
         * @param args {@code args[0]} is the input path, {@code args[1]} the output path
         * @throws Exception if job creation, submission, or execution fails
         */
        public static void main(String[] args) throws Exception {
            // Fail with a readable message instead of ArrayIndexOutOfBoundsException.
            if (args.length < 2) {
                System.err.println("Usage: HWAPP <input path> <output path>");
                System.exit(2);
            }

            Configuration conf=new Configuration();
            // Needed for local runs; comment this line out when running on a cluster.
            conf.set("fs.defaultFS","file:///");
            Job job =Job.getInstance(conf);

            job.setJobName("HWAPP");                        // job name


            job.setInputFormatClass(TextInputFormat.class);// input format
            FileInputFormat.addInputPath(job,new Path(args[0])); // input path
            FileOutputFormat.setOutputPath(job,new Path(args[1]));// output path

            job.setJarByClass(HWAPP.class);                // class used to locate the job jar
            job.setMapperClass(HWMapper.class);
            job.setReducerClass(HWReducer.class);


            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            job.setNumReduceTasks(1);                       // number of reduce tasks
            job.setOutputKeyClass(Text.class);              // final output key type
            job.setOutputValueClass(IntWritable.class);     // final output value type

            // Propagate the job result: previously the return value was dropped and
            // the process exited 0 even when the job failed.
            boolean succeeded = job.waitForCompletion(false);
            System.exit(succeeded ? 0 : 1);
        }
    }
4.自己实现wordcount代码
package com.jinghang.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
/**
 * Word-count mapper: emits {@code (word, 1)} for every word of each input line.
 *
 * <ul>
 *   <li>KEYIN: {@code LongWritable} (offset)</li>
 *   <li>VALUEIN: {@code Text} (content of one line of the input text)</li>
 *   <li>KEYOUT: {@code Text} (a single word used as the key)</li>
 *   <li>VALUEOUT: {@code IntWritable} (occurrence count of the word)</li>
 * </ul>
 */
public class WcMapper extends Mapper<LongWritable,Text,Text,IntWritable> {
    /** Reused map output key; overwritten for each emitted word. */
    private final Text keyText = new Text();
    /** Constant map output value: each emitted word counts once. */
    private final IntWritable one = new IntWritable(1);

    /**
     * Splits the line held in {@code value} on whitespace and writes
     * {@code (word, 1)} for each non-empty token.
     *
     * @param key     input key; not used by this mapper
     * @param value   one line of input text
     * @param context sink that receives the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException {
        // Fetch the line content.
        String line = value.toString();
        // Split on runs of whitespace (not a single space) so tabs and repeated
        // spaces do not produce empty "words".
        String[] fields = line.split("\\s+");
        for (String field : fields) {
            // A blank line or leading whitespace still yields one leading "" token.
            if (field.isEmpty()) {
                continue;
            }
            keyText.set(field);
            context.write(keyText, one);
        }
    }

}

package com.jinghang.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Word-count reducer: totals the counts received for each word.
 *
 * <ul>
 *   <li>KEYIN: {@code Text} - the map output key, used as the reduce input key</li>
 *   <li>VALUEIN: {@code IntWritable} - the map output value, used as the reduce input value</li>
 *   <li>KEYOUT: {@code Text} - the word, used as the output key</li>
 *   <li>VALUEOUT: {@code IntWritable} - total occurrences of the word, used as the output value</li>
 * </ul>
 */
public class WcReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
    /** Reusable holder for the total written by {@code reduce}. */
    private IntWritable sumWritable = new IntWritable();

    /**
     * Adds up every count grouped under {@code key} and writes {@code (key, sum)}.
     *
     * @param key     the word being reduced
     * @param values  the counts collected for that word, e.g. 1, 1, 1 for a word seen three times
     * @param context sink that receives the result pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulate the total number of occurrences of the word.
        int occurrences = 0;
        java.util.Iterator<IntWritable> counts = values.iterator();
        while (counts.hasNext()) {
            occurrences += counts.next().get();
        }

        // Write the key and its total to the context.
        sumWritable.set(occurrences);
        context.write(key, sumWritable);
    }

}

package com.jinghang.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for the word-count job built from {@code WcMapper} and {@code WcReducer}.
 */
public class WcDriver {
    /**
     * Configures the job, runs it, and exits with status 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws IOException            if job creation or submission fails
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {
        // Create the job from a default configuration.
        Configuration configuration = new Configuration();
        Job wordCountJob = Job.getInstance(configuration);

        // Class used to locate the jar that contains this program.
        wordCountJob.setJarByClass(WcDriver.class);

        // Mapper and reducer implementations.
        wordCountJob.setMapperClass(WcMapper.class);
        wordCountJob.setReducerClass(WcReducer.class);

        // Key/value types produced by the map phase.
        wordCountJob.setMapOutputKeyClass(Text.class);
        wordCountJob.setMapOutputValueClass(IntWritable.class);

        // Key/value types produced by the reduce phase.
        wordCountJob.setOutputKeyClass(Text.class);
        wordCountJob.setOutputValueClass(IntWritable.class);

        // Input and output locations, taken from the command line.
        Path inputPath = new Path(args[0]);
        FileInputFormat.setInputPaths(wordCountJob, inputPath);
        Path outputPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(wordCountJob, outputPath);

        // Submit the job, wait for it, and map the outcome to the exit status.
        if (wordCountJob.waitForCompletion(true)) {
            System.exit(0);
        }
        System.exit(1);
    }
}