MapReduce Learning Notes 1: WordCount

MapReduce is built on the idea of "divide and conquer": an operation over a large dataset is split up and handed to the worker nodes managed by a master node, and the intermediate results produced by those nodes are then merged into the final result. In short, MapReduce means "decompose the task, then aggregate the results".

1. How MapReduce works

In distributed computing, the MapReduce framework takes care of the hard parts of parallel programming: distributed storage, job scheduling, load balancing, fault tolerance, and network communication. The computation itself is abstracted into two phases, Map and Reduce: Map breaks the job into multiple subtasks, and Reduce aggregates the results of those subtasks. The concrete design is as follows.

(1) The Map phase extends the Mapper class in the org.apache.hadoop.mapreduce package and overrides its map method. By adding two lines to map that print the key and the value to the console, you can see that the value passed into map holds one line of the text file (a line ends at the newline character), while the key holds the byte offset of the first character of that line from the start of the file. Each line is then split into fields with StringTokenizer, the field we need (the buyer id in this experiment) is set as the key, and that key is written out as the map output.
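For example, the two debug lines mentioned above could look like this inside the map method of the Mapper shown later (a minimal sketch; when running locally the output appears in the console, on a cluster it goes to the task's stdout log):

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // debug only: key is the byte offset of the line, value is the line itself
        System.out.println("key   = " + key);
        System.out.println("value = " + value);
        // ... the normal map logic follows
    }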

(2) The Reduce phase extends the Reducer class in the org.apache.hadoop.mapreduce package and overrides its reduce method. The <key, value> pairs emitted by the Map phase first pass through the shuffle stage, which gathers every value that shares the same key into a values collection; at this point values is the list of counts belonging to that key. <key, values> is then fed into the reduce method, which only needs to iterate over values and sum them to get the total count for that word.
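A concrete illustration of the shuffle step (the words and counts here are made up purely for illustration):

    // map output, one record per occurrence:
    //   ("hadoop", 1), ("hive", 1), ("hadoop", 1), ("hadoop", 1)
    // after shuffle, values with the same key are grouped:
    //   ("hadoop", [1, 1, 1]), ("hive", [1])
    // reduce then sums each list:
    //   ("hadoop", 3), ("hive", 1)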

In the main() function a Job object is created; it manages and runs one MapReduce computation, and its methods are used to configure the job. In this experiment the job is set up to use the doMapper class (which extends Mapper) for the Map phase and the doReducer class for the Reduce phase. The output types of the Map and Reduce phases are also declared: the key type is Text and the value type is IntWritable. The input and output paths are given as strings and registered with FileInputFormat and FileOutputFormat respectively. Once all the parameters are set, calling job.waitForCompletion() runs the job, and everything else is handled by the MapReduce framework.
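As a side note, the new Job(conf, ...) constructor used in the listings below is deprecated; a minimal driver sketch using the non-deprecated Job.getInstance(Configuration, String) factory (same settings as the listings that follow, with the HDFS address left as a placeholder) would be:

    Configuration conf = new Configuration();
    conf.set("dfs.client.use.datanode.hostname", "true");
    // Job.getInstance is the non-deprecated replacement for new Job(conf, ...)
    Job job = Job.getInstance(conf, "WordCount");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(doMapper.class);      // doMapper/doReducer are defined in the listings below
    job.setReducerClass(doReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    Path in = new Path("hdfs://*your-hdfs-address*:9000/user/hadoop/input/b.txt");
    Path out = new Path("hdfs://*your-hdfs-address*:9000/user/hadoop/output");
    FileInputFormat.addInputPath(job, in);
    FileOutputFormat.setOutputPath(job, out);
    System.exit(job.waitForCompletion(true) ? 0 : 1);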

2. Java API source code

First upload the input file to HDFS yourself (see the file read/write API in the previous post):
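If you want to do the upload from Java, here is a minimal sketch using the same FileSystem API as the previous post (the local path and HDFS address are placeholders; adjust them to your setup):

    Configuration conf = new Configuration();
    conf.set("dfs.client.use.datanode.hostname", "true");
    FileSystem fs = FileSystem.get(java.net.URI.create("hdfs://*your-hdfs-address*:9000"), conf);
    // copy the local file to the input path used by the job below
    fs.copyFromLocalFile(new Path("/home/hadoop/b.txt"), new Path("/user/hadoop/input/b.txt"));
    fs.close();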

1. This version counts the occurrences of one column:

package mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // resolve datanodes by hostname (needed when the client is outside the cluster, see error (2) below)
        conf.set("dfs.client.use.datanode.hostname", "true");
        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "filter");
        job.setJobName("WordCount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(doMapper.class);
        job.setReducerClass(doReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        Path in = new Path("hdfs://*your-hdfs-address*:9000/user/hadoop/input/b.txt");
        Path out = new Path("hdfs://*your-hdfs-address*:9000/user/hadoop/output");
        Path path = new Path("hdfs://*your-hdfs-address*:9000/user/hadoop/output");
        FileSystem fileSystem = path.getFileSystem(conf); // get the FileSystem this path belongs to
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true); // true: delete recursively, even if the output directory is not empty
        }
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class doMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static Text newKey = new Text();
        public static final IntWritable one = new IntWritable(1);

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // split on the delimiter used in the input file (three spaces here; adjust to match your data)
            String arr[] = line.split("   ");
            newKey.set(arr[0]); // the column to count (the buyer id in this experiment)
            context.write(newKey, one);
        }
    }

    public static class doReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}

2. This version counts all the words:

package mapreduce;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("dfs.client.use.datanode.hostname", "true");
        @SuppressWarnings("deprecation")
        Job job = new Job(conf, "filter");
        job.setJobName("WordCount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(doMapper.class);
        job.setReducerClass(doReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        Path in = new Path("hdfs://**/mymapreduce1/in/buyer_favorite1");
        Path out = new Path("hdfs://**/mymapreduce1/out");
        FileSystem fileSystem = out.getFileSystem(conf); // get the FileSystem for the output path
        if (fileSystem.exists(out)) {
            fileSystem.delete(out, true); // delete the job's output directory if it already exists
        }
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class doMapper extends Mapper<Object, Text, Text, IntWritable> {
        public static final IntWritable one = new IntWritable(1);
        public static Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString(), " ");
            // emit every word on the line, not just the first token
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class doReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}

Some errors I ran into, and how to fix them:

(1)

Exception in thread "main" org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory hdfs://*河蟹*/user/hadoop/output already exists
    at org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:146)
    at org.apache.hadoop.mapreduce.JobSubmitter.checkSpecs(JobSubmitter.java:266)
    at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:139)
    at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1290)
    at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1287)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1698)
    at org.apache.hadoop.mapreduce.Job.submit(Job.java:1287)
    at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1308)
    at mapreduce.WordCount.main(WordCount.java:36)



Cause: the output directory already exists.

Fix:

Just delete it. Or add a check that removes it before the job runs:

    Path path = new Path("hdfs://*your-hdfs-address*:9000/user/hadoop/output"); // the job's output directory
    FileSystem fileSystem = path.getFileSystem(conf); // get the FileSystem this path belongs to
    if (fileSystem.exists(path)) {
        fileSystem.delete(path, true); // true: delete recursively, even if the output directory is not empty
    }

 (2)

java.net.ConnectException: Connection timed out: no further information
	at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
	at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717)
	at org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:206)
	at org.apache.hadoop.net.NetUtils.connect(NetUtils.java:531)
	at org.apache.hadoop.hdfs.DFSClient.newConnectedPeer(DFSClient.java:3436)
	at org.apache.hadoop.hdfs.BlockReaderFactory.nextTcpPeer(BlockReaderFactory.java:777)
	at org.apache.hadoop.hdfs.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:694)
	at org.apache.hadoop.hdfs.BlockReaderFactory.build(BlockReaderFactory.java:355)
	at org.apache.hadoop.hdfs.DFSInputStream.blockSeekTo(DFSInputStream.java:656)
	at org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:882)
	at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:934)
	at java.io.DataInputStream.read(DataInputStream.java:149)
	at org.apache.hadoop.mapreduce.lib.input.UncompressedSplitLineReader.fillBuffer(UncompressedSplitLineReader.java:62)
	at org.apache.hadoop.util.LineReader.readDefaultLine(LineReader.java:216)
	at org.apache.hadoop.util.LineReader.readLine(LineReader.java:174)
	at org.apache.hadoop.mapreduce.lib.input.UncompressedSplitLineReader.readLine(UncompressedSplitLineReader.java:94)
	at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.skipUtfByteOrderMark(LineRecordReader.java:144)
	at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.java:184)
	at org.apache.hadoop.mapred.MapTask$NewTrackingRecordReader.nextKeyValue(MapTask.java:556)
	at org.apache.hadoop.mapreduce.task.MapContextImpl.nextKeyValue(MapContextImpl.java:80)
	at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.nextKeyValue(WrappedMapper.java:91)
	at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
	at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:787)
	at org.apache.hadoop.mapred.MapTask.run(MapTask.java:341)
	at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:243)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Cause: the machine running the job is not the same host as the master where HDFS lives, so the namenode/datanode address mapping goes wrong; on top of that, my setup is pseudo-distributed.

Fix (my cluster is pseudo-distributed): add the virtual machine's IP and hostname to the hosts file on the local Windows machine (search online for how to edit the Windows hosts file).
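For example (the IP and hostname here are placeholders for your own VM), the line added to C:\Windows\System32\drivers\etc\hosts would look like:

    192.168.56.101    master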

Then add the setting below to the Configuration so the client resolves datanodes by hostname, and pass that conf in when creating the Job:

import org.apache.hadoop.conf.Configuration;

Configuration conf = new Configuration();
conf.set("dfs.client.use.datanode.hostname", "true");
@SuppressWarnings("deprecation")
Job job = new Job(conf, "filter");

package mapreduce;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance();
        job.setJobName("WordCount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(doMapper.class);
        job.setReducerClass(doReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        Path in = new Path("hdfs://localhost:9000/mymapreduce1/in/buyer_favorite1");
        Path out = new Path("hdfs://localhost:9000/mymapreduce1/out");
        FileInputFormat.addInputPath(job, in);
        FileOutputFormat.setOutputPath(job, out);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class doMapper extends Mapper<Object, Text, Text, IntWritable> {
        public static final IntWritable one = new IntWritable(1);
        public static Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString(), " ");
            word.set(tokenizer.nextToken());
            context.write(word, one);
        }
    }

    public static class doReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}

Original post: https://www.cnblogs.com/miria-486/p/9961945.html