Hadoop Word Count程序

Hadoop Word Count程序

pom.xml文件:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.stono</groupId>
    <artifactId>mr2</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>mr2</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <java.version>1.8</java.version>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <maven.build.timestamp.format>yyyy-MM-dd HH:mm:ss</maven.build.timestamp.format>
        
        <hadoop-mapreduce-client.version>2.7.2</hadoop-mapreduce-client.version>
        <hbase-client.version>1.1.2</hbase-client.version>
        <slf4j.version>1.7.25</slf4j.version>
        <kafka-client.version>0.10.2.1</kafka-client.version>
    </properties>
    
    <dependencies>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.8</version>
            <scope>system</scope>
            <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
        </dependency>
        <!-- 日志记录 Slf4j -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <!-- mapreduce -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop-mapreduce-client.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop-mapreduce-client.version}</version>
        </dependency>
        
        
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId> maven-assembly-plugin </artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass>com.stono.basis.JobRunner</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

map程序:

package com.test.basis;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/*
四个参数的含义
第一个参数:map中key-value的key的类型,默认值是输入行的偏移量
第二个参数:map中key-value的value的类型 在此需求中是某一行的内容(数据)
第三个参数:reduce中key-value中的key类型
第四个参数:redece的输出参数int
但是在mapreduce中涉及到了网络间的传输,所以需要序列化,而hadoop提供了相关的序列化类型
long-LongWritable
String-Text
int-IntWritable
 */


public class MapperWordCount extends Mapper<LongWritable, Text, Text, IntWritable>{
    
    /*重写mapper的map方法 编写自己的逻辑
     * key是偏移量不用管
     * value是一行的内容 例:hello zhangsan you you 
     * context是返回结果
     */
    @Override
    protected void map(LongWritable key, Text value,
            Context context)
            throws IOException, InterruptedException {
        
        String[] values=value.toString().split(" ");//对得到的一行数据进行切分 在此需求中是以空格进行切分
        
        for(String word:values){
            
            context.write(new Text(word), new IntWritable(1));//遍历数组 输出map的返回值 即<hello,1><zhangsan,1><you,1><you,1>
            
        }
        
    }
    

}

Reduce程序:

package com.test.basis;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/*
 * 此方法是WordCount的reduce
 * 参数:1.map阶段返回的key类型String-Text
 *         2.map阶段返回值中value的类型Int-IntWritable
 *         3.reduce阶段返回值中key的类型String-Text
 *         4.reduce阶段返回值中value的类型Int-IntWritable
 */

public class ReducerWordCount extends Reducer<Text, IntWritable,Text, IntWritable>{
    
    
    /*
     * 实现父类的reduce方法
     *key是一组key-value的相同的哪个key
     *values是一组key-value的所有value
     *key value 的情况,比如<hello,{1,1,1,1,1}>
     * 
     * context 返回值,<hello,5>
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
            Context context)throws IOException, InterruptedException {
        
            int count=0;//初始一个计数器
        
            for(IntWritable value:values){
                count ++;//对values进行遍历 每次加1
            }
            context.write(key,new IntWritable(count));//最后写返回值<hello,5>
            
            
        
    }
    
    
}

JobRunner程序:

package com.test.basis;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;



public class JobRunner {
    
    /*
     * 提交写好的mapreduce程序 当做一个Job进行提交
     * 
     */
    
    public static void main(String[] args) throws Exception {
        //读取classpath下的所有xxx-site.xml配置文件,并进行解析
        Configuration conf=new Configuration();

          FileSystem fs = FileSystem.get(conf);
          String s = "/user/weblogic/mr/output/";
          Path p = new Path(s);
          fs.delete(p,true); // 先把输出目录删了;否则重复运行会报错;

        
        Job wcjob=Job.getInstance(conf);//初始一个job
        
        //通过主类的类加载器机制获取到本job的所有代码所在的jar包
        wcjob.setJarByClass(JobRunner.class);
        
        //指定本job使用的mapper类
        wcjob.setMapperClass(MapperWordCount.class);
        
        //指定本job使用的reducer类
        wcjob.setReducerClass(ReducerWordCount.class);
        
        //指定mapper输出的kv的数据类型
        wcjob.setMapOutputKeyClass(Text.class);
        wcjob.setMapOutputValueClass(IntWritable.class);
        
        //指定reducer输出的kv数据类型
        wcjob.setOutputKeyClass(Text.class);
        wcjob.setOutputValueClass(IntWritable.class);
        
        //指定本job要处理的文件所在的路径
        FileInputFormat.setInputPaths(wcjob, new Path("/user/weblogic/mr/wordcount/"));
        
        //指定本job输出的结果文件放在哪个路径
        FileOutputFormat.setOutputPath(wcjob, new Path("/user/weblogic/mr/output/"));
        
        //将本job向hadoop集群提交执行
        boolean res=wcjob.waitForCompletion(true);
        
        System.exit(res?0:1);//执行成功的话正常退出系统执行有误则终止执行
    }

}

运行:直接mvn install,把jar文件上传到服务器,就可以hadoop jar test.jar了;

 使用maven生成的jar文件,有一个with-dependency的,那个可以直接运行,还有一个小一点的jar包,那里面没有包含主类信息,需要自己手动补充主类才可以。

在hadoop服务器环境下,其他的类信息是不缺少的。

 如果不进行手动的MANIFEST.MF文件修改,那么就在运行的时候添加上主类信息:hadoop jar mrbasis-s.jar com.stono.basis.JobRunner

简化的pom.xml文件:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.stono</groupId>
    <artifactId>mr01</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>mr01</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <java.version>1.7</java.version>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <maven.build.timestamp.format>yyyy-MM-dd HH:mm:ss</maven.build.timestamp.format>

        <hadoop-mapreduce-client.version>2.7.2</hadoop-mapreduce-client.version>
        <hbase-client.version>1.1.2</hbase-client.version>
        <slf4j.version>1.7.25</slf4j.version>
        <kafka-client.version>0.10.2.1</kafka-client.version>
    </properties>


    <dependencies>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.8</version>
            <scope>system</scope>
            <systemPath>D:/Java/jdk1.8.0_161/lib/tools.jar</systemPath>
        </dependency>
        <!-- 日志记录 Slf4j -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <!-- mapreduce -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop-mapreduce-client.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop-mapreduce-client.version}</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <addClasspath>false</addClasspath>
                            <mainClass>com.bsr.basis.JobRunner</mainClass> <!-- 你的主类名 -->
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
            <!--<plugin>-->
            <!--<artifactId> maven-assembly-plugin </artifactId>-->
            <!--<configuration>-->
            <!--<descriptorRefs>-->
            <!--<descriptorRef>jar-with-dependencies</descriptorRef>-->
            <!--</descriptorRefs>-->
            <!--<archive>-->
            <!--<manifest>-->
            <!--<mainClass>com.bsr.basis.JobRunner</mainClass>-->
            <!--</manifest>-->
            <!--</archive>-->
            <!--</configuration>-->
            <!--<executions>-->
            <!--<execution>-->
            <!--<id>make-assembly</id>-->
            <!--<phase>package</phase>-->
            <!--<goals>-->
            <!--<goal>single</goal>-->
            <!--</goals>-->
            <!--</execution>-->
            <!--</executions>-->
            <!--</plugin>-->
        </plugins>
    </build>

</project>
原文地址:https://www.cnblogs.com/stono/p/8295581.html