hadoop wordcount

使用java写出wordcount

1.创建项目

加入依赖

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.xiaodao</groupId>
    <artifactId>hadoop001</artifactId>
    <version>1.0</version>

    <properties>
        <hadoop.version>2.6.0</hadoop.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.2</version>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.1.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!-- 3.1.2 -->
        <!--        <dependency>-->
        <!--            <groupId>org.apache.hadoop</groupId>-->
        <!--            <artifactId>hadoop-hdfs-client</artifactId>-->
        <!--            <version>2.8.0</version>-->
        <!--        </dependency>-->

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.3</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-app</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-hs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>


        <!--        <dependency>-->
        <!--            <groupId>org.slf4j</groupId>-->
        <!--            <artifactId>slf4j-api</artifactId>-->
        <!--            <version>1.7.25</version>-->
        <!--        </dependency>-->
        <!--        <dependency>-->
        <!--            <groupId>log4j</groupId>-->
        <!--            <artifactId>log4j</artifactId>-->
        <!--            <version>1.2.17</version>-->
        <!--        </dependency>-->
    </dependencies>


</project>

View Code

在resouce文件夹下:

启动log文件的配置:

log4j.rootLogger=DEBUG,console,FILE

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.threshold=INFO
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} [%5p] - %c -%F(%L) -%m%n

log4j.appender.FILE=org.apache.log4j.RollingFileAppender
log4j.appender.FILE.Append=true
log4j.appender.FILE.File=logs/log4jtest.log
log4j.appender.FILE.Threshold=INFO
log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
log4j.appender.FILE.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} [%5p] - %c -%F(%L) -%m%n
log4j.appender.FILE.MaxFileSize=10MB

mapred-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>local</value>
    </property>
</configuration>

剩下的配置文件.就是你集群中的配置文件copy进来即可.

mapper代码

public class WordCountMap extends Mapper<LongWritable, Text,Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split(" ");
        for (String s : split) {
            context.write(new Text(s),new IntWritable(1));
        }
    }
}

reducer代码

public class WordCountMap extends Mapper<LongWritable, Text,Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split(" ");
        for (String s : split) {
            context.write(new Text(s),new IntWritable(1));
        }
    }
}

main 方法:

public class WordCountMain {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        //1,一个输入路径 2.一个输出路径
        if(args.length !=2 || args ==null){
            System.out.println("路径为空");
            System.exit(0);
        }
        Configuration configuration = new Configuration();
        //调用getinstance 生成job 方法

        Job job = Job.getInstance(configuration, WordCountMain.class.getSimpleName());
        //打jar
        job.setJarByClass(WordCountMain.class);

        //1设置默认格式.默认就是这个格式 InputFormat 可以传入一些子类
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        //2 设置输入输出路径

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 3  设置map和reduce类
        job.setMapperClass(WordCountMap.class);
        job.setReducerClass(WordCountReduce.class);

//        job.setCombinerClass(WordCountReduce.class);
            //如果map reduce 输入的个是一致,这里可以不用写
//        job.setMapOutputKeyClass(Text.class);
//           job.setOutputValueClass(IntWritable.class);
        //设置 reduce task的输出key/value格式
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        //提供作业
        job.waitForCompletion(true);
    }
}

参数

出入文件

nancy 22 8000
ketty 22 9000
stone 19 10000
green 19 10000
white 30 29000
socrates 29 40000

输入和输出都在hdfs上.所以我们的参数为

hdfs://xiaodao:9000/salary.txt hdfs://xiaodao:9000/0905wordcount

hadoop jar /Users/xuyuanfang/IdeaProjects/hadoop001/target/hadoop001-1.0.jar com.xiaodao.wordcount.WordCountMain hdfs://xiaodao:9000/salary.txt hdfs://xiaodao:9000/0905wordcount2

运行之后就可以了.