The program consists of three main classes: WcDriver, WcMapper, and WcReducer.
settings.xml:

<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0
                              https://maven.apache.org/xsd/settings-1.0.0.xsd">
    <mirrors>
        <!-- Aliyun mirror -->
        <mirror>
            <id>alimaven</id>
            <name>aliyun maven</name>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
            <mirrorOf>central</mirrorOf>
        </mirror>
    </mirrors>
</settings>
pom.xml:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.atguigu</groupId>
    <artifactId>mapreduce1205</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>RELEASE</version>
        </dependency>
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.8.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>jdk.tools</groupId>
            <artifactId>jdk.tools</artifactId>
            <version>1.8</version>
            <scope>system</scope>
            <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
        </dependency>
    </dependencies>
</project>
WcDriver:

package com.atguigu.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WcDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Boilerplate driver code
        // 1. Get a Job instance; it acts like the Context for the whole MR pipeline
        Job job = Job.getInstance(new Configuration());

        // 2. Set the classpath (tell Hadoop which jar contains this job's classes)
        job.setJarByClass(WcDriver.class);

        // 3. Set the Mapper and the Reducer
        job.setMapperClass(WcMapper.class);
        job.setReducerClass(WcReducer.class);

        // 4. Set the output types of the Mapper and the Reducer
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 5. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6. Submit the job and wait for it to finish
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
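One practical caveat: MapReduce refuses to start if the output directory already exists. Below is a minimal sketch, not part of the original tutorial, of how the driver could clear the output path beforehand using the standard FileSystem API; it assumes an extra import of org.apache.hadoop.fs.FileSystem and goes inside WcDriver.main() before the job is submitted.

// Sketch: optional cleanup inside WcDriver.main(), before job submission
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);

Path outputPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);   // needs: import org.apache.hadoop.fs.FileSystem;
if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);        // true = delete recursively
}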
WcMapper:

package com.atguigu.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WcMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /**
     * LongWritable: input key supplied by the framework, the byte offset of this line within the file
     * Text:         input value supplied by the framework, the content of this line
     * Text:         the Mapper's output key type
     * IntWritable:  the Mapper's output value type
     */
    private Text word = new Text();
    private IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get one line of data
        String line = value.toString();

        // Split the line on spaces
        String[] words = line.split(" ");

        // Walk the array and hand each word to the framework as (word, 1)
        for (String word : words) {
            // context.write(new Text(word), new IntWritable(1)); // creates lots of objects and slows the job down
            this.word.set(word);
            context.write(this.word, this.one);
        }
    }
}
WcReducer:

package com.atguigu.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WcReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable total = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Accumulate the counts
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }

        // Wrap the result and write it out
        total.set(sum);
        context.write(key, total);
    }
}
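Because WcReducer only sums integers (an associative, commutative operation), the same class can also be registered as a combiner so counts are pre-aggregated on the map side and less data crosses the shuffle. This is an optional optimization not present in the original driver; a one-line sketch of where it would go:

// In WcDriver.main(), after setting the Mapper and Reducer (optional optimization):
job.setCombinerClass(WcReducer.class);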
To run the job from IDEA, open Edit Configurations for WcDriver (near the top right of the window) and fill in Program arguments with the two command-line arguments: the input file path and an output path that does not exist yet.

Note:
Packaging is done from the Maven panel at the far right of the IDEA window: choose Lifecycle -> package to build the jar automatically. The jar is generated in the project's target folder and can then be copied to the cluster for a test run.
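On the cluster, the packaged jar is typically submitted with the hadoop jar command. A hedged example, assuming the default Maven jar name derived from this pom and hypothetical HDFS paths /input and /output: hadoop jar mapreduce1205-1.0-SNAPSHOT.jar com.atguigu.wordcount.WcDriver /input /output. The last two arguments map to args[0] and args[1] in WcDriver, so the output path must not exist before the run.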