IDEA自定义OutputFormat案例实操

需求:
过滤输入的log日志,包含atguigu的网站输出到e:/atguigu.log,不包含atguigu的网站输出到e:/other.log。

需求分析:
1.创建类MyRecordWriter继承RecordWriter
2.创建两个输出流,分别对应两个输出文件

MyRecordWriter类:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
package com.atguigu.outputformat;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;


public class MyRecordWriter extends RecordWriter<LongWritable, Text> {

    // One stream per target file; both are opened in initialize() and
    // closed in close().
    private FSDataOutputStream atguigu;
    private FSDataOutputStream other;

    /**
     * Resolves the job's configured output directory from the framework
     * context and opens the two target streams (atguigu.log / other.log)
     * underneath it.
     *
     * @param taskAttemptContext framework-supplied job configuration context
     * @throws IOException if either output stream cannot be created
     */
    public void initialize(TaskAttemptContext taskAttemptContext) throws IOException {
        // Standard idiom: read the framework's configured output directory.
        String outdir = taskAttemptContext.getConfiguration().get(FileOutputFormat.OUTDIR);
        // Standard idiom: obtain the FileSystem for this job's configuration.
        FileSystem fileSystem = FileSystem.get(taskAttemptContext.getConfiguration());
        atguigu = fileSystem.create(new Path(outdir + "/atguigu.log"));
        other = fileSystem.create(new Path(outdir + "/other.log"));
    }

    /**
     * Writes one KV pair; called once per record. Lines containing
     * "atguigu" go to atguigu.log, all other lines go to other.log.
     *
     * @param longWritable record key (byte offset; not used for routing)
     * @param text         record value (one log line)
     * @throws IOException          if the underlying write fails
     * @throws InterruptedException declared by the RecordWriter contract
     */
    @Override
    public void write(LongWritable longWritable, Text text) throws IOException, InterruptedException {
        String out = text.toString() + "\n";
        // Route the line to the matching stream.
        FSDataOutputStream target = out.contains("atguigu") ? atguigu : other;
        // Fix: encode with an explicit charset instead of the platform
        // default, so output bytes do not depend on the JVM's default
        // encoding (pre-Java-18 getBytes() uses the platform charset).
        target.write(out.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Closes both output streams; IOUtils.closeStream is null-safe and
     * swallows close errors.
     *
     * @param taskAttemptContext task context (unused)
     * @throws IOException          declared by the RecordWriter contract
     * @throws InterruptedException declared by the RecordWriter contract
     */
    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        IOUtils.closeStream(atguigu);
        IOUtils.closeStream(other);
    }
}

MyOutputFormat类:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
package com.atguigu.outputformat;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class MyOutputFormat extends FileOutputFormat<LongWritable, Text> {

    /**
     * Creates the custom RecordWriter for this task and hands it the
     * framework context so it can locate the job's output directory and
     * open its output streams.
     *
     * @param taskAttemptContext framework-supplied task context
     * @return a writer that splits records into atguigu.log / other.log
     * @throws IOException          if the writer cannot open its streams
     * @throws InterruptedException declared by the OutputFormat contract
     */
    @Override
    public RecordWriter<LongWritable, Text> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        MyRecordWriter writer = new MyRecordWriter();
        // Pass the context along so the writer can resolve OUTDIR.
        writer.initialize(taskAttemptContext);
        return writer;
    }
}

OutputDriver类:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
package com.atguigu.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class OutputDriver {

    /**
     * Configures and submits the MapReduce job that filters log lines
     * through the custom MyOutputFormat, then exits with 0 on success
     * and 1 on failure.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(OutputDriver.class);

        // Route all output through the custom format, which splits
        // records into atguigu.log and other.log under the output dir.
        job.setOutputFormatClass(MyOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path("F:\\input"));
        FileOutputFormat.setOutputPath(job, new Path("F:\\output"));

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}