IDEA自定义InputFormat案例实操

需求:
将多个小文件合并成一个SequenceFile文件(SequenceFile是Hadoop用来存储二进制形式key-value对的文件格式)。SequenceFile里面存储着多个文件,存储形式为:以文件路径+名称作为key,以文件内容作为value。

1.自定义一个类继承FileInputFormat。重写isSplitable()方法并返回false,保证单个文件不会被切分;重写createRecordReader()方法,创建并返回自定义的RecordReader对象(其初始化由框架调用initialize()完成)
2.自定义RecordReader,实现一次读取一个完整文件,并封装为一组KV值
3.在Driver中通过job.setInputFormatClass()和job.setOutputFormatClass()分别设置输入格式和输出格式

WholeFileInputFormat类:

package com.atguigu.inputformat;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

/**
 * Input format that presents each input file as a single record:
 * the key is a {@link Text} and the value is a {@link BytesWritable}.
 *
 * <p>Files are never split, and every split is read by a
 * {@link WholeFileRecordReader}.
 */
public class WholeFileInputFormat extends FileInputFormat<Text, BytesWritable> {

    /**
     * Reports every file as non-splittable.
     *
     * @param context  the job context (unused)
     * @param filename the file being considered (unused)
     * @return always {@code false}
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /**
     * Creates the record reader used for a split.
     *
     * @param inputSplit         the split to be read (not inspected here)
     * @param taskAttemptContext the task context (not inspected here)
     * @return a new, not yet initialized {@link WholeFileRecordReader}
     * @throws IOException          declared by the overridden method; not thrown here
     * @throws InterruptedException declared by the overridden method; not thrown here
     */
    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        return new WholeFileRecordReader();
    }
}

WholeFileRecordReader类:

package com.atguigu.inputformat;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
* 自定义RR,处理一个文件:把这个文件直接读成一个KV值
*/
public class WholeFileRecordReader extends RecordReader<Text, BytesWritable> {

private boolean notRead = true;

private Text key = new Text();
private BytesWritable value = new BytesWritable();
private FSDataInputStream inputStream;
private FileSplit fs;


/**
* 初始化方法,框架会在开始的时候调用一次
* @param inputSplit
* @param taskAttemptContext
* @throws IOException
* @throws InterruptedException
*/
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
//套路化操作
//inputSplit就是当前的切片,taskAttemptContext就是当前这个job的信息

//转换切片到文件切片
fs = (FileSplit) inputSplit;
//通过切片获取路径
Path path = fs.getPath();
//通过路径获取文件系统
FileSystem fileSystem = path.getFileSystem(taskAttemptContext.getConfiguration());
//开流
inputStream = fileSystem.open(path);
}

/**
* 读取下一组KV值
* @return 如果读到了,返回true;读完了,返回false
* @throws IOException
* @throws InterruptedException
*/
public boolean nextKeyValue() throws IOException, InterruptedException {
if(notRead){
//具体读文件的过程
//读Key
key.set(fs.getPath().toString());

//读Value
byte[] buf = new byte[(int) fs.getLength()];//长度和文件一样长
inputStream.read(buf);
value.set(buf,0,buf.length);

notRead = false;
return true;
}else{
return false;
}

}

/**
* 获取当前读到的Key
* @return 当前的Key
* @throws IOException
* @throws InterruptedException
*/
public Text getCurrentKey() throws IOException, InterruptedException {
return key;
}

/**
* 获取当前读到的Value
* @return 当前的Value
* @throws IOException
* @throws InterruptedException
*/
public BytesWritable getCurrentValue() throws IOException, InterruptedException {
return value;
}

/**
* 当前数据读取的进度
* @return 当前进度
* @throws IOException
* @throws InterruptedException
*/
public float getProgress() throws IOException, InterruptedException {
return notRead ? 0 : 1;
}

/**
* 关闭资源
* @throws IOException
*/
public void close() throws IOException {
//套路化关流
IOUtils.closeStream(inputStream);
}
}

WholeFileDriver类:

package com.atguigu.inputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;

/**
 * Driver that configures and submits the job: input is read through
 * {@link WholeFileInputFormat} and output is written through
 * {@link SequenceFileOutputFormat}.
 *
 * <p>No mapper or reducer class is configured here, so the job runs with the
 * framework defaults.
 */
public class WholeFileDriver {

    /** Input directory used when no first argument is given. */
    private static final String DEFAULT_INPUT = "f:/input";

    /** Output directory used when no second argument is given. */
    private static final String DEFAULT_OUTPUT = "f:/output";

    /**
     * Entry point. Exits with status 0 if the job succeeds, 1 otherwise.
     *
     * @param args optional: {@code args[0]} is the input path, {@code args[1]} is the
     *             output path; a missing argument falls back to the built-in default
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
     * @throws InterruptedException   propagated from {@code Job.waitForCompletion}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String inputDir = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputDir = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Job job = Job.getInstance(new Configuration());

        job.setJarByClass(WholeFileDriver.class);

        // Map output and final output share the same key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        // Read with the custom whole-file format, write as a sequence file.
        job.setInputFormatClass(WholeFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(inputDir));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}