自定义InputFormat合并小文件
1.1 需求
无论是HDFS还是MapReduce,处理大量小文件时效率都会明显下降;但在实践中又难免遇到需要处理大量小文件的场景,因此需要有相应的解决方案
1.2 分析
小文件的优化无非以下几种方式:
1、 在数据采集的时候,就将小文件或小批数据合成大文件再上传HDFS
2、 在业务处理之前,在HDFS上使用mapreduce程序对小文件进行合并
3、 在mapreduce处理时,可采用CombineFileInputFormat(如CombineTextInputFormat)将多个小文件合并到同一个切片中,以提高效率
1.3 实现
本节实现的是上述第二种方式
程序的核心机制:
自定义一个InputFormat
改写RecordReader,实现一次读取一个完整文件封装为KV
在输出时使用SequenceFileOutputFormat输出合并文件
代码如下:
自定义InputFormat
/**
 * Input format that hands every input file to the job as a single,
 * unsplit record (key: {@link NullWritable}, value: the file's bytes).
 */
public class MyInputFormat extends FileInputFormat<NullWritable,BytesWritable> {

    /**
     * Files are never split, so each file ends up in exactly one split.
     *
     * @param context  the job context
     * @param filename the file being considered for splitting
     * @return always {@code false}
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /**
     * Builds the record reader for the given split.
     *
     * @param inputSplit         the split to read
     * @param taskAttemptContext the context of the running task attempt
     * @return an initialized {@link MyRecordReader}
     * @throws IOException          if initializing the reader fails
     * @throws InterruptedException if initialization is interrupted
     */
    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Create the custom reader and pass it the split and the task context
        // before handing it back.
        RecordReader<NullWritable, BytesWritable> reader = new MyRecordReader();
        reader.initialize(inputSplit, taskAttemptContext);
        return reader;
    }
}
自定义RecordReader
/**
 * Record reader that exposes one whole file as a single record.
 *
 * <p>The key is always {@link NullWritable}; the value is a
 * {@link BytesWritable} holding the complete content of the file described
 * by the split. Exactly one record is produced per split.
 */
public class MyRecordReader extends RecordReader<NullWritable,BytesWritable>{
    private Configuration configuration = null;
    private FileSplit fileSplit = null;
    // Becomes true once the single record of this split has been produced.
    private boolean processed = false;
    private final BytesWritable bytesWritable = new BytesWritable();

    /**
     * Remembers the split and the job configuration for later use.
     *
     * @param inputSplit         the split to read; must be a {@link FileSplit}
     * @param taskAttemptContext the context supplying the configuration
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        fileSplit = (FileSplit) inputSplit;
        configuration = taskAttemptContext.getConfiguration();
    }

    /**
     * Reads the whole file into the value on the first call.
     *
     * @return {@code true} on the first call (a record is available),
     *         {@code false} on every later call
     * @throws IOException if the file is too large to buffer in a single
     *                     byte array or cannot be read completely
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (processed) {
            return false;
        }

        // Guard the narrowing cast below: a length above Integer.MAX_VALUE
        // would otherwise be silently truncated.
        long length = fileSplit.getLength();
        if (length > Integer.MAX_VALUE) {
            throw new IOException("File too large to read as a single record ("
                    + length + " bytes): " + fileSplit.getPath());
        }
        byte[] bytes = new byte[(int) length];

        // Resolve the file system from the path itself so that a path on a
        // non-default file system is opened correctly. The FileSystem object
        // is deliberately NOT closed: it may be a shared instance, and
        // closing it could break other users in the same JVM.
        FileSystem fileSystem = fileSplit.getPath().getFileSystem(configuration);

        // The stream is closed even when reading fails.
        try (FSDataInputStream inputStream = fileSystem.open(fileSplit.getPath())) {
            IOUtils.readFully(inputStream, bytes, 0, bytes.length);
        }

        bytesWritable.set(bytes, 0, bytes.length);
        processed = true;
        return true;
    }

    /**
     * @return the key of the current record, always the {@link NullWritable} singleton
     */
    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    /**
     * @return the value of the current record: the bytes of the whole file
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return bytesWritable;
    }

    /**
     * @return {@code 1.0f} once the single record has been read, {@code 0.0f} before
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    /**
     * Nothing to release here: the input stream is closed inside
     * {@link #nextKeyValue()} and the file system is left open on purpose.
     */
    @Override
    public void close() throws IOException {
        // Intentionally empty.
    }
}
Mapper类:
/**
 * Mapper that tags each whole-file record with the name of the file it
 * came from: output key is the file name, output value is the incoming
 * value passed through unchanged.
 */
public class SequenceFileMapper extends Mapper<NullWritable,BytesWritable,Text,BytesWritable> {

    /**
     * Emits one (file name, value) pair per input record.
     *
     * @param key     unused input key
     * @param value   the input value, written out as-is
     * @param context task context; supplies the current input split and receives the output
     */
    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // The split being processed identifies the source file; its name is the output key.
        Path sourcePath = ((FileSplit) context.getInputSplit()).getPath();
        Text outputKey = new Text(sourcePath.getName());

        // Write the output key and value to the context.
        context.write(outputKey, value);
    }
}
主类:
/**
 * Driver for the small-file merge job: reads each input file as one record
 * through {@link MyInputFormat} and writes (file name, file bytes) pairs
 * with {@link SequenceFileOutputFormat}.
 *
 * <p>Usage: {@code JobMain [inputPath [outputPath]]}. When an argument is
 * omitted, the built-in default path is used.
 */
public class JobMain extends Configured implements Tool {
    // Defaults used when no paths are supplied on the command line.
    private static final String DEFAULT_INPUT_PATH = "file:///D:\\input\\myInputformat_input";
    private static final String DEFAULT_OUTPUT_PATH = "file:///D:\\out\\myinputformat_out";

    /**
     * Configures and runs the job.
     *
     * @param args optional: {@code args[0]} input path, {@code args[1]} output path
     * @return 0 if the job completed successfully, 1 otherwise
     * @throws Exception if job setup or execution fails
     */
    @Override
    public int run(String[] args) throws Exception {
        String inputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        // 1: create the job
        Job job = Job.getInstance(super.getConf(), "sequence_file_job");
        // Identify the jar containing the job classes so they can be located
        // when the job does not run in the local JVM.
        job.setJarByClass(JobMain.class);

        // 2: configure the job
        // Input format and input path
        job.setInputFormatClass(MyInputFormat.class);
        MyInputFormat.addInputPath(job, new Path(inputPath));

        // Mapper class and its output types
        job.setMapperClass(SequenceFileMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        // No Reducer class is configured, but the final output types must still be declared
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        // Output format and output path
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

        // 3: wait for the job to finish
        boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /**
     * Entry point: runs the tool through {@link ToolRunner} and exits with its status code.
     *
     * @param args command-line arguments, see the class description
     */
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}