MapReduce-TextInputFormat 切片机制
MapReduce 默认使用 TextInputFormat 进行切片,其机制如下
(1)简单地按照文件的内容长度进行切片 (2)切片大小,默认等于Block大小,可单独设置 (3)切片时不考虑数据集整体,而是逐个针对每一个文件单独切片 例如: (1)输入数据有两个文件: filel.txt 320M file2.txt 10M (2)经过 FilelnputFormat(TextInputFormat为其实现类)的切片机制运算后,形成的切片信息如下: filel.txt.splitl--0~128 filel.txt.split2--128~256 filel.txt.split3--256~320 file2.txt.splitl--0~10M
测试读取数据的方式
输入数据(中间为空格,末尾为换行符)
map 阶段的 k-v
可以看出 k 为偏移量,v 为一行的值,即 TextInputFormat 按行读取
以 WordCount 为例进行测试,测试切片数
测试数据,三个相同的文件
测试代码
package com.mapreduce.wordcount; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.log4j.BasicConfigurator; import java.io.IOException; import java.util.StringTokenizer; public class WordCount { static { try { // 设置 HADOOP_HOME 环境变量 System.setProperty("hadoop.home.dir", "D:/DevelopTools/hadoop-2.9.2/"); // 日志初始化 BasicConfigurator.configure(); // 加载库文件 System.load("D:/DevelopTools/hadoop-2.9.2/bin/hadoop.dll"); } catch (UnsatisfiedLinkError e) { System.err.println("Native code library failed to load.\n" + e); System.exit(1); } } public static void main(String[] args) throws Exception { args = new String[]{"D:\\tmp\\input2", "D:\\tmp\\456"}; Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "word count"); job.setJarByClass(WordCount.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); // 设置 InputFormat,默认为 TextInputFormat.class,这里显式设置下,后面设置切片大小 job.setInputFormatClass(TextInputFormat.class); TextInputFormat.setMinInputSplitSize(job, 1); TextInputFormat.setMaxInputSplitSize(job, 1024 * 1024 * 128); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); } public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> { private final static IntWritable one = new IntWritable(1); private Text word = new Text(); @Override public void map(Object key, Text value, Context context) throws IOException, InterruptedException { // 查看 k-v System.out.println(key + "\t" + value); StringTokenizer itr = new StringTokenizer(value.toString()); while (itr.hasMoreTokens()) { word.set(itr.nextToken()); context.write(word, one); } } } public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> { private IntWritable result = new IntWritable(); @Override public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable val : values) { sum += val.get(); } result.set(sum); context.write(key, result); } } }