自定义 InputFormat:按文件划分输入分片(每个非空文件对应一个 split)
1 package seven.ili; 2 3 import org.apache.hadoop.conf.Configuration; 4 import org.apache.hadoop.fs.BlockLocation; 5 import org.apache.hadoop.fs.FileStatus; 6 import org.apache.hadoop.fs.FileSystem; 7 import org.apache.hadoop.fs.Path; 8 import org.apache.hadoop.io.IntWritable; 9 import org.apache.hadoop.io.LongWritable; 10 import org.apache.hadoop.io.Text; 11 import org.apache.hadoop.mapreduce.*; 12 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 13 import org.apache.hadoop.mapreduce.lib.input.FileSplit; 14 import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; 15 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 16 import org.apache.hadoop.util.GenericOptionsParser; 17 import org.eclipse.core.internal.resources.FileState; 18 19 import java.io.IOException; 20 import java.util.ArrayList; 21 import java.util.List; 22 import java.util.StringTokenizer; 23 24 /** 25 * Created with IntelliJ IDEA. 26 * User: Isaac Li 27 * Date: 12/17/12 28 * Time: 4:00 PM 29 * To change this template use File | Settings | File Templates. 
30 */ 31 public class Test { 32 33 public static class TokenizerMapper 34 extends Mapper<Object, Text, Text, IntWritable> { 35 private final static IntWritable one = new IntWritable(1); 36 private Text word = new Text(); 37 public void map(Object key, Text value, Context context 38 ) throws IOException, InterruptedException { 39 FileSplit fileSplit = (FileSplit)context.getInputSplit(); 40 String filename = fileSplit.getPath().getName(); 41 System.out.println("File name "+filename); 42 System.out.println("Directory and File name"+fileSplit.getPath().toString()); 43 44 StringTokenizer itr = new StringTokenizer(value.toString()); 45 while (itr.hasMoreTokens()) { 46 word.set(itr.nextToken()); 47 context.write(word, one); 48 } 49 } 50 } 51 52 public static class IntSumReducer 53 extends Reducer<Text,IntWritable,Text,IntWritable> { 54 private IntWritable result = new IntWritable(); 55 56 public void reduce(Text key, Iterable<IntWritable> values, 57 Context context 58 ) throws IOException, InterruptedException { 59 int sum = 0; 60 for (IntWritable val : values) { 61 sum += val.get(); 62 } 63 result.set(sum); 64 context.write(key, result); 65 } 66 } 67 68 public static class SimpleTextFileInputFormat extends FileInputFormat<LongWritable, Text>{ 69 public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context){ 70 return new LineRecordReader(); 71 } 72 public List<InputSplit> getSplits(JobContext job) throws IOException { 73 List<InputSplit> splits = new ArrayList<InputSplit>(); 74 for (FileStatus file: listStatus(job)){ 75 Path path = file.getPath(); 76 FileSystem fs = path.getFileSystem(job.getConfiguration()); 77 long length = file.getLen(); 78 BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); 79 if (length != 0){ 80 splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts())); 81 } 82 } 83 return splits; 84 } 85 } 86 87 public static void main(String[] args) throws Exception { 88 Configuration 
conf = new Configuration(); 89 String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 90 if (otherArgs.length < 2) { 91 System.err.println("Usage: wordcount <in> <out>"); 92 System.exit(2); 93 } 94 Job job = new Job(conf, "word count3"); 95 job.setJarByClass(Test.class); 96 job.setInputFormatClass(SimpleTextFileInputFormat.class); 97 job.setMapperClass(TokenizerMapper.class); 98 //job.setCombinerClass(IntSumReducer.class); 99 job.setReducerClass(IntSumReducer.class); 100 job.setOutputKeyClass(Text.class); 101 job.setOutputValueClass(IntWritable.class); 102 job.setNumReduceTasks(0); 103 FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 104 FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 105 System.exit(job.waitForCompletion(true) ? 0 : 1); 106 } 107 }
主要实现在 SimpleTextFileInputFormat 这个类中。