Reducejoin sample
The sample data files are the same ones used in the earlier sample join analysis post.
The previous example performed the join on the map side; this time we perform it on the reduce side.
The idea is to write a separate mapper for each source type, each one handling its own file. Both mappers emit studentno as the output key; the value carries the record's remaining fields plus a tag identifying which source it came from.
The mappers are then registered per input path with MultipleInputs, as sketched right below.
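For reference, here is a minimal sketch of that MultipleInputs wiring. It is not the code used in this post: the mapper class names (StudentFileMapper, ScoreFileMapper) and the per-file paths are placeholders I made up for illustration, and the rest of the driver setup is omitted.

// Sketch only: register one mapper per input path with MultipleInputs.
// StudentFileMapper, ScoreFileMapper and the two file paths are hypothetical placeholders.
// Needs: org.apache.hadoop.fs.Path, org.apache.hadoop.mapreduce.Job,
//        org.apache.hadoop.mapreduce.lib.input.MultipleInputs,
//        org.apache.hadoop.mapreduce.lib.input.TextInputFormat
Job job = new Job(conf, "reduce join");
MultipleInputs.addInputPath(job, new Path("/user/hadoop/student/student.txt"),
        TextInputFormat.class, StudentFileMapper.class);
MultipleInputs.addInputPath(job, new Path("/user/hadoop/student/score.txt"),
        TextInputFormat.class, ScoreFileMapper.class);
// No job.setMapperClass(...) call is needed; each path is read by its own mapper.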
On the reduce side, the student record and the exam scores that share the same studentno are delivered to the same reduce call, and their tagged fields all arrive in the value list.
The reducer only has to pull the two kinds of values apart and take their Cartesian product to produce the joined rows.
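To make the flow concrete, suppose (hypothetically, since the actual file contents are not reproduced here) the student file contains the line 201201,zhang san,20 and the score file contains 201201,math,85 and 201201,english,92. The reduce call for key 201201 then receives the tagged values student,zhang san,20 / score,math,85 / score,english,92, splits them into a student list and a score list, and the Cartesian product writes out:

201201    zhang san,20,math,85
201201    zhang san,20,english,92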
In the sample code below I did not use MultipleInputs. Instead I modified parts of TextInputFormat (and its record reader) so that each record is returned as the file name plus the current line.
A single mapper then branches on the file name to handle the two different files, tags each record with its source type, and emits it. (A common alternative that keeps the stock TextInputFormat is sketched after this paragraph.)
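That alternative reads the file name from the input split inside the mapper instead of customizing the input format. The sketch below is my own illustration under that assumption, not code from this post; the class name TaggingMapper is hypothetical, and it produces the same (studentno, tagged fields) output shape as the studentMapper further down.

// Sketch: obtain the source file name from the FileSplit in setup(), no custom InputFormat needed.
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class TaggingMapper extends Mapper<LongWritable, Text, Text, Text> {
    private String fileName;

    @Override
    protected void setup(Context context) {
        // The stock TextInputFormat hands the mapper a FileSplit; its path names the source file.
        fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
    }

    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        String row = line.toString();
        int comma = row.indexOf(",");
        String tag = fileName.contains("student") ? "student" : "score";
        // Emit (studentno, tag + rest of the row), same as studentMapper below.
        context.write(new Text(row.substring(0, comma)), new Text(tag + row.substring(comma)));
    }
}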
There are still many places in the code below that can be optimized; I will update it later.
package myexamples;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.LineReader;

public class reducejoin {

    // A TextInputFormat variant whose record key is the source file name instead of the byte offset.
    public static class MyTextInputFormat extends FileInputFormat<Text, Text> {
        @Override
        public MyLineRecordReader createRecordReader(InputSplit split,
                TaskAttemptContext context) {
            return new MyLineRecordReader();
        }

        @Override
        protected boolean isSplitable(JobContext context, Path file) {
            CompressionCodec codec = new CompressionCodecFactory(
                    context.getConfiguration()).getCodec(file);
            return codec == null;
        }
    }

    // Based on Hadoop's LineRecordReader, but emits (file name, line) pairs.
    public static class MyLineRecordReader extends RecordReader<Text, Text> {
        private static final Log LOG = LogFactory.getLog(LineRecordReader.class);

        private CompressionCodecFactory compressionCodecs = null;
        private long start;
        private long pos;
        private long end;
        private LineReader in;
        private int maxLineLength;
        private Text key = null;
        private Text value = null;
        Text filename = null;

        public void initialize(InputSplit genericSplit, TaskAttemptContext context)
                throws IOException {
            FileSplit split = (FileSplit) genericSplit;
            Configuration job = context.getConfiguration();
            this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
                    Integer.MAX_VALUE);
            start = split.getStart();
            end = start + split.getLength();
            final Path file = split.getPath();
            key = new Text(file.getName()); // the key is the file name for every record
            compressionCodecs = new CompressionCodecFactory(job);
            final CompressionCodec codec = compressionCodecs.getCodec(file);

            // open the file and seek to the start of the split
            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream fileIn = fs.open(split.getPath());
            boolean skipFirstLine = false;
            if (codec != null) {
                in = new LineReader(codec.createInputStream(fileIn), job);
                end = Long.MAX_VALUE;
            } else {
                if (start != 0) {
                    skipFirstLine = true;
                    --start;
                    fileIn.seek(start);
                }
                in = new LineReader(fileIn, job);
            }
            if (skipFirstLine) { // skip first line and re-establish "start".
                start += in.readLine(new Text(), 0,
                        (int) Math.min((long) Integer.MAX_VALUE, end - start));
            }
            this.pos = start;
        }

        public boolean nextKeyValue() throws IOException {
            if (key == null) {
                // key already holds the file name set in initialize()
            }
            if (value == null) {
                value = new Text();
            }
            int newSize = 0;
            while (pos < end) {
                newSize = in.readLine(value, maxLineLength, Math.max(
                        (int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
                if (newSize == 0) {
                    break;
                }
                pos += newSize;
                if (newSize < maxLineLength) {
                    break;
                }
                // line too long. try again
                LOG.info("Skipped line of size " + newSize + " at pos "
                        + (pos - newSize));
            }
            if (newSize == 0) {
                key = null;
                value = null;
                return false;
            } else {
                return true;
            }
        }

        @Override
        public Text getCurrentKey() {
            return key;
        }

        @Override
        public Text getCurrentValue() {
            return value;
        }

        /**
         * Get the progress within the split
         */
        public float getProgress() {
            if (start == end) {
                return 0.0f;
            } else {
                return Math.min(1.0f, (pos - start) / (float) (end - start));
            }
        }

        public synchronized void close() throws IOException {
            if (in != null) {
                in.close();
            }
        }
    }

    // Single mapper: branches on the file name (the input key) and tags each record
    // with its source before emitting (studentno, tagged fields).
    public static class studentMapper extends Mapper<Text, Text, Text, Text> {
        public void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            Text newvalue = null;
            String strv = value.toString().substring(value.toString().indexOf(","));
            if (key.toString().contains("student")) // student file
                newvalue = new Text("student" + strv);
            else
                newvalue = new Text("score" + strv);
            Text newkey = new Text(value.toString().substring(0,
                    value.toString().indexOf(",")));
            context.write(newkey, newvalue);
        }
    }

    // Reducer: separates the tagged values into two lists and emits their Cartesian product.
    public static class studentReducer extends Reducer<Text, Text, Text, Text> {
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            List<String> students = new ArrayList<String>();
            List<String> scores = new ArrayList<String>();
            for (Text value : values)
                if (value.toString().startsWith("student"))
                    students.add(value.toString().substring(8));
                else
                    scores.add(value.toString().substring(6));
            // split real results
            for (String student : students)
                for (String score : scores)
                    context.write(key, new Text(student + "," + score));
        }
    }

    public static void main(String[] args) throws Exception {
        args = "hdfs://namenode:9000/user/hadoop/student/ hdfs://namenode:9000/user/hadoop/reducejoinout"
                .split(" ");
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: reducejoin <in> <out>");
            System.exit(2);
        }
        myUtils.myUtils.DeleteFolder(conf, otherArgs[1]);
        conf.set("io.sort.mb", "10");
        Job job = new Job(conf, "reduce join");
        job.setInputFormatClass(MyTextInputFormat.class);
        // job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setJarByClass(reducejoin.class);
        job.setMapperClass(studentMapper.class);
        job.setReducerClass(studentReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}