Building an Inverted Index on Hadoop
0. References:
Code reference 1: http://www.pudn.com/downloads212/sourcecode/unix_linux/detail999273.html
Theory references 2: http://zhangyu8374.javaeye.com/blog/86307 and http://nything.javaeye.com/blog/411787
1. Analysis
Suppose we have three files, file0, file1, and file2, each containing some text; for example, file0 contains only the sentence "we are happy". An ordinary (forward) index records, for a given file, the position of every word inside that file. For example, the index of file0 could be (we, 0), (are, 1), (happy, 2): in each key-value pair the key is a word and the value is that word's position within the file. An inverted index is exactly the opposite: given multiple files, for each word we want to find every position where it appears across all of those files. We can run the following experiment:
Create a local folder named IndexTest and create 3 files inside it, with the following contents.
* T0 = "it is what it is"
* T1 = "what is it"
* T2 = "it is a banana"
Here T0, T1, and T2 are the file names, and the quoted strings are their contents. Upload the IndexTest folder to DFS (a programmatic way to do this is sketched just below), then run the inverted index program; the program itself is listed in the code examples of section 2.
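The upload can be done with the hadoop fs -put command, or from Java through the Hadoop FileSystem API. The following is only a minimal sketch of the latter; the class name UploadIndexTest and the destination path are placeholders for illustration, not part of the original program:

package pa4;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical helper, not part of the original program: copies the local
// IndexTest directory into HDFS so the MapReduce job can read it.
public class UploadIndexTest {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();   // picks up core-site.xml etc. from the classpath
        FileSystem fs = FileSystem.get(conf);
        // roughly equivalent to: hadoop fs -put IndexTest IndexTest
        fs.copyFromLocalFile(new Path("IndexTest"), new Path("IndexTest"));
        fs.close();
    }
}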
The final output is as follows (each posting is a (file name, token position) pair; positions start at 1):
a (T2, 3)
banana (T2, 4)
is (T2, 2) (T0, 2) (T0, 5) (T1, 2)
it (T1, 3) (T2, 1) (T0, 1) (T0, 4)
what (T0, 3) (T1, 1)
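For intuition, the same postings can be computed without Hadoop at all. The following standalone sketch is illustrative only (the class SimpleInvertedIndex and its method are made up, not part of the job below); it builds the index for T0, T1, T2 in memory using the same 1-based token positions. The set of postings per word matches the output above, although the order of postings on each line may differ, since the MapReduce job does not sort its values.

import java.util.*;

public class SimpleInvertedIndex {
    // word -> list of "(docId, position)" postings
    public static Map<String, List<String>> buildInvertedIndex(Map<String, String> docs) {
        Map<String, List<String>> index = new TreeMap<String, List<String>>();
        for (Map.Entry<String, String> doc : docs.entrySet()) {
            StringTokenizer tokens = new StringTokenizer(doc.getValue());
            int pos = 0;
            while (tokens.hasMoreTokens()) {
                String word = tokens.nextToken().toLowerCase();
                pos++;   // 1-based position, as in the MapReduce job's output
                if (!index.containsKey(word)) {
                    index.put(word, new ArrayList<String>());
                }
                index.get(word).add("(" + doc.getKey() + ", " + pos + ")");
            }
        }
        return index;
    }

    public static void main(String[] args) {
        Map<String, String> docs = new LinkedHashMap<String, String>();
        docs.put("T0", "it is what it is");
        docs.put("T1", "what is it");
        docs.put("T2", "it is a banana");
        // prints e.g.  a [(T2, 3)]
        for (Map.Entry<String, List<String>> e : buildInvertedIndex(docs).entrySet()) {
            System.out.println(e.getKey() + " " + e.getValue());
        }
    }
}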
2. Code Examples
InvertedIndex.java
package pa4;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * @author Ming
 */
public class InvertedIndex {

    public static class TokenizerMapper extends Mapper<Text, ValuePair, Text, ValuePair> {

        @Override
        public void map(Text key, ValuePair value, Context context)
                throws IOException, InterruptedException {
            // TokenInputFormat has already generated (word, (fileID, wordPosition)),
            // so the mapper just lowercases the word and spills the pair to the reducer
            key.set(key.toString().toLowerCase());
            context.write(key, value);
        }
    }

    public static class IndexReducer extends Reducer<Text, ValuePair, Text, Text> {

        private Text postings = new Text();

        @Override
        public void reduce(Text key, Iterable<ValuePair> values, Context context)
                throws IOException, InterruptedException {
            String list = "";
            for (ValuePair val : values) {
                list += " " + val.toString();
            }
            postings.set(list);
            context.write(key, postings);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: InvertedIndex <in-dir> <out-dir>");
            System.exit(2);
        }

        // remove the old output dir
        FileSystem.get(conf).delete(new Path(otherArgs[1]), true);

        Job job = new Job(conf, "Inverted Indexer");
        job.setJarByClass(InvertedIndex.class);
        job.setInputFormatClass(TokenInputFormat.class);
        job.setMapperClass(InvertedIndex.TokenizerMapper.class);
        //job.setCombinerClass(InvertedIndex.IndexReducer.class);
        job.setReducerClass(InvertedIndex.IndexReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ValuePair.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
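Two small notes on the code above. IndexReducer emits Text values while the map output value type is ValuePair; a combiner must produce the map output types, which is presumably why the setCombinerClass line is commented out. Also, the reducer assembles the postings list by repeated string concatenation; a hedged, drop-in alternative using StringBuilder (same behavior and same imports as the file above, just fewer temporary strings) could look like this:

    public static class IndexReducer extends Reducer<Text, ValuePair, Text, Text> {

        private Text postings = new Text();

        @Override
        public void reduce(Text key, Iterable<ValuePair> values, Context context)
                throws IOException, InterruptedException {
            // accumulate " (fileID, position)" entries without creating a new String per iteration
            StringBuilder list = new StringBuilder();
            for (ValuePair val : values) {
                list.append(' ').append(val.toString());
            }
            postings.set(list.toString());
            context.write(key, postings);
        }
    }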
TokenInputFormat.java
package pa4;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

public class TokenInputFormat extends FileInputFormat<Text, ValuePair> {

    /**
     * Don't allow the files to be split!
     */
    @Override
    protected boolean isSplitable(JobContext ctx, Path filename) {
        // ensure the input files are not splittable!
        return false;
    }

    /**
     * Just return the record reader; the key is the doc number
     */
    public RecordReader<Text, ValuePair> createRecordReader(InputSplit split, TaskAttemptContext ctx)
            throws IOException, InterruptedException {
        return new TokenRecordReader();
    }

    public static class TokenRecordReader extends RecordReader<Text, ValuePair> {

        private long start;
        private long pos;
        private long end;
        private LineReader in;
        private int maxLineLength;
        private Text line;
        private Text key = null;
        private ValuePair value = null;
        private StringTokenizer tokens = null;
        private int tokenPos = 0;
        private String fileID = "0"; // input file id that appears in the inverted index

        public void initialize(InputSplit genericSplit, TaskAttemptContext context)
                throws IOException {
            FileSplit split = (FileSplit) genericSplit;
            Configuration job = context.getConfiguration();
            this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
            start = split.getStart();
            end = start + split.getLength();
            final Path file = split.getPath();

            // use the file name (e.g. T0) as the file ID that appears in the postings
            fileID = file.getName();

            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream fileIn = fs.open(split.getPath());
            in = new LineReader(fileIn, job);
            this.pos = start;
            line = new Text();
            key = new Text();
            value = new ValuePair();
        }

        public boolean nextKeyValue() throws IOException {
            boolean splitEnds = false;
            while (tokens == null || !tokens.hasMoreTokens()) {
                int lineSize = in.readLine(line, maxLineLength,
                        Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
                if (lineSize == 0) {
                    splitEnds = true;
                    break;
                }
                pos += lineSize;
                // split on whitespace and punctuation
                tokens = new StringTokenizer(line.toString(), " \t\n\r\f,.;<>-?/\\!'\":=*{}()$[]");
            }
            if (splitEnds) {
                key = null;
                value = null;
                line = null;
                tokens = null;
                return false;
            } else
                return true;
        }

        @Override
        public Text getCurrentKey() {
            // pulls the next token and advances the position counter
            key.set(tokens.nextToken());
            tokenPos++;
            return key;
        }

        @Override
        public ValuePair getCurrentValue() {
            value.set(fileID, tokenPos);
            return value;
        }

        /**
         * Get the progress within the split
         */
        public float getProgress() {
            if (start == end) {
                return 0.0f;
            } else {
                return Math.min(1.0f, (pos - start) / (float) (end - start));
            }
        }

        public synchronized void close() throws IOException {
            if (in != null) {
                in.close();
            }
        }
    }

    public static void main(String[] args) throws IOException {
        String fn = args[0];
        Configuration conf = new Configuration();
        FileSplit split = new FileSplit(new Path(fn), 0, 10000000, null);
        TokenRecordReader irr = new TokenRecordReader();
        TaskAttemptContext ctx = new TaskAttemptContext(conf,
                new TaskAttemptID("hello", 12, true, 12, 12));
        irr.initialize(split, ctx);
        while (irr.nextKeyValue()) {
            System.out.println(irr.getCurrentKey() + ": " + irr.getCurrentValue());
        }
    }
}
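Note how the reader works: nextKeyValue refills the tokenizer from the next line when needed, and getCurrentKey both pulls the next token and increments tokenPos, so positions are 1-based and keep counting across lines of the same file (the file is never split because isSplitable returns false). The following standalone sketch is plain Java with no Hadoop dependency (the class name TokenStreamDemo is made up); it mimics the tokenization above to show the record stream the reader would produce for T0:

import java.util.StringTokenizer;

public class TokenStreamDemo {
    public static void main(String[] args) {
        String fileID = "T0";
        String contents = "it is what it is";
        // same delimiter set as TokenRecordReader
        StringTokenizer tokens = new StringTokenizer(contents, " \t\n\r\f,.;<>-?/\\!'\":=*{}()$[]");
        int tokenPos = 0;
        while (tokens.hasMoreTokens()) {
            String word = tokens.nextToken();
            tokenPos++;  // 1-based position within the file
            // prints e.g.  it: (T0, 1)
            System.out.println(word + ": (" + fileID + ", " + tokenPos + ")");
        }
    }
}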
ValuePair.java
package pa4;

import java.io.*;

import org.apache.hadoop.io.*;

/**
 * A (fileID, position) pair used as the map output value.
 *
 * @author Ming
 */
public class ValuePair implements WritableComparable<ValuePair> {

    private Text one;
    private IntWritable two;

    public void set(Text first, IntWritable second) {
        one = first;
        two = second;
    }

    public void set(String first, int second) {
        one.set(first);
        two.set(second);
    }

    public ValuePair() {
        set(new Text(), new IntWritable());
    }

    public ValuePair(Text first, IntWritable second) {
        set(first, second);
    }

    public ValuePair(String first, int second) {
        // allocate the wrapped Writables here; calling set(String, int) directly
        // would dereference the still-null fields
        set(new Text(first), new IntWritable(second));
    }

    public Text getFirst() {
        return one;
    }

    public IntWritable getSecond() {
        return two;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        one.write(out);
        two.write(out);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        one.readFields(in);
        two.readFields(in);
    }

    @Override
    public int hashCode() {
        return one.hashCode();
    }

    @Override
    public boolean equals(Object o) {
        // note: only the Text part is compared
        if (o instanceof ValuePair) {
            ValuePair tp = (ValuePair) o;
            return one.equals(tp.one);
        }
        return false;
    }

    @Override
    public String toString() {
        return "(" + one + ", " + two + ")";
    }

    @Override
    public int compareTo(ValuePair tp) {
        int cmp = one.compareTo(tp.one);
        if (cmp != 0) {
            return cmp;
        }
        return two.compareTo(tp.two);
    }

    public static class Comparator extends WritableComparator {

        private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
        private static final IntWritable.Comparator INT_COMPARATOR = new IntWritable.Comparator();

        public Comparator() {
            super(ValuePair.class);
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            try {
                int oneL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
                int oneL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
                int cmp = TEXT_COMPARATOR.compare(b1, s1, oneL1, b2, s2, oneL2);
                if (cmp != 0) {
                    return cmp;
                }
                return INT_COMPARATOR.compare(b1, s1 + oneL1, l1 - oneL1, b2, s2 + oneL2, l2 - oneL2);
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            if (a instanceof ValuePair && b instanceof ValuePair) {
                return ((ValuePair) a).compareTo((ValuePair) b);
            }
            return super.compare(a, b);
        }
    }

    static {
        WritableComparator.define(ValuePair.class, new Comparator());
    }
}
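As a quick sanity check of the Writable contract that ValuePair implements, here is a small sketch (the class RoundTripDemo is hypothetical and not part of the job) that serializes a ValuePair to bytes and reads it back with readFields, which is essentially what Hadoop does when it shuffles map output:

package pa4;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class RoundTripDemo {
    public static void main(String[] args) throws Exception {
        ValuePair original = new ValuePair();
        original.set("T0", 3);

        // serialize: the Text (length + bytes) followed by the IntWritable
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // deserialize into a fresh instance
        ValuePair copy = new ValuePair();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy);                       // (T0, 3)
        System.out.println(original.compareTo(copy));   // 0
    }
}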
PS (2012-05-20):
The use of the key-value pair class ValuePair here reminds me of the HashMap implementation notes I wrote a few days ago: a HashMap implementation also relies on a key-value pair class, Entry. The two share the same idea, and when I have time I may go back and improve that HashMap write-up.
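For comparison, a minimal sketch of the kind of Entry node a hand-written HashMap keeps in each bucket (only the pair-holding part, with hashing and resizing omitted; the class name SimpleEntry is made up) might look like this:

public class SimpleEntry<K, V> {
    final K key;
    V value;
    SimpleEntry<K, V> next;   // next node in the same bucket's linked list

    SimpleEntry(K key, V value, SimpleEntry<K, V> next) {
        this.key = key;
        this.value = value;
        this.next = next;
    }

    public K getKey()   { return key; }
    public V getValue() { return value; }
}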