Distributed Indexing with Lucene on Hadoop MapReduce -- Test Main Class
This test code accompanies the earlier article "Distributed Indexing with Lucene on Hadoop MapReduce".
While working on an earlier task, I needed to look up several hundred thousand questionIDs and pull out the content associated with each of them. A simple sequential or binary search was not an option, so I designed QuestionIndexMR. Its main purpose is to quickly retrieve the value that corresponds to a given questionID (the design here is like using a file name to fetch the file's content; a traditional index lookup works the other way around ^_^), so the two should not be confused.
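To make that intended use concrete, here is a rough sketch of what the lookup side could look like once the index has been built and copied from HDFS to local disk. It is only an illustration: the field names "questionID" and "content" are placeholders, and the real names depend on how HDSDocumentOutput (from the previous article) writes the Lucene documents.

import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class QuestionLookup {
    /* Returns the content stored for a questionID, or null if it is not indexed.
     * "questionID" and "content" are hypothetical field names; substitute
     * whatever HDSDocumentOutput actually writes. */
    public static String lookup(String indexDir, String questionID) throws Exception {
        IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File(indexDir)));
        try {
            TopDocs hits = searcher.search(new TermQuery(new Term("questionID", questionID)), 1);
            if (hits.totalHits == 0) {
                return null;
            }
            Document doc = searcher.doc(hits.scoreDocs[0].doc);
            return doc.get("content");
        } finally {
            searcher.close();
        }
    }
}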
The source code of QuestionIndexMR is as follows:
package question.index;

import hdfs.document.HDFSDocument;
import hdfs.document.HDSDocumentOutput;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class QuestionIndexMR extends Configured
        implements Mapper<LongWritable, Text, Text, Text>,
                   Reducer<Text, Text, Text, HDFSDocument> {

    String charset = null;

    @Override
    public void configure(JobConf job) {
        // Keep a reference to the job configuration so map() can read the charset.
        setConf(job);
    }

    @Override
    public void close() throws IOException {
        // Nothing to clean up.
    }

    @Override
    public void map(LongWritable key, Text value,
                    OutputCollector<Text, Text> collector, Reporter reporter)
            throws IOException {
        charset = getConf().get("charset");

        // Each input line has the form "questionID<TAB>value1".
        String tempValue = new String(value.getBytes(), 0, value.getLength(), charset);
        String[] splitResu = tempValue.split("\t");
        Text questionID = new Text(splitResu[0]);
        collector.collect(questionID, new Text(splitResu[1]));
    }

    @Override
    public void reduce(Text key, Iterator<Text> values,
                       OutputCollector<Text, HDFSDocument> collector, Reporter reporter)
            throws IOException {
        // Wrap each (questionID, content) pair in an HDFSDocument so that
        // HDSDocumentOutput can turn it into a Lucene document.
        while (values.hasNext()) {
            HashMap<String, String> fields = new HashMap<String, String>();
            fields.put(key.toString(), values.next().toString());

            HDFSDocument doc = new HDFSDocument();
            doc.setFields(fields);
            collector.collect(key, doc);
        }
    }

    public void run() throws Exception {
        String questionInput = "/user/zhl/question_category_keywords";
        String questionOutput = "/user/zhl/question_luceneIndex";

        Configuration conf = new Configuration();
        conf.set("charset", "utf-8");

        JobConf job = new JobConf(conf, QuestionIndexMR.class);
        job.setJarByClass(QuestionIndexMR.class);
        job.setJobName("ProblemIndexer");

        FileInputFormat.addInputPath(job, new Path(questionInput));

        // Remove the output directory if it already exists.
        Path outpath = new Path(questionOutput);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outpath))
            fs.delete(outpath, true);
        FileOutputFormat.setOutputPath(job, outpath);

        job.setMapperClass(QuestionIndexMR.class);
        job.setReducerClass(QuestionIndexMR.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(HDFSDocument.class);
        job.setOutputFormat(HDSDocumentOutput.class);

        job.setNumMapTasks(45);
        job.setNumReduceTasks(1);

        JobClient.runJob(job);
    }
}
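The post does not show how run() is invoked; a minimal driver, assuming the class is packaged into the job jar, would simply be:

public static void main(String[] args) throws Exception {
    // Submit the indexing job using the paths hard-coded in run().
    new QuestionIndexMR().run();
}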
This was the initial solution. Later I found that as the amount of indexed content grew, retrieval slowed down dramatically.
The final solution was to follow MapReduce's streaming philosophy: whenever the content behind a set of questionIDs is needed, feed that content in as job input and do the selection in the map/reduce phases themselves.
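The post does not include that final code, but one possible realization of the idea, assuming the list of wanted questionIDs is shipped to every node through the DistributedCache, is a map-side filter like the following sketch (class name, cache path and input layout are hypothetical):

package question.index;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

/* Hypothetical mapper: keeps only the lines whose questionID appears in a
 * list of wanted IDs distributed to every node via the DistributedCache. */
public class QuestionFilterMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, Text> {

    private final Set<String> wantedIDs = new HashSet<String>();

    @Override
    public void configure(JobConf job) {
        try {
            // The ID list is registered on the client side with something like
            // DistributedCache.addCacheFile(new Path("/user/zhl/wanted_ids").toUri(), job);
            Path[] cached = DistributedCache.getLocalCacheFiles(job);
            BufferedReader reader = new BufferedReader(new FileReader(cached[0].toString()));
            String line;
            while ((line = reader.readLine()) != null) {
                wantedIDs.add(line.trim());
            }
            reader.close();
        } catch (IOException e) {
            throw new RuntimeException("Failed to load questionID list", e);
        }
    }

    @Override
    public void map(LongWritable key, Text value,
                    OutputCollector<Text, Text> collector, Reporter reporter)
            throws IOException {
        // Input lines look like "questionID<TAB>content", as in QuestionIndexMR.
        String[] parts = value.toString().split("\t", 2);
        if (parts.length == 2 && wantedIDs.contains(parts[0])) {
            collector.collect(new Text(parts[0]), new Text(parts[1]));
        }
    }
}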