MapReduce Case 5: Inverted Index

1. Sample Data

There are three files: a.txt, b.txt, and c.txt. Each file contains a number of words.

Contents of a.txt:

I Love Hadoop
he like ZhouSiYuan
I love me

Contents of b.txt:

I Love MapReduce
he like NBA
I love Hadoop

Contents of c.txt:

I Love MapReduce
I love me
I Love Hadoop

2. Requirement

  • Build a search index so that documents can be looked up by word.

3. Analysis

  • 1. First job: for each file, count how many times each word appears and append the file name to the word, producing records such as I--a.txt 2 and I--b.txt 2.

  • 2. Second job: for each word, gather all the file names together with the word's count in each file, producing records such as I a.txt-->2 b.txt-->2 c.txt-->3 (a sketch of this data flow follows the list).
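For the word I in the sample data, the two jobs would roughly produce the following (a sketch only; the exact ordering depends on how Hadoop sorts keys and delivers values):

Output of the first job (excerpt):

I--a.txt    2
I--b.txt    2
I--c.txt    3

Output of the second job (excerpt):

I    a.txt-->2    b.txt-->2    c.txt-->3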

4. Implementation

  • 1. Mapper of the first job, the OneIndexMapper class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class OneIndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get the name of the file this input split comes from
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String name = inputSplit.getPath().getName();

        // 2. Read one line
        String line = value.toString();

        // 3. Split the line into words
        String[] words = line.split(" ");

        // 4. Tie each word to the file name and emit it
        for (String word : words) {
            k.set(word + "--" + name);
            
            context.write(k, new IntWritable(1));
        }
    }
}
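Two small optional refinements (not part of the original code) are to resolve the file name once per task in setup(), since it cannot change within a split, and to reuse a single IntWritable instead of allocating a new one for every word. A minimal sketch under those assumptions (the class name OneIndexMapperCached is made up for illustration):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class OneIndexMapperCached extends Mapper<LongWritable, Text, Text, IntWritable> {

    private String name;                                 // file name, resolved once per map task
    private final Text k = new Text();
    private final IntWritable one = new IntWritable(1);  // reused instead of allocating per word

    @Override
    protected void setup(Context context) {
        name = ((FileSplit) context.getInputSplit()).getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        for (String word : value.toString().split(" ")) {
            k.set(word + "--" + name);
            context.write(k, one);
        }
    }
}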
  • 2. Reducer of the first job, the OneIndexReducer class:
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class OneIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
    
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        
        int count = 0;
        // Sum up the counts for this word--filename key
        for(IntWritable value: values){
            count +=value.get();
        }
        
        // Emit the total count
        context.write(key, new IntWritable(count));
    }
}

  • 3. Driver of the first job, the OneIndexDriver class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OneIndexDriver {

    public static void main(String[] args) throws Exception {

        // The data directory contains the three files a.txt, b.txt, and c.txt
        args = new String[]{"D:\\大数据API\\data","D:\\大数据API\\data1"};

        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(OneIndexDriver.class);

        job.setMapperClass(OneIndexMapper.class);
        job.setReducerClass(OneIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.waitForCompletion(true);
    }
}
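Since the first job only sums integers, OneIndexReducer could optionally double as a combiner to cut down shuffle traffic; a single extra line in the driver above would be enough (an optional tweak, not part of the original post):

        job.setCombinerClass(OneIndexReducer.class);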
  • 4. Mapper of the second job, the TwoIndexMapper class:
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TwoIndexMapper extends Mapper<LongWritable, Text, Text, Text>{
    Text k = new Text();
    Text v = new Text();
    
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        
        // 1. Read one line of the first job's output
        String line = value.toString();
        
        // 2. Split on "--": fields[0] is the word, fields[1] is "filename<TAB>count"
        String[] fields = line.split("--");
        
        k.set(fields[0]);
        v.set(fields[1]);
        
        // 3. Emit the word as key and "filename<TAB>count" as value
        context.write(k, v);
    }
}
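The first job's default TextOutputFormat separates key and value with a tab, so each line read here looks like I--a.txt<TAB>2; splitting on "--" therefore leaves the file name and the count together in fields[1]. A standalone sketch of that step (the class name SplitDemo is made up for illustration):

import java.util.Arrays;

public class SplitDemo {
    public static void main(String[] args) {
        // One line as written by the first job: key "I--a.txt", a tab, then the count
        String line = "I--a.txt\t2";
        String[] fields = line.split("--");
        // Prints: [I, a.txt<TAB>2] -- fields[1] still contains the tab and the count
        System.out.println(Arrays.toString(fields));
    }
}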
  • 5. Reducer of the second job, the TwoIndexReducer class:
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TwoIndexReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // First job's output for the word "I":
        //   I--a.txt    2
        //   I--b.txt    2
        //   I--c.txt    3
        // which this reducer turns into: I    a.txt-->2    b.txt-->2    c.txt-->3

        StringBuilder sb = new StringBuilder();

        for (Text value : values) {
            sb.append(value.toString().replace("\t", "-->") + "\t");
        }
        
        context.write(key, new Text(sb.toString()));
    }
}
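The remaining work is pure string formatting: each incoming value such as a.txt<TAB>2 has its tab replaced by --> and is appended to one output line per word. That step can be tried outside of Hadoop (a standalone sketch; the class name JoinDemo is made up, and the value order shown is not guaranteed by MapReduce):

public class JoinDemo {
    public static void main(String[] args) {
        // What the reducer might receive for the key "I"
        String[] values = {"a.txt\t2", "b.txt\t2", "c.txt\t3"};
        StringBuilder sb = new StringBuilder();
        for (String value : values) {
            sb.append(value.replace("\t", "-->")).append("\t");
        }
        // Prints: a.txt-->2    b.txt-->2    c.txt-->3
        System.out.println(sb);
    }
}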
  • 6. Driver of the second job, the TwoIndexDriver class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TwoIndexDriver {

    public static void main(String[] args) throws Exception {

        // data1 is the output directory of the first job; data2 will hold the final index
        args = new String[]{"D:\\大数据API\\data1","D:\\大数据API\\data2"};

        Configuration config = new Configuration();
        Job job = Job.getInstance(config);
        job.setJarByClass(TwoIndexDriver.class);

        job.setMapperClass(TwoIndexMapper.class);
        job.setReducerClass(TwoIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
  • Results

First MapReduce job: one output line per word--filename key with its count (e.g. I--a.txt 2).

Second MapReduce job: the final inverted index, one line per word.
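Derived from the sample data, the final output should look roughly like this (Love and love remain separate keys because the mapper does not lowercase words, and the order of entries on a line is not guaranteed):

Hadoop      a.txt-->1    b.txt-->1    c.txt-->1
I           a.txt-->2    b.txt-->2    c.txt-->3
Love        a.txt-->1    b.txt-->1    c.txt-->2
MapReduce   b.txt-->1    c.txt-->1
NBA         b.txt-->1
ZhouSiYuan  a.txt-->1
he          a.txt-->1    b.txt-->1
like        a.txt-->1    b.txt-->1
love        a.txt-->1    b.txt-->1    c.txt-->1
me          a.txt-->1    c.txt-->1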

posted @ 2020-02-05 17:44 落花桂