Hadoop基础---倒排索引实现

一:实验说明

用于统计各个单词在各个文件中出现的次数,并按出现次数从高到低输出

(一)实验数据

a.txt

hello kitty flink
hello tom spark with
hello mark spark
hadoop hadoop hadoop

b.txt

hello tom tom hadoop
tom is playing with mark
flink vs spark to hadoop
hadoop

c.txt

kitty want to learn hadoop
hadoop spark flink
cuda hello vs
hello vs flink

(二)实验结果

举例单词:hello

在各个文件出现次数:

hello--->a.txt  3
hello--->b.txt  1
hello--->c.txt  2

结果输出形式:

hello   a.txt--->3 c.txt--->2 b.txt--->1

(三)实验思路

首先统计原始数据,输出单词--->单词所在文件 单词在该文件出现次数,例如hello--->a.txt  3

然后按单词汇总,把该单词在各个文件中的出现次数按从高到低排序,合并成一行输出,例如:hello   a.txt--->3 c.txt--->2 b.txt--->1

二:代码实现

(一)统计原始数据,输出单词--->单词所在文件 单词在该文件出现次数

package cn.hadoop.ri;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step 1 of the inverted index: counts how many times each word occurs in each input file.
 *
 * <p>Each output record has the key {@code word--->fileName} and, as its value, the number of
 * times that word occurs in that file.
 */
public class ReverseIndex {

    /**
     * Emits {@code (word--->fileName, 1)} for every space-separated token of an input line.
     */
    public static class ReverseIndexMapper extends Mapper<LongWritable, Text, Text, LongWritable>{
        /** Constant count attached to every emitted token. */
        private static final LongWritable ONE = new LongWritable(1);

        // A single key object is reused for all records instead of allocating one Text per
        // token, following the pattern used by the Hadoop WordCount example.
        private final Text outKey = new Text();

        /**
         * @param key     input key supplied by the input format (not used here)
         * @param value   one line of input text
         * @param context used to look up the current input split and to emit records
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // Name of the file this line was read from, taken from the current input split.
            FileSplit inputSplit = (FileSplit)context.getInputSplit();
            String file = inputSplit.getPath().getName();

            // Tokens are separated by spaces only, exactly as before.
            for(String word : StringUtils.split(value.toString(), " ")) {
                outKey.set(word+"--->"+file);
                context.write(outKey, ONE);
            }
        }
    }

    /**
     * Sums the counts received for one {@code word--->fileName} key.
     *
     * <p>Input and output types are identical, so the class is also registered as the job's
     * combiner in {@link #main(String[])}.
     */
    public static class ReverseIndexReducer extends Reducer<Text, LongWritable, Text, LongWritable>{
        /** Reused output value holding the sum for the current key. */
        private final LongWritable total = new LongWritable();

        /**
         * @param key     a {@code word--->fileName} key
         * @param values  the partial counts emitted for that key
         * @param context used to emit {@code (key, sum of values)}
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            long count = 0;
            for(LongWritable value: values) {
                count += value.get();
            }
            total.set(count);
            context.write(key, total);
        }
    }

    /**
     * Configures and runs the job, then exits with 0 on success or 1 on failure.
     *
     * <p>An already existing output directory is deleted recursively before the job starts.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if(args.length < 2) {
            System.err.println("Usage: ReverseIndex <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(ReverseIndex.class);

        job.setMapperClass(ReverseIndexMapper.class);
        // Summation is associative and commutative, so pre-aggregating on the map side
        // leaves the final counts unchanged.
        job.setCombinerClass(ReverseIndexReducer.class);
        job.setReducerClass(ReverseIndexReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Remove a stale output directory before the job starts.
        Path output = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if(fs.exists(output)) {
            fs.delete(output, true);    // recursive delete
        }

        FileOutputFormat.setOutputPath(job, output);

        System.exit(job.waitForCompletion(true)?0:1);
    }
}

实验结果输出:

hadoop jar ri.jar cn.hadoop.ri.ReverseIndex /wc/ri /wc/ro1 
[hadoop@hadoopH1 Hadoop]$ hadoop fs -cat /wc/ro1/part-r-00000
cuda--->c.txt   1
flink--->a.txt  1
flink--->b.txt  1
flink--->c.txt  2
hadoop--->a.txt 3
hadoop--->b.txt 3
hadoop--->c.txt 2
hello--->a.txt  3
hello--->b.txt  1
hello--->c.txt  2
is--->b.txt     1
kitty--->a.txt  1
kitty--->c.txt  1
learn--->c.txt  1
mark--->a.txt   1
mark--->b.txt   1
playing--->b.txt        1
spark--->a.txt  2
spark--->b.txt  1
spark--->c.txt  1
to--->b.txt     1
to--->c.txt     1
tom--->a.txt    1
tom--->b.txt    3
vs--->b.txt     1
vs--->c.txt     2
want--->c.txt   1
with--->a.txt   1
with--->b.txt   1

(二)对文件出现次数进行排序,统一输出

package cn.hadoop.ri;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import cn.hadoop.ri.ReverseIndex.ReverseIndexMapper;
import cn.hadoop.ri.ReverseIndex.ReverseIndexReducer;

/**
 * Step 2 of the inverted index: groups the step-1 records by word and lists, for every word,
 * the files it occurs in ordered by occurrence count from highest to lowest.
 *
 * <p>Input lines have the form {@code word--->fileName<TAB>count}; each output record has the
 * word as key and {@code fileName--->count} entries separated by single spaces as value.
 */
public class ReverseIndexStep2 {

    /** Literal separator between word and file name (input) and file name and count (output). */
    private static final String ARROW = "--->";

    /**
     * Turns one {@code word--->fileName<TAB>count} line into {@code (word, fileName--->count)}.
     */
    public static class ReverseIndexMapper extends Mapper<LongWritable, Text, Text, Text>{
        /** Reused output key (the word). */
        private final Text outKey = new Text();
        /** Reused output value ({@code fileName--->count}). */
        private final Text outValue = new Text();

        /**
         * @param key     input key supplied by the input format (not used here)
         * @param value   one line of step-1 output
         * @param context used to emit the re-keyed record
         * @throws IOException if the line does not have the expected layout
         */
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            if(line.isEmpty()) {
                return;     // nothing to index on an empty line
            }

            // The count follows the last tab. The arrow is searched as a literal string:
            // StringUtils.split(line, "--->") would treat '-' and '>' as individual separator
            // characters and tear apart hyphenated words and file names.
            int tab = line.lastIndexOf('\t');
            String wordAndFile = tab < 0 ? "" : line.substring(0, tab);
            // NOTE(review): the last arrow is used, assuming a file name is less likely to
            // contain "--->" than a word; the format is ambiguous if both can contain it.
            int arrow = wordAndFile.lastIndexOf(ARROW);
            if(arrow < 0) {
                throw new IOException("Malformed record, expected word--->file<TAB>count: " + line);
            }

            String cont = wordAndFile.substring(0, arrow);
            String fn = wordAndFile.substring(arrow + ARROW.length());
            long ct;
            try {
                ct = Long.parseLong(line.substring(tab + 1));
            } catch(NumberFormatException e) {
                throw new IOException("Malformed count in record: " + line, e);
            }

            outKey.set(cont);
            outValue.set(fn+ARROW+ct);
            context.write(outKey, outValue);
        }
    }


    /**
     * Collects all {@code fileName--->count} entries of a word and writes them on one line,
     * ordered by count from highest to lowest.
     */
    public static class ReverseIndexReducer extends Reducer<Text, Text, Text, Text>{
        /** Orders entries by count, highest first, by delegating to {@link #compare}. */
        private final Comparator<String> byCountDescending = new Comparator<String>() {
            @Override
            public int compare(String left, String right) {
                // Qualified call: an unqualified compare(...) would recurse into this method.
                return ReverseIndexReducer.this.compare(right, left);
            }
        };

        /** Reused output value holding the joined entries. */
        private final Text outValue = new Text();

        /**
         * Extracts the count that follows the last {@code --->} of an entry.
         *
         * @throws IllegalArgumentException if the entry contains no {@code --->}
         * @throws NumberFormatException    if the text after the arrow is not a number
         */
        private static long countOf(String entry) {
            int arrow = entry.lastIndexOf(ARROW);
            if(arrow < 0) {
                throw new IllegalArgumentException("Entry has no count: " + entry);
            }
            return Long.parseLong(entry.substring(arrow + ARROW.length()));
        }

        /**
         * Compares two {@code fileName--->count} entries by their counts.
         *
         * @return a negative value, zero or a positive value when the count of {@code str1}
         *         is smaller than, equal to or greater than the count of {@code str2}
         */
        protected int compare(String str1,String str2) {
            long c1 = countOf(str1);
            long c2 = countOf(str2);
            return c1 < c2 ? -1 : (c1 > c2 ? 1 : 0);
        }


        /**
         * @param key     the word
         * @param values  the {@code fileName--->count} entries of that word
         * @param context used to emit {@code (word, entries joined by single spaces)}
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values,
                Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // Keep each entry as its own list element; joining and re-splitting on spaces
            // would break file names that contain a space.
            List<String> entries = new ArrayList<String>();
            for(Text value:values) {
                entries.add(value.toString());
            }

            // Collections.sort is stable, so entries with equal counts keep the order in
            // which they were received.
            Collections.sort(entries, byCountDescending);

            StringBuilder joined = new StringBuilder();
            for(String entry:entries) {
                if(joined.length() > 0) {
                    joined.append(' ');
                }
                joined.append(entry);
            }

            outValue.set(joined.toString());
            context.write(key, outValue);
        }
    }

    /**
     * Configures and runs the job, then exits with 0 on success or 1 on failure.
     *
     * <p>An already existing output directory is deleted recursively before the job starts.
     *
     * @param args {@code args[0]} is the step-1 output path, {@code args[1]} the output path
     */
    public static void main(String[]  args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if(args.length < 2) {
            System.err.println("Usage: ReverseIndexStep2 <step-1 output path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(ReverseIndexStep2.class);

        // Qualified so that they cannot be confused with the step-1 classes of the same name.
        job.setMapperClass(ReverseIndexStep2.ReverseIndexMapper.class);
        job.setReducerClass(ReverseIndexStep2.ReverseIndexReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        // Remove a stale output directory before the job starts.
        Path output = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if(fs.exists(output)) {
            fs.delete(output, true);    // recursive delete
        }

        FileOutputFormat.setOutputPath(job, output);

        System.exit(job.waitForCompletion(true)?0:1);

    }
}

实验结果输出:

hadoop jar ri.jar cn.hadoop.ri.ReverseIndexStep2 /wc/ro1 /wc/ro2
[hadoop@hadoopH1 Hadoop]$ hadoop fs -cat /wc/ro2/part-r-00000                             
cuda    c.txt--->1 
flink   c.txt--->2 b.txt--->1 a.txt--->1 
hadoop  a.txt--->3 b.txt--->3 c.txt--->2 
hello   a.txt--->3 c.txt--->2 b.txt--->1 
is      b.txt--->1 
kitty   a.txt--->1 c.txt--->1 
learn   c.txt--->1 
mark    a.txt--->1 b.txt--->1 
playing b.txt--->1 
spark   a.txt--->2 b.txt--->1 c.txt--->1 
to      b.txt--->1 c.txt--->1 
tom     b.txt--->3 a.txt--->1 
vs      c.txt--->2 b.txt--->1 
want    c.txt--->1 
with    a.txt--->1 b.txt--->1

 

posted @ 2020-02-25 10:18  山上有风景  阅读(414)  评论(0)  编辑  收藏  举报