Analyzing per-author article counts and total reads on Baidu Baijia with MapReduce

The task is to read the data from a file, remove duplicate records, compute the number of articles and the total read count for each author, and write the results to another file.

First, create a FlowBean class:

package flow;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean>{
    
    
    private String userName;
    private long numerRead;
    private long numberArticle;
    
    //During deserialization the bean is instantiated via reflection, which needs a no-arg constructor, so one is defined explicitly
    public FlowBean(){}
    
    //A constructor with parameters is added for convenient initialization
    public FlowBean(String userName, long numerRead, long numberArticle) {
        this.userName = userName;
        this.numerRead = numerRead;
        this.numberArticle = numberArticle;
    }

    public String getUserName() {
        return userName;
    }

    public void setUserName(String userName) {
        this.userName = userName;
    }

    public long getNumerRead() {
        return numerRead;
    }

    public void setNumerRead(long numerRead) {
        this.numerRead = numerRead;
    }

    public long getNumberArticle() {
        return numberArticle;
    }

    public void setNumberArticle(long numberArticle) {
        this.numberArticle = numberArticle;
    }


    
    
    //Serialize the object's fields to the output stream
    @Override
    public void write(DataOutput out) throws IOException {

        out.writeUTF(userName);
        out.writeLong(numerRead);
        out.writeLong(numberArticle);
        
    }

    
    //Deserialize the object's fields from the input stream
    //Fields must be read in exactly the same order in which they were written
    @Override
    public void readFields(DataInput in) throws IOException {

        userName = in.readUTF();
        numerRead = in.readLong();
        numberArticle = in.readLong();
        
    }
    @Override
    public String toString() {
        return userName + "\t" + numerRead + "\t" + numberArticle;
    }

    //Sort by read count in descending order; Long.compare also returns 0 for equal values, which keeps the compareTo contract consistent
    @Override
    public int compareTo(FlowBean o) {
        return Long.compare(o.getNumerRead(), numerRead);
    }
    

}
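
Before wiring the bean into a job, it can help to check that write and readFields mirror each other. The following is a minimal round-trip sketch (the class name FlowBeanRoundTrip and the sample values are made up for illustration and are not part of the original code):

package flow;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

//Hypothetical helper: serializes a FlowBean to a byte buffer and reads it back
public class FlowBeanRoundTrip {

    public static void main(String[] args) throws Exception {
        FlowBean original = new FlowBean("someAuthor", 12345L, 7L);

        //Serialize the bean the same way Hadoop would, through write(DataOutput)
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        //Deserialize into a fresh bean through readFields(DataInput)
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        //Both lines should print the same userName, read count, and article count
        System.out.println(original);
        System.out.println(copy);
    }
}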

Then create the Mapper, Reducer, and driver classes:

package flow;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class FlowSumRunner extends Configured implements Tool{

     //Dedup stage: emit each whole input line as the key; identical lines collapse into a single key in the reducer
     public static class RemoveDupMapper
     extends Mapper<Object, Text, Text, NullWritable> {

     public void map(Object key, Text value, Context context)
         throws IOException, InterruptedException {
         context.write(value, NullWritable.get());
     }
     }

     //Dedup stage: each distinct line arrives once as a key; write the key and drop the values
     public static class RemoveDupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
         public void reduce(Text key, Iterable<NullWritable> values, Context context)
         throws IOException, InterruptedException {
     context.write(key, NullWritable.get());
     //System.out.println("reduce: key=" + key);
         }
     }
     
     /**
      * FlowBean is a custom data type. To be transferred between Hadoop nodes it has to follow
      * Hadoop's serialization mechanism, i.e. implement the corresponding Writable interface.
      * @author duanhaitao@itcast.cn
      *
      */
     public static class FlowSumMapper extends Mapper<LongWritable, Text, Text, FlowBean>{

         @Override
         protected void map(LongWritable key, Text value,Context context)
                 throws IOException, InterruptedException {

             //take one line of input
             String line = value.toString();
             //split the line into its fields (delimited here by two spaces)
             String[] fields = StringUtils.split(line, "  ");
             
             //pick out the fields we need: field 1 is the author name, field 2 is the read count
             String username = fields[1];
             long userRead = Long.parseLong(fields[2]);
         
             //wrap the data into a key-value pair and emit it (article count is 1 for each record)
             context.write(new Text(username), new FlowBean(username,userRead,1));
             System.out.println(username + " " + userRead );
         }
         
         
     }


    public static class FlowSumReducer extends Reducer<Text, FlowBean, FlowBean, Text>{
    
    //Aggregation stage: sum the read counts and article counts for each author
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values,Context context)
            throws IOException, InterruptedException {

        long numberRead_counter = 0;
        long numberArticle_counter = 0;
        
        for(FlowBean bean : values){
            
            numberRead_counter += bean.getNumerRead();
            numberArticle_counter += bean.getNumberArticle();
            
        }
        
        
        context.write(new FlowBean(key.toString(), numberRead_counter, numberArticle_counter), new Text());
    }

}

    
    @Override
    public int run(String[] args) throws Exception {
        
        Configuration conf = getConf();
        
        //Job 1: remove duplicate records
        Job job = Job.getInstance(conf, "RemoveDup");
        job.setJarByClass(FlowSumRunner.class);
        job.setMapperClass(RemoveDupMapper.class);
        job.setCombinerClass(RemoveDupReducer.class);
        job.setReducerClass(RemoveDupReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("/Users/lihu/Desktop/crawle/tap.txt"));
        FileOutputFormat.setOutputPath(job, new Path("/Users/lihu/Desktop/crawle/quchong"));
        
        //Job 2: sum up the article count and total read count per author
        Job job1 = Job.getInstance(conf);
        
        job1.setJarByClass(FlowSumRunner.class);
        
        job1.setMapperClass(FlowSumMapper.class);
        job1.setReducerClass(FlowSumReducer.class);
        
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(FlowBean.class);
        
        //FlowSumReducer emits a FlowBean key and a Text value, so declare the output types accordingly
        job1.setOutputKeyClass(FlowBean.class);
        job1.setOutputValueClass(Text.class);
        
        FileInputFormat.setInputPaths(job1, new Path("/Users/lihu/Desktop/crawle/quchong"));
        FileOutputFormat.setOutputPath(job1, new Path("/Users/lihu/Desktop/crawle/logs"));
        //Submit the two jobs in sequence and wait for them to finish;
        //the summing job only runs if the deduplication job succeeds
        if (job.waitForCompletion(true)) {
            return job1.waitForCompletion(true) ? 0 : 1;
        }
        
        return 1;
    }
    
    
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new FlowSumRunner(), args);
        System.exit(res);
    }

}
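
With the classes above packaged into a jar (the jar name below is just an example), the chained jobs can be launched through ToolRunner in the usual way:

hadoop jar flow.jar flow.FlowSumRunner

The first job writes the deduplicated records to the quchong directory, and the second job reads that directory and writes one line per author in the format produced by FlowBean.toString(): user name, total reads, and article count separated by tabs. Since the input and output paths are hard-coded in run(), both output directories must not already exist, or FileOutputFormat will abort the job.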

 
