MapReduce算法形式一:WordCount
MapReduce算法形式一:WordCount
这种形式可以用来统计网站的登录次数、某个电商网站的商品销量等等,本质上就是求和。不过求和之前一定要先把数据清洗好,以免缺失值太多,影响结果的真实性。
废话不多说,上代码吧,我把注释一行行的都写了~~可可可可~
先封装了数据行的对象:
/**
 * One record of the search log, holding the six tab-separated columns of a line:
 * time, user id, keyword, rank, order and URL (in that column order).
 */
public class Log {
    /** Number of tab-separated columns a well-formed record must have. */
    private static final int FIELD_COUNT = 6;

    private String time;
    private String UID;
    private String keyWord;
    private int rank;
    private int order;
    private String URL;

    /** Creates an empty record: string fields are {@code null}, int fields are 0. */
    public Log() {
        super();
    }

    /**
     * Creates a fully populated record.
     *
     * @param time    first column of the record
     * @param uID     second column of the record
     * @param keyWord third column of the record
     * @param rank    fourth column of the record
     * @param order   fifth column of the record
     * @param uRL     sixth column of the record
     */
    public Log(String time, String uID, String keyWord, int rank, int order, String uRL) {
        super();
        this.time = time;
        this.UID = uID;
        this.keyWord = keyWord;
        this.rank = rank;
        this.order = order;
        this.URL = uRL;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getUID() {
        return UID;
    }

    public void setUID(String uID) {
        UID = uID;
    }

    public String getKeyWord() {
        return keyWord;
    }

    public void setKeyWord(String keyWord) {
        this.keyWord = keyWord;
    }

    public int getRank() {
        return rank;
    }

    public void setRank(int rank) {
        this.rank = rank;
    }

    public int getOrder() {
        return order;
    }

    public void setOrder(int order) {
        this.order = order;
    }

    public String getURL() {
        return URL;
    }

    public void setURL(String uRL) {
        URL = uRL;
    }

    /**
     * Parses one tab-separated log line into a {@code Log}.
     *
     * <p>A malformed line never throws: if {@code value} is {@code null}, does not
     * split into exactly six columns, or has a non-integer rank/order column, an
     * empty {@code Log} (all string fields {@code null}) is returned so the caller
     * can skip the record.
     *
     * @param value the raw log line
     * @return a populated record, or an empty one when the line is malformed
     */
    public static Log getInfo(String value) {
        Log log = new Log();
        if (value == null) {
            return log;
        }

        // trim() also strips leading/trailing tabs and split() drops trailing empty
        // columns, so a line whose first or last column is empty yields fewer than
        // six fields and is treated as malformed.
        String[] fields = value.trim().split("\t");
        if (fields.length != FIELD_COUNT) {
            return log;
        }

        // Parse the numeric columns before touching the record so that a bad
        // number cannot leave it half-populated.
        int rank;
        int order;
        try {
            rank = Integer.parseInt(fields[3].trim());
            order = Integer.parseInt(fields[4].trim());
        } catch (NumberFormatException ignored) {
            // Non-numeric rank/order: same treatment as a wrong column count.
            return log;
        }

        log.setTime(fields[0].trim());
        log.setUID(fields[1].trim());
        log.setKeyWord(fields[2].trim());
        log.setRank(rank);
        log.setOrder(order);
        log.setURL(fields[5].trim());
        return log;
    }
}
MapReduce 作业的代码:
public class PVSum {
/**案例一:WordCount
*
* 非空查询条数
* 不去重,直接统计总和即可
*
* 假设:
* 日志格式如下:(已经过清洗,以制表符分割)
* 20111230050630 时间time
* 2a12e06f50ad41063ed2b62bffac29ad 用户UID
* 361泰国电影 搜索的关键词keyword
* 5 rank搜索结果排序
* 8 order点击次数
* http://www.57ge.com/play/?play_2371_1_361.html 访问的URL
*
* @param args
* @throws Exception
*/
public static void main(String[] path) throws Exception {
if(path.length != 2){
System.out.println("please input full path!");
System.exit(0);
}
Job job = Job.getInstance(new Configuration(), PVSum.class.getSimpleName());
job.setJarByClass(PVSum.class);
FileInputFormat.setInputPaths(job, new Path(path[0]));
FileOutputFormat.setOutputPath(job, new Path(path[1]));
job.setMapperClass(PVSumMap.class);
job.setReducerClass(PVSumReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.waitForCompletion(true);
}
public static class PVSumMap extends Mapper<LongWritable, Text, Text, IntWritable> {
IntWritable one = new IntWritable(1);//记录数量,一条记录即为1
Text text = new Text("非空关键词的PV访问量总计:");
protected void map(LongWritable key, Text value,org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws java.io.IOException, InterruptedException {
//获取每条记录的对象
Log log = Log.getInfo(value.toString().trim());
//判断关键字是否为空
if(log.getKeyWord().trim() != null && !log.getKeyWord().trim().equals("")){
//写入数据
context.write(text, one);
//map : <非空关键词的PV访问量总计:, 1>
}
};
}
//shuffle : <非空关键词的PV访问量总计:, {1, 1, 1...}>
public static class PVSumReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
protected void reduce(Text key, java.lang.Iterable<IntWritable> values,
org.apache.hadoop.mapreduce.Reducer<Text, IntWritable, Text, IntWritable>.Context context)
throws java.io.IOException, InterruptedException {
int sum = 0;//记录总条数
for (IntWritable count : values) {
sum += count.get();
}
context.write(key, new IntWritable(sum));
};
}
}