Implementing Hive's join with a hand-written MapReduce job

Hive ships with join syntax built in, but implementing the join ourselves as a MapReduce program is a good way to consolidate our understanding of MR.

For example:

There is a user table with three fields: userId, userName, address.

There is an order table with four fields: userId, orderId, categoryId, price.

User records:

100000001 a addressa
100000002 b addressb
100000003 c addressc

Order records:

100000001 101 g1 100
100000001 102 g2 200
100000003 103 g3 300
100000002 201 g1 100
100000002 202 g2 200

Now implement the join of the user table and the order table.

//Every line the MR program reads is just a string; by itself it cannot tell whether it is a user record or an order record. We define a class that tags each line with a label identifying which kind of record it is.

package com.rabbit.hadoop.hive.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class DataJoinWritable implements WritableComparable<DataJoinWritable> {

    private String tag;  // label marking the record type: "customer" or "order"
    private String info; // the user or order record itself

    // Hadoop instantiates Writables reflectively, so a no-arg constructor is required.
    public DataJoinWritable() {
    }

    public DataJoinWritable(String tag, String info) {
        set(tag, info);
    }

    public void set(String tag, String info) {
        this.tag = tag;
        this.info = info;
    }

    public String getTag() {
        return tag;
    }

    public void setTag(String tag) {
        this.tag = tag;
    }

    public String getInfo() {
        return info;
    }

    public void setInfo(String info) {
        this.info = info;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(tag);
        out.writeUTF(info);
    }

    public void readFields(DataInput in) throws IOException {
        tag = in.readUTF();
        info = in.readUTF();
    }

    // This class is only ever used as a map output value, so Hadoop never sorts by it;
    // comparing tag first and then info still gives a consistent ordering that
    // honors the WritableComparable contract.
    public int compareTo(DataJoinWritable o) {
        int cmp = tag.compareTo(o.tag);
        return cmp != 0 ? cmp : info.compareTo(o.info);
    }

    @Override
    public String toString() {
        return tag + "," + info;
    }

}
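As a quick sanity check, the write/readFields pair can be exercised locally with plain Java streams, which is essentially what Hadoop does when shuffling values between map and reduce. This is a minimal sketch, not part of the job itself; the class name DataJoinWritableCheck is made up for illustration.

package com.rabbit.hadoop.hive.join;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class DataJoinWritableCheck {

    public static void main(String[] args) throws IOException {
        DataJoinWritable original = new DataJoinWritable("customer", "100000001,a,addressa");

        // Serialize into an in-memory buffer.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // Deserialize into a fresh instance and confirm the round trip.
        DataJoinWritable copy = new DataJoinWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        System.out.println(copy); // prints: customer,100000001,a,addressa
    }
}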

//Mapper

package com.rabbit.hadoop.hive.join;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


public class DataJoinMapper extends Mapper<LongWritable, Text, LongWritable, DataJoinWritable> {

    /**
     * Input records, both keyed by the same first field:
     *   customer: cid cname address
     *   order:    cid oid gname price
     */
    private LongWritable outputKey = new LongWritable();
    private DataJoinWritable outputValue = new DataJoinWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String lineValue = value.toString();
        String[] fields = lineValue.split(" ");

        // Distinguish the two record types simply by field count: a user record has
        // three fields, an order record has four. Anything else is malformed; drop the line.
        if (fields.length != 3 && fields.length != 4) {
            return;
        }

        // For both record types the first field is the userId, which becomes the join key.
        long cid = Long.parseLong(fields[0]);
        outputKey.set(cid);

        // Three fields: a user record. Tag it "customer" and keep the whole record.
        if (fields.length == 3) {
            outputValue.set("customer", fields[0] + "," + fields[1] + "," + fields[2]);
        }

        // Four fields: an order record. Tag it "order" and keep the whole record.
        if (fields.length == 4) {
            outputValue.set("order", fields[0] + "," + fields[1] + "," + fields[2] + "," + fields[3]);
        }

        // Emit one record: key is the userId, value is the tagged user or order record.
        context.write(outputKey, outputValue);
    }

}
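With the sample data above, the mapper's output (key → value, using DataJoinWritable's toString form) looks like this; the shuffle then groups the records by key, so each userId's customer record and order records arrive together at the same reduce call:

100000001 → customer,100000001,a,addressa
100000001 → order,100000001,101,g1,100
100000001 → order,100000001,102,g2,200
100000002 → customer,100000002,b,addressb
100000002 → order,100000002,201,g1,100
100000002 → order,100000002,202,g2,200
100000003 → customer,100000003,c,addressc
100000003 → order,100000003,103,g3,300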

//Reducer

package com.rabbit.hadoop.hive.join;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

 

public class DataJoinReducer extends Reducer<LongWritable, DataJoinWritable, NullWritable, Text> {

    private Text outputValue = new Text();

    @Override
    protected void reduce(LongWritable key, Iterable<DataJoinWritable> values, Context context) throws IOException, InterruptedException {

        String customerInfo = null;                          // the user record
        ArrayList<String> orders = new ArrayList<String>();  // a user can place many orders, so collect them in a list

        // The values for one key hold both the customer record and the order records,
        // because the mapper emitted them all under the same key: the userId (cid).
        for (DataJoinWritable value : values) {
            if (value.getTag().equals("customer")) {
                customerInfo = value.getInfo();
            } else if (value.getTag().equals("order")) {
                orders.add(value.getInfo());
            }
        }

        // This branch gives the job left-join semantics: a customer who never placed
        // an order is still written out, with only the customer fields.
        if (orders.isEmpty()) {
            outputValue.set(customerInfo);
            context.write(NullWritable.get(), outputValue);
        }

        // The inner-join part of the result: one output line per matching order.
        // (An order whose userId matches no customer would be emitted with a literal
        // "null" prefix here; the sample data never triggers that case.)
        for (String order : orders) {
            outputValue.set(customerInfo + "," + order);
            context.write(NullWritable.get(), outputValue);
        }
    }

}
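For the sample data, the job should therefore produce the following joined lines, customer fields first and then the matching order's fields (the relative order of a customer's orders is not guaranteed, since Hadoop does not sort values within a key):

100000001,a,addressa,100000001,101,g1,100
100000001,a,addressa,100000001,102,g2,200
100000002,b,addressb,100000002,201,g1,100
100000002,b,addressb,100000002,202,g2,200
100000003,c,addressc,100000003,103,g3,300

Every customer in the sample data has at least one order, so the left-join branch never fires here.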

//Driver

package com.rabbit.hadoop.hive.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class DataJoinDriver extends Configured implements Tool {

    public int run(String[] args) throws Exception {

        Configuration configuration = getConf();

        Job job = Job.getInstance(configuration, this.getClass().getSimpleName());

        job.setJarByClass(DataJoinDriver.class);

        // args[0] and args[1] are the customer and order input files; args[2] is the output directory.
        FileInputFormat.setInputPaths(job, new Path(args[0]), new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        job.setMapperClass(DataJoinMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(DataJoinWritable.class);

        job.setReducerClass(DataJoinReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }


    public static void main(String[] args) throws Exception {

        Configuration configuration = new Configuration();

        // Hard-coded local paths for testing in the IDE; remove this line to take
        // the paths from the command line instead.
        args = new String[] {"D:\\input-join\\customer.txt", "D:\\input-join\\order.txt", "D:\\outputjoin"};

        int status = ToolRunner.run(configuration, new DataJoinDriver(), args);

        System.exit(status);
    }

}
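To run the same job on a cluster instead of locally, remove the hard-coded args line in main, package the classes into a jar, and pass real HDFS paths on the command line. A typical invocation might look like the following; the jar name and paths are placeholders, not from the original post:

hadoop jar hive-join.jar com.rabbit.hadoop.hive.join.DataJoinDriver /input-join/customer.txt /input-join/order.txt /output-join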
