Implementing Hive's join with a hand-written MapReduce job

Hive ships with join syntax built in, but implementing the join ourselves as a MapReduce program is a good way to consolidate our understanding of MR.

For example:

There is a user table with three fields: userId, userName, address.

There is an order table with four fields: userId, orderId, categoryId, price.

User records:

100000001 a addressa
100000002 b addressb
100000003 c addressc

Order records:

100000001 101 g1 100
100000001 102 g2 200
100000003 103 g3 300
100000002 201 g1 100
100000002 202 g2 200

Now implement the join of the user table and the order table.

//Every line the MR program reads is just a string; by itself it cannot tell whether it is a user record or an order record. We define a class that tags each line with a label identifying which kind of record it is.

package com.rabbit.hadoop.hive.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class DataJoinWritable implements WritableComparable<DataJoinWritable> {

    private String tag;  // label marking the record type: "customer" or "order"
    private String info; // the user or order record itself

    // Hadoop instantiates Writables reflectively, so a no-arg constructor is required.
    public DataJoinWritable() {
    }

    public DataJoinWritable(String tag, String info) {
        set(tag, info);
    }

    public void set(String tag, String info) {
        this.tag = tag;
        this.info = info;
    }

    public String getTag() {
        return tag;
    }

    public void setTag(String tag) {
        this.tag = tag;
    }

    public String getInfo() {
        return info;
    }

    public void setInfo(String info) {
        this.info = info;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(tag);
        out.writeUTF(info);
    }

    public void readFields(DataInput in) throws IOException {
        tag = in.readUTF();
        info = in.readUTF();
    }

    // This class is only ever used as a map output value, so Hadoop never sorts by it;
    // comparing tag first and then info still gives a consistent ordering that
    // honors the WritableComparable contract.
    public int compareTo(DataJoinWritable o) {
        int cmp = tag.compareTo(o.tag);
        return cmp != 0 ? cmp : info.compareTo(o.info);
    }

    @Override
    public String toString() {
        return tag + "," + info;
    }

}
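As a quick sanity check, the write/readFields pair can be exercised locally with plain Java streams, which is essentially what Hadoop does when shuffling values between map and reduce. This is a minimal sketch, not part of the job itself; the class name DataJoinWritableCheck is made up for illustration.

package com.rabbit.hadoop.hive.join;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class DataJoinWritableCheck {

    public static void main(String[] args) throws IOException {
        DataJoinWritable original = new DataJoinWritable("customer", "100000001,a,addressa");

        // Serialize into an in-memory buffer.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // Deserialize into a fresh instance and confirm the round trip.
        DataJoinWritable copy = new DataJoinWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        System.out.println(copy); // prints: customer,100000001,a,addressa
    }
}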

//Mapper

package com.rabbit.hadoop.hive.join;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


public class DataJoinMapper extends Mapper<LongWritable, Text, LongWritable, DataJoinWritable> {

    /**
     * Input records, both keyed by the same first field:
     *   customer: cid cname address
     *   order:    cid oid gname price
     */
    private LongWritable outputKey = new LongWritable();
    private DataJoinWritable outputValue = new DataJoinWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String lineValue = value.toString();
        String[] fields = lineValue.split(" ");

        // Distinguish the two record types simply by field count: a user record has
        // three fields, an order record has four. Anything else is malformed; drop the line.
        if (fields.length != 3 && fields.length != 4) {
            return;
        }

        // For both record types the first field is the userId, which becomes the join key.
        long cid = Long.parseLong(fields[0]);
        outputKey.set(cid);

        // Three fields: a user record. Tag it "customer" and keep the whole record.
        if (fields.length == 3) {
            outputValue.set("customer", fields[0] + "," + fields[1] + "," + fields[2]);
        }

        // Four fields: an order record. Tag it "order" and keep the whole record.
        if (fields.length == 4) {
            outputValue.set("order", fields[0] + "," + fields[1] + "," + fields[2] + "," + fields[3]);
        }

        // Emit one record: key is the userId, value is the tagged user or order record.
        context.write(outputKey, outputValue);
    }

}
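With the sample data above, the mapper's output (key → value, using DataJoinWritable's toString form) looks like this; the shuffle then groups the records by key, so each userId's customer record and order records arrive together at the same reduce call:

100000001 → customer,100000001,a,addressa
100000001 → order,100000001,101,g1,100
100000001 → order,100000001,102,g2,200
100000002 → customer,100000002,b,addressb
100000002 → order,100000002,201,g1,100
100000002 → order,100000002,202,g2,200
100000003 → customer,100000003,c,addressc
100000003 → order,100000003,103,g3,300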

//Reducer

package com.rabbit.hadoop.hive.join;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

 

public class DataJoinReducer extends Reducer<LongWritable, DataJoinWritable, NullWritable, Text> {

    private Text outputValue = new Text();

    @Override
    protected void reduce(LongWritable key, Iterable<DataJoinWritable> values, Context context) throws IOException, InterruptedException {

        String customerInfo = null;                          // the user record
        ArrayList<String> orders = new ArrayList<String>();  // a user can place many orders, so collect them in a list

        // The values for one key hold both the customer record and the order records,
        // because the mapper emitted them all under the same key: the userId (cid).
        for (DataJoinWritable value : values) {
            if (value.getTag().equals("customer")) {
                customerInfo = value.getInfo();
            } else if (value.getTag().equals("order")) {
                orders.add(value.getInfo());
            }
        }

        // This branch gives the job left-join semantics: a customer who never placed
        // an order is still written out, with only the customer fields.
        if (orders.isEmpty()) {
            outputValue.set(customerInfo);
            context.write(NullWritable.get(), outputValue);
        }

        // The inner-join part of the result: one output line per matching order.
        // (An order whose userId matches no customer would be emitted with a literal
        // "null" prefix here; the sample data never triggers that case.)
        for (String order : orders) {
            outputValue.set(customerInfo + "," + order);
            context.write(NullWritable.get(), outputValue);
        }
    }

}
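For the sample data, the job should therefore produce the following joined lines, customer fields first and then the matching order's fields (the relative order of a customer's orders is not guaranteed, since Hadoop does not sort values within a key):

100000001,a,addressa,100000001,101,g1,100
100000001,a,addressa,100000001,102,g2,200
100000002,b,addressb,100000002,201,g1,100
100000002,b,addressb,100000002,202,g2,200
100000003,c,addressc,100000003,103,g3,300

Every customer in the sample data has at least one order, so the left-join branch never fires here.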

//Driver

package com.rabbit.hadoop.hive.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class DataJoinDriver extends Configured implements Tool {

    public int run(String[] args) throws Exception {

        Configuration configuration = getConf();

        Job job = Job.getInstance(configuration, this.getClass().getSimpleName());

        job.setJarByClass(DataJoinDriver.class);

        // args[0] and args[1] are the customer and order input files; args[2] is the output directory.
        FileInputFormat.setInputPaths(job, new Path(args[0]), new Path(args[1]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        job.setMapperClass(DataJoinMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(DataJoinWritable.class);

        job.setReducerClass(DataJoinReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }


    public static void main(String[] args) throws Exception {

        Configuration configuration = new Configuration();

        // Hard-coded local paths for testing in the IDE; remove this line to take
        // the paths from the command line instead.
        args = new String[] {"D:\\input-join\\customer.txt", "D:\\input-join\\order.txt", "D:\\outputjoin"};

        int status = ToolRunner.run(configuration, new DataJoinDriver(), args);

        System.exit(status);
    }

}
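To run the same job on a cluster instead of locally, remove the hard-coded args line in main, package the classes into a jar, and pass real HDFS paths on the command line. A typical invocation might look like the following; the jar name and paths are placeholders, not from the original post:

hadoop jar hive-join.jar com.rabbit.hadoop.hive.join.DataJoinDriver /input-join/customer.txt /input-join/order.txt /output-join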
