Hadoop Map-Side Join
Map-side joins are more involved to implement than reduce-side joins and come with more restrictions. The common approach, used here, is to hold the small table entirely in memory and, for each record of the large table, look up its matching record there.
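The core lookup pattern can be sketched in plain Java, independent of Hadoop; this is only an illustrative sketch, with the sample rows taken from the data below. Build a hash map from the small table once, then probe it with the join key of every large-table record.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HashJoinSketch {
    public static void main(String[] args) {
        // Small table (accounts): account id -> customer name, held in memory.
        Map<String, String> accounts = new HashMap<String, String>();
        accounts.put("002", "Abigail Smith");
        accounts.put("003", "April Stevens");

        // Large table (sales) is streamed one record at a time.
        List<String> sales = Arrays.asList(
                "002\t12.29\t2004-07-02",
                "003\t499.99\t2010-12-20");

        for (String record : sales) {
            String id = record.split("\t")[0];
            String name = accounts.get(id);   // O(1) probe into the small table
            if (name != null) {               // skip records with no match
                System.out.println(name + "\t" + record);
            }
        }
    }
}

The MapReduce version below follows exactly this shape: setup() builds the map, map() performs the probe.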
This example is adapted from Hadoop Beginner's Guide. We join sales with accounts: sales records customers' purchases, and accounts records customer account information. The goal is to compute, for each customer, the number of purchases and the total amount spent.
The data is as follows (fields are tab-separated):
sales.txt
002  12.29   2004-07-02
004  13.42   2005-12-20
003  499.99  2010-12-20
001  78.95   2012-04-02
002  21.99   2006-11-30
002  93.45   2008-09-10
001  9.99    2012-05-17
accounts.txt
002  Abigail Smith  Premium   2004-07-13
003  April Stevens  Standard  2010-12-20
004  Nasser Hafez   Premium   2001-04-23
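Note that sales.txt contains customer 001, which has no row in this accounts.txt excerpt; the null check in the mapper below simply drops those two sales. Running the job on this data should therefore produce roughly:

Abigail Smith  3  127.730000
April Stevens  1  499.990000
Nasser Hafez   1  13.420000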
The code is as follows:
import java.io.*;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MapJoin {

    public static class MapJoinMapper extends Mapper<Object, Text, Text, Text> {

        // Small table (accounts): account id -> customer name.
        private Map<String, String> joinData = new HashMap<String, String>();

        // Load the small table from the distributed cache before any map() call.
        @Override
        public void setup(Context context) throws IOException, InterruptedException {
            Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            BufferedReader reader = new BufferedReader(new FileReader(paths[0].toString()));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    String[] fields = line.split("\t");
                    joinData.put(fields[0], fields[1]);
                }
            } finally {
                reader.close();
            }
        }

        // Perform the join: probe the in-memory table with each sales record.
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            String name = joinData.get(fields[0]);
            if (name != null) { // skip sales records with no matching account
                context.write(new Text(name), value);
            }
        }
    }

    public static class MapJoinReducer extends Reducer<Text, Text, Text, Text> {

        // Aggregate per customer: number of purchases and total amount spent.
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            int count = 0;
            double total = 0.0;
            for (Text val : values) {
                count++;
                String[] fields = val.toString().split("\t");
                total += Double.parseDouble(fields[1]);
            }
            context.write(key, new Text(String.format("%d\t%f", count, total)));
        }
    }

    public static void main(String[] args) throws Exception {
        // args[0]: sales input path, args[1]: accounts file (small table), args[2]: output path
        Configuration conf = new Configuration();
        DistributedCache.addCacheFile(new Path(args[1]).toUri(), conf);

        Job job = new Job(conf, "MapJoin");
        job.setJarByClass(MapJoin.class);
        job.setMapperClass(MapJoinMapper.class);
        job.setReducerClass(MapJoinReducer.class);

        // Output key/value types (apply to both map and reduce output here).
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input and output paths.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        // Submit the job and wait for it to finish.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
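Assuming the class is packaged into a jar (the jar name and paths below are placeholders), the job takes the sales file as input, the accounts file as the cached small table, and an output directory:

hadoop jar mapjoin.jar MapJoin /data/sales.txt /data/accounts.txt /data/output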