[hadoop] (2) MapReduce: Distributed Cache
Preface
This chapter shows how to use Hadoop's Distributed Cache, which makes shared read-only data available to every node in the cluster.
Preparation
Dataset: ufo, 60,000 records. The dataset consists of UFO sighting records made up of the fields below; the fields of each record are tab-separated. For details see http://www.cnblogs.com/cafebabe-yun/p/8679994.html
- Sighting date: when the UFO sighting occurred
- Recorded date: when the sighting was reported
- Location: where the sighting occurred
- Shape: the shape of the UFO
- Duration: how long the sighting lasted
- Description: a rough description of the sighting
Example:
19950915 19950915 Redmond, WA 6 min. Young man w/ 2 co-workers witness tiny, distinctly white round disc drifting slowly toward NE. Flew in dir. 90 deg. to winds.
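Since every field boundary is a tab, a record can be taken apart with a single split. A quick sketch (the shape value and shortened description here are made up for illustration):

import java.util.Arrays;

// Minimal sketch: a UFO record is just six tab-separated fields.
public class RecordSplitDemo {
    public static void main(String[] args) {
        String record = "19950915\t19950915\tRedmond, WA\tdisc\t6 min.\t"
                + "Young man w/ 2 co-workers witness tiny, white round disc.";
        String[] fields = record.split("\t");
        System.out.println("field count: " + fields.length); // 6
        System.out.println("location: " + fields[2]);        // Redmond, WA
        System.out.println(Arrays.toString(fields));
    }
}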
Data to share: the mapping from state abbreviations to their full names
Data:
AL Alabama
AK Alaska
AZ Arizona
AR Arkansas
CA California
Introduction to the Distributed Cache
Purpose: the Distributed Cache distributes common read-only files needed by map and reduce tasks to every node in the cluster.
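The pattern has two halves: the driver registers a file that already lives on HDFS, and each task then reads the local copy that the framework has placed on its node. A minimal sketch of the two calls this chapter relies on (the file path is an example; error handling is the caller's):

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class DistributedCachePattern {
    // Driver side: register a file that already sits on HDFS.
    static void registerCacheFile(JobConf conf) throws URISyntaxException {
        DistributedCache.addCacheFile(new URI("/user/root/states.txt"), conf);
    }

    // Task side (typically in Mapper.configure): the framework has already
    // copied the file to local disk on every node; read it as a local file.
    static String localCachePath(JobConf conf) throws IOException {
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
        return cacheFiles[0].toString();
    }
}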
Using the Distributed Cache
Task: use the shared data to replace state abbreviations with their full names
- Save the shared data shown above as a file named states.txt
- Upload states.txt to HDFS:
hadoop dfs -put states.txt states.txt
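To confirm the upload landed in your HDFS home directory (the same place the job will read it from), you can list it:
hadoop dfs -ls states.txt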
- Write UFORecordValidationMapper.java:
import java.io.IOException;

import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;

// First mapper in the chain: passes a record through unchanged if it is
// well-formed, i.e. has exactly six tab-separated fields; drops it otherwise.
public class UFORecordValidationMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, LongWritable, Text> {

    public void map(LongWritable key, Text value,
            OutputCollector<LongWritable, Text> output, Reporter reporter)
            throws IOException {
        String line = value.toString();
        if (validate(line)) {
            output.collect(key, value);
        }
    }

    // A record is valid only if it splits into exactly six fields.
    private boolean validate(String str) {
        String[] parts = str.split("\t");
        return parts.length == 6;
    }
}
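Because this mapper re-emits its input key and value untouched, it acts purely as a filter: only lines with all six fields reach the next mapper in the chain.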
- Write UFOLocation2.java:
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapred.lib.*;

public class UFOLocation2 {

    public static class MapClass extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, LongWritable> {

        private final static LongWritable one = new LongWritable(1);
        // Matches a two-letter state abbreviation at the end of the location field.
        private static Pattern locationPattern = Pattern.compile("[a-zA-Z]{2}[^a-zA-Z]*$");
        private Map<String, String> stateNames;

        @Override
        public void configure(JobConf job) {
            try {
                // Fetch the node-local copy of the cached states.txt and load it.
                Path[] cacheFiles = DistributedCache.getLocalCacheFiles(job);
                setupStateMap(cacheFiles[0].toString());
            } catch (IOException e) {
                System.err.println("Error reading state file.");
                System.exit(1);
            }
        }

        // Build the abbreviation -> full name lookup table from the cached file.
        private void setupStateMap(String fileName) throws IOException {
            Map<String, String> stateCache = new HashMap<String, String>();
            BufferedReader reader = new BufferedReader(new FileReader(fileName));
            String line = null;
            while ((line = reader.readLine()) != null) {
                String[] splits = line.split("\t");
                stateCache.put(splits[0], splits[1]);
            }
            stateNames = stateCache;
        }

        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            String line = value.toString();
            String[] fields = line.split("\t");
            String location = fields[2].trim();
            if (location.length() >= 2) {
                Matcher matcher = locationPattern.matcher(location);
                if (matcher.find()) {
                    int start = matcher.start();
                    String state = location.substring(start, start + 2);
                    // Emit the full state name (or the raw code if unknown) with a count of 1.
                    output.collect(new Text(lookupState(state.toUpperCase())), one);
                }
            }
        }

        private String lookupState(String state) {
            String fullName = stateNames.get(state);
            if (fullName == null || "".equals(fullName)) {
                fullName = state;
            }
            return fullName;
        }
    }

    public static void main(String... args) throws Exception {
        Configuration config = new Configuration();
        JobConf conf = new JobConf(config, UFOLocation2.class);
        conf.setJobName("UFOLocation2");

        // Register the shared HDFS file with the distributed cache.
        DistributedCache.addCacheFile(new URI("/user/root/states.txt"), conf);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(LongWritable.class);

        // Chain the two mappers: record validation first, then the state count.
        JobConf mapconf1 = new JobConf(false);
        ChainMapper.addMapper(conf, UFORecordValidationMapper.class,
                LongWritable.class, Text.class, LongWritable.class, Text.class,
                true, mapconf1);
        JobConf mapconf2 = new JobConf(false);
        ChainMapper.addMapper(conf, MapClass.class,
                LongWritable.class, Text.class, Text.class, LongWritable.class,
                true, mapconf2);
        conf.setMapperClass(ChainMapper.class);

        conf.setCombinerClass(LongSumReducer.class);
        conf.setReducerClass(LongSumReducer.class);

        FileInputFormat.setInputPaths(conf, args[0]);
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}
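The job therefore runs each record through UFORecordValidationMapper, then MapClass, and finally sums the per-state ones with LongSumReducer as both combiner and reducer. Note that org.apache.hadoop.mapred and the DistributedCache class belong to the older MapReduce API; on Hadoop 2.x the cache half of this job would look roughly like the following sketch (class names are illustrative; '#states.txt' names the local symlink the framework creates in the task's working directory):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;

public class CacheFileNewApi {

    public static class StateMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private final Map<String, String> stateNames = new HashMap<String, String>();

        @Override
        protected void setup(Context context) throws IOException {
            // The cached file is symlinked into the task's working directory
            // under the name given after '#', so open it as a local file.
            BufferedReader reader = new BufferedReader(new FileReader("states.txt"));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] splits = line.split("\t");
                stateNames.put(splits[0], splits[1]);
            }
            reader.close();
        }
        // map(...) would use stateNames exactly as in the old-API version above.
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "UFOLocation-newapi");
        job.setJarByClass(CacheFileNewApi.class);
        // Register the HDFS file; the fragment names the local symlink.
        job.addCacheFile(new URI("/user/root/states.txt#states.txt"));
        // ... remaining job setup (mapper, reducer, input/output paths) as usual ...
    }
}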
- Compile both source files (the Hadoop jars must be on the classpath):
javac UFORecordValidationMapper.java UFOLocation2.java
- Package the compiled class files into a jar (the wildcard also picks up the inner class UFOLocation2$MapClass.class):
jar cvf ufo.jar UFO*class
- Submit the jar to Hadoop and run it (ufo.tsv is the sighting dataset on HDFS; output is the result directory):
hadoop jar ufo.jar UFOLocation2 ufo.tsv output
- Copy the result from HDFS to the local filesystem:
hadoop dfs -get output/part-00000 ufo_result.txt
- View the result:
more ufo_result.txt
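Each line of the result holds a state name followed by a tab and the number of sightings counted for that state; locations whose two-letter code is not listed in states.txt appear under the raw code instead of a full name.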