一、基本原理
数据源--MapReduce--数据结果
temperature.txt -- JAR -- part-r-00000
二、实验过程
1. 数据源
气象数据的下载地址为 ftp://ftp3.ncdc.noaa.gov/pub/data/noaa/ ,其中包含1900年至今所有年份的气象数据,总大小约70多GB。数据源格式示例如下(原始文件中每条记录占一行):
0323999999038531972122912005+30400-081417SAO +0005NRB V02099959000050304849N011200599+00945+00615102435ADDGA1999+030484034GA2999+076204004GD13075+9999999GD24015+9999999GD30995+9999999GD40995+9999999GF108085999999999999999999MA1999999102375EQDN01 00000JPWTH 1QNNE11 1 00409E11 1 00601E11 1 00099E11 1 00099G11 1 00100H11 1 23100H11 1 32250K11 1 00043L11 1 00700M11 1 30230N11 1 00000Q11 1 10243S11 1 00049V11 1 01010X11 1 00000 0383999999038531972122915005+30400-081417SAO +0005NRB V0200405N003652200059N009600599+01565+01175102545ADDGA1999+076204004GA2999+999999999GA3999+999999999GA4999+999999999GD12015+9999999GD20005+9999999GD30005+9999999GD40005+9999999GF101015999999999999999999MA1999999102475MW1055EQDN01 08100JPWTH 1QNNE11 1 00201E11 1 00000E11 1 00000E11 1 00000G11U1 99999H11 1 32250H11 1 00999H11 1 00999H11 1 00999K11 1 00053L11 1 00600M11 1 30260N11 1 08100Q11 1 10254S11 1 00060V11 1 00101X11 1 04007 0383999999038531972122918005+30400-081417SAO +0005NRB V0200405N003150762049N009600599+01725+01335102535ADDGA1999+030484034GA2999+076204004GA3999+999999999GA4999+999999999GD12015+9999999GD23065+9999999GD30005+9999999GD40005+9999999GF106055999999999999999999MA1999999102475MW1055EQDN01 08100JPWTH 1QNNE11 1 00201E11 1 00407E11 1 00000E11 1 00000G11 1 00250H11 1 23100H11 1 32250H11 1 00999H11 1 00999K11 1 00056L11 1 00600M11 1 30260N11 1 08100Q11 1 10253S11 1 00063V11 1 00806X11 1 04006 0383999999038531972122921005+30400-081417SAO +0005NRB V0200305N003652200059N009600599+01725+01445102385ADDGA1999+999999084GA2999+076204004GA3999+999999999GA4999+999999999GD12005+9999999GD20025+9999999GD30005+9999999GD40005+9999999GF102025999999999999999999MA1999999102345MW1055EQDN01 08100JPWTH 1QNNE11 1 00200E11 1 00002E11 1 00000E11 1 00000G11U1 99999H11 1 11999H11 1 32250H11 1 00999H11 1 00999K11 1 00058L11 1 00600M11 1 30220N11 1 08100Q11 1 10238S11 1 00063V11 1 00202X11 1 03007 0383999999038531972123000005+30400-081417SAO +0005NRB 
V0200105N002152200059N009600599+01565+01445102535ADDGA1999+009144084GA2999+076204004GA3999+999999999GA4999+999999999GD12015+9999999GD22015+9999999GD30005+9999999GD40005+9999999GF102025999999999999999999MA1999999102475MW1445EQDN01 07200JPWTH 1QNNE11 1 00201E11 1 00201E11 1 00000E11 1 00000G11U1 99999H11 1 11030H11 1 32250H11 1 00999H11 1 00999K11 1 00058L11 1 00600M11 1 30260N11 1 07200Q11 1 10253S11 1 00060V11 1 00202X11 1 01004 0383999999038531972123003005+30400-081417SAO +0005NRB V0200605N001052200059N009600599+01565+01445102515ADDGA1999+076204004GA2999+999999999GA3999+999999999GA4999+999999999GD12015+9999999GD20005+9999999GD30005+9999999GD40005+9999999GF101015999999999999999999MA1999999102445MW1445EQDN01 07200JPWTH 1QNNE11 1 00201E11 1 00000E11 1 00000E11 1 00000G11U1 99999H11 1 32250H11 1 00999H11 1 00999H11 1 00999K11 1 00058L11 1 00600M11 1 30250N11 1 07200Q11 1 10251S11 1 00060V11 1 00101X11 1 06002 0383999999038531972123006005+30400-081417SAO +0005NRB V0200905N002152200059N009600599+01725+01505102375ADDGA1999+004574084GA2999+076204004GA3999+999999999GA4999+999999999GD12025+9999999GD22025+9999999GD30005+9999999GD40005+9999999GF104025999999999999999999MA1999999102305MW1445EQDN01 07200JPWTH 1QNNE11 1 00202E11 1 00203E11 1 00000E11 1 00000G11U1 99999H11 1 11015H11 1 32250H11 1 00999H11 1 00999K11 1 00059L11 1 00600M11 1 30210N11 1 07200Q11 1 10237S11 1 00063V11 1 00503X11 1 09004 0329999999038531972123009005+30400-081417SAO +0005NRB V02099959000050762049N009600599+01445+01335102275ADDGA1999+036584034GA2999+076204004GD12025+9999999GD24065+9999999GD30995+9999999GD40995+9999999GF108085999999999999999999MA1999999102205MW1445EQDN01 07200JPWTH 1QNNE11 1 00203E11 1 00607E11 1 00099E11 1 00099G11 1 00250H11 1 23120H11 1 32250K11 1 00056L11 1 00600M11 1 30180N11 1 07200Q11 1 10227S11 1 00058V11 1 01010X11 1 00000 0383999999038531972123012005+30400-081417SAO +0005NRB 
V02099959000050762049N008000599+01335+01335102325ADDGA1999+030484034GA2999+076204004GA3999+999999999GA4999+999999999GD12035+9999999GD23035+9999999GD30005+9999999GD40005+9999999GF106065999999999999999999MA1999999102275MW1445EQDN01 07200JPWTH 1QNNE11 1 00204E11 1 00404E11 1 00000E11 1 00000G11 1 00250H11 1 23100H11 1 32250H11 1 00999H11 1 00999K11 1 00056L11 1 00500M11 1 30200N11 1 07200Q11 1 10232S11 1 00056V11 1 00807X11 1 00000 0356999999038531972123015005+30400-081417SAO +0005NRB V02099959000050152449N009600599+01785+01725102325ADDGA1999+015244064GA2999+030484034GA3999+076204004GD13065+9999999GD23015+9999999GD34015+9999999GD40995+9999999GF108085999999999999999999MA1999999102275MW1445EQDN01 07200JPWTH 1QNNE11 1 00408E11 1 00401E11 1 00601E11 1 00099G11 1 00050H11 1 15050H11 1 23100H11 1 32250K11 1 00063L11 1 00600M11 1 30200N11 1 07200Q11 1 10232S11 1 00064V11 1 01010X11 1 00000 0329999999038531972123018005+30400-081417SAO +0005NRB V0201805N002150152449N009600599+02175+01835102275ADDGA1999+007624084GA2999+015244064GD12015+9999999GD24075+9999999GD30995+9999999GD40995+9999999GF108085999999999999999999MA1999999102205MW1615EQDN01 02000JPWTH 1QNNE11 1 00201E11 1 00609E11 1 00099E11 1 00099G11 1 00050H11 1 11025H11 1 15050K11 1 00065L11 1 00600M11 1 30180N11 1 02000Q11 1 10227S11 1 00071V11 1 01010X11 1 18004 0377999999038531972123021005+30400-081417SAO +0005NRB V0200905N002150152449N011200599+02005+01615102005ADDGA1999+015244064GA2999+076204004GA3999+999999999GA4999+999999999GD13055+9999999GD23025+9999999GD30005+9999999GD40005+9999999GF107075999999999999999999MA1999999101965EQDN01 00000JPWTH 1QNNE11 1 00406E11 1 00403E11 1 00000E11 1 00000G11 1 00050H11 1 15050H11 1 32250H11 1 00999H11 1 00999K11 1 00061L11 1 00700M11 1 30110N11 1 00000Q11 1 10200S11 1 00068V11 1 00909X11 1 09004 0377999999038531972123100005+30400-081417SAO +0005NRB 
V0201005N003650762049N011200599+01835+01505102015ADDGA1999+076204004GA2999+999999999GA3999+999999999GA4999+999999999GD13065+9999999GD20005+9999999GD30005+9999999GD40005+9999999GF106045999999999999999999MA1999999101965EQDN01 00000JPWTH 1QNNE11 1 00407E11 1 00000E11 1 00000E11 1 00000G11 1 00250H11 1 32250H11 1 00999H11 1 00999H11 1 00999K11 1 00059L11 1 00700M11 1 30110N11 1 00000Q11 1 10201S11 1 00065V11 1 00705X11 1 10007 0377999999038531972123103005+30400-081417SAO +0005NRB V0201505N004152200059N011200599+01835+01615101875ADDGA1999+076204004GA2999+999999999GA3999+999999999GA4999+999999999GD12025+9999999GD20005+9999999GD30005+9999999GD40005+9999999GF102025999999999999999999MA1999999101835EQDN01 00000JPWTH 1QNNE11 1 00203E11 1 00000E11 1 00000E11 1 00000G11U1 99999H11 1 32250H11 1 00999H11 1 00999H11 1 00999K11 1 00061L11 1 00700M11 1 30070N11 1 00000Q11 1 10187S11 1 00065V11 1 00302X11 1 15008 0329999999038531972123106005+30400-081417SAO +0005NRB V02099959000050152449N009600599+01675+01565101935ADDGA1999+015244064GA2999+076204004GD13055+9999999GD24035+9999999GD30995+9999999GD40995+9999999GF108085999999999999999999MA1999999101865MW1445EQDN01 07200JPWTH 1QNNE11 1 00406E11 1 00604E11 1 00099E11 1 00099G11 1 00050H11 1 15050H11 1 32250K11 1 00060L11 1 00600M11 1 30080N11 1 07200Q11 1 10193S11 1 00062V11 1 01010X11 1 00000 0323999999038531972123109005+30400-081417SAO +0005NRB V0201905N002650304849N011200599+01945+01615101825ADDGA1999+030484034GA2999+076204004GD13055+9999999GD24035+9999999GD30995+9999999GD40995+9999999GF108085999999999999999999MA1999999101765EQDN01 00000JPWTH 1QNNE11 1 00406E11 1 00604E11 1 00099E11 1 00099G11 1 00100H11 1 23100H11 1 32250K11 1 00061L11 1 00700M11 1 30050N11 1 00000Q11 1 10182S11 1 00067V11 1 01010X11 1 19005 0323999999038531972123112005+30400-081417SAO +0005NRB 
V0202005N002650304849N011200599+01835+01615101925ADDGA1999+030484034GA2999+076204004GD13065+9999999GD24025+9999999GD30995+9999999GD40995+9999999GF108085999999999999999999MA1999999101865EQDN01 00000JPWTH 1QNNE11 1 00407E11 1 00603E11 1 00099E11 1 00099G11 1 00100H11 1 23100H11 1 32250K11 1 00061L11 1 00700M11 1 30080N11 1 00000Q11 1 10192S11 1 00065V11 1 01010X11 1 20005 0323999999038531972123115005+30400-081417SAO +0005NRB V0201605N007750304849N011200599+02115+01675101905ADDGA1999+030484034GA2999+076204004GD13065+9999999GD24025+9999999GD30995+9999999GD40995+9999999GF108085999999999999999999MA1999999101865EQDN01 00000JPWTH 1QNNE11 1 00407E11 1 00603E11 1 00099E11 1 00099G11 1 00100H11 1 23100H11 1 32250K11 1 00062L11 1 00700M11 1 30080N11 1 00000Q11 1 10190S11 1 00070V11 1 01010X11 1 16015 0350999999038531972123118005+30400-081417SAO +0005NRB V0201705N006250365849N011200599+02395+01835101925ADDGA1999+009144084GA2999+036584034GA3999+076204004GD12015+9999999GD23055+9999999GD34025+9999999GD40995+9999999GF108085999999999999999999MA1999999101865EQDN01 00000JPWTH 1QNNE11 1 00201E11 1 00406E11 1 00603E11 1 00099G11 1 00120H11 1 11030H11 1 23120H11 1 32250K11 1 00065L11 1 00700M11 1 30080N11 1 00000Q11 1 10192S11 1 00075V11 1 01010X11 1 17012 0350999999038531972123121005+30400-081417SAO +0005NRB V0201405N003150304849N011200599+02115+01785101745ADDGA1999+009144084GA2999+030484034GA3999+076204004GD12015+9999999GD23065+9999999GD34025+9999999GD40995+9999999GF108085999999999999999999MA1999999101695EQDN01 00000JPWTH 1QNNE11 1 00201E11 1 00407E11 1 00602E11 1 00099G11 1 00100H11 1 11030H11 1 23100H11 1 32250K11 1 00064L11 1 00700M11 1 30030N11 1 00000Q11 1 10174S11 1 00070V11 1 01010X11 1 14006
下面写一个Map-Reduce作业,求每年的最低温度。
2. 编写 Java 处理类
MinTemperature.java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the "minimum temperature per year" MapReduce job.
 *
 * <p>Wires {@link MinTemperatureMapper} and {@link MinTemperatureReducer} into a
 * single job that reads from the input path and writes
 * {@code (Text year, IntWritable minTemperature)} pairs to the output path.
 */
public class MinTemperature {

    /**
     * Configures and runs the job, then exits the JVM.
     *
     * @param args exactly two elements: {@code args[0]} is the input path,
     *             {@code args[1]} is the output path
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: MinTemperature <input path> <output path>");
            System.exit(-1);
        }

        Job job = new Job();
        job.setJarByClass(MinTemperature.class);
        job.setJobName("Min temperature");

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MinTemperatureMapper.class);
        // min() is associative and commutative and the reducer's input and
        // output types are identical, so it can also run as a combiner to
        // shrink the data shuffled to the reducers without changing results.
        job.setCombinerClass(MinTemperatureReducer.class);
        job.setReducerClass(MinTemperatureReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Exit status: 0 when the job completed successfully, 1 otherwise.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Mapper类
import java.io.IOException; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; public class MinTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable>{ private static final int MISSING = 9999; @Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); String year = line.substring(15, 19); int airTemperature; if(line.charAt(87) == '+') { airTemperature = Integer.parseInt(line.substring(88, 92)); } else { airTemperature = Integer.parseInt(line.substring(87, 92)); } String quality = line.substring(92, 93); if(airTemperature != MISSING && quality.matches("[01459]")) { context.write(new Text(year), new IntWritable(airTemperature)); } } }
Reducer类
import java.io.IOException; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; public class MinTemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> { @Override public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int minValue = Integer.MAX_VALUE; for(IntWritable value : values) { minValue = Math.min(minValue, value.get()); } context.write(key, new IntWritable(minValue)); } }
3. 打JAR包
javac -classpath ../hadoop-core-1.1.2.jar *.java
jar cvf ./MinTemperature.jar ./Min*.class
4. 设置数据源目录,并将数据文件放入其中
hadoop fs -mkdir -p /class5/in
hadoop fs -copyFromLocal temperature.txt /class5/in
hadoop fs -ls /class5/in
5. 运行程序
hadoop jar MinTemperature.jar MinTemperature /class5/in/temperature.txt /class5/out
6. 查看结果
hadoop fs -ls /class5/out
hadoop fs -cat /class5/out/part-r-00000
1971 -461
1972 -267
1973 -390