Hadoop 二次排序
需求
求每年的最高气温,年份升序,温度求最高
数据源内容如下
temperature.txt
2004 49 1981 -22 1981 -31 1965 -47 2027 -2 1964 6 2030 38 2016 -33 1963 13 2000 21 2019 0 2049 43 2039 8 1989 -18 2017 49 1952 -47 2016 -28 1991 20 1967 -39 2022 -47 2041 41 2039 -38 2021 33 1969 38 1981 0 1960 -26 2023 -12 1969 12 1996 -31 1954 -36 2026 34 2013 -4 1969 37 1990 -22 2007 -31 1987 -8 1972 -30 2019 -17 2042 -22 2011 21 2033 -25 2013 10 2047 30 2008 -2 2047 -5 1994 14 1960 7 2037 44 1990 -41 2047 32 2048 -22 1977 -27 2049 35 2023 2 1952 -44 1979 -5 1996 47 2033 8 2006 3 2030 32 1967 43 1980 -6 2001 39 2049 -31 2028 -16 2029 31 1962 -21 2043 -7 2040 34 2001 9 1977 -21 2047 1 2022 30 2002 12 1956 38 2009 7 2049 11 1981 18 2014 -29 1967 -15 2019 2 1975 25 1965 21 2013 -36 2024 -44 1959 10 1992 4 1997 15 2042 17 2013 -14 1993 -21 2027 19 2016 -44 1989 -47 1999 -6 1993 -35 1953 -21 1952 12 1969 -45 2036 10 1950 29 2022 8 1985 -45 2044 -48 1981 -12 2033 -42 1973 -49 2011 27 1958 -26 2028 35 2037 41 1955 -36 2001 -11 1965 23 1970 -14 2015 -2 1969 -19 1997 3 2016 -38 2045 9 1974 6 1956 -39 2012 1 2022 -28 1991 -31 1974 -40 1998 43 2007 12 2049 9 2034 -18 1956 48 1974 40 2009 -24 2030 -44 1957 27 1979 -23 2034 29 2024 -34 2034 -10 2007 42 2000 33 1990 -44 2048 -48 1967 -30 1969 12 2030 26 2023 -36 2029 22 2044 -2 2043 -47 2040 -18 1990 -3 1996 -16 1974 -20 2023 -11 1990 -16 1980 13 2013 -8 2001 41 2015 -30 1974 28 2031 13 1991 -33 1985 -6 1979 -34 2041 12 1957 -46 2014 25 1969 18 1958 -39 1955 -46 2031 39 2032 11 1991 38 2035 -43 2005 -1 2000 2 2027 -28 1984 -8 1985 -47 2045 -6 1987 -21 2004 35 1968 -47 1968 -19 1995 -47 1990 46 1987 18 2012 29 1987 -12 2048 -8 1987 26 2010 18 1959 -20 1978 8 1997 38 1963 24 1991 8 2005 -34 2019 -4 2042 43 1951 6 1956 -32 1952 18 2003 -15 1979 29 2026 35 2032 -26 2044 -25 2039 -36 2021 49 2037 6 2000 -22 2027 34 2024 38 2019 15 1954 -27 2016 49 2018 -43 2048 23 1978 9 1977 5 2047 -30 2028 -12 1991 -25 2022 -36 1974 -2 2038 25 2014 10 2000 -7 2033 16 2020 5 1985 7 1951 -1 1958 -8 1963 -3 1972 10 1986 9 1961 3 1972 -20 1979 -39 
1958 44 2027 -48 2007 -50 2025 33 1970 22 2044 27 2043 -48 1950 1 2023 31 2041 -39 2040 43 2025 21 2038 39 1998 16 1987 -50 1967 -40 2021 -27 1961 6 1981 22 1990 7 1993 -49 2001 -5 2003 21 1990 47 1986 -19 2031 37 1987 -14 2019 16 2008 45 2044 1 1977 5 1952 10 2047 5 2044 21 2002 29 1992 28 1980 -2 1952 -47 2008 15 2017 17 1970 1 2045 -37 2016 5 1951 -28 1978 5 1954 9 1966 18 1957 45 1998 -26 1989 0 1964 10 2036 -44 2037 -22 1965 12 2035 40 1994 7 2024 7 1961 4 2007 34 1980 -36 1950 -39 1987 24 1983 -4 2007 46 2009 -5 1974 43 2026 26 1966 -21 2006 -21 1977 -3 1979 -31 2021 33 2040 39 2020 47 1953 -42 1955 2 2017 0 1973 31 1955 4 1973 -7 2027 28 1968 -17 2029 -3 2021 13 1991 9 2030 19 1952 -35 1987 14 1954 -18 2027 -23 1989 12 1983 13 1966 -45 2039 33 2014 34 2012 -30 1953 -7 2020 -21 1987 22 2041 45 2046 0 2017 26 1951 9 2000 -4 1973 27 1972 -3 2036 -14 1974 32 1987 -8 1993 3 1969 17 2011 -11 2038 -50 2040 -8 1950 -22 2036 13 2025 29 1986 27 2038 41 1971 37 1970 45 2045 -21 2036 41 1956 1 2042 -48 1955 -28 1967 -34 1999 -42 1952 -9 1962 -15 1974 -19 1959 19 1965 -42 1962 41 2003 -12 2029 14 1969 26 1992 -4 1959 8 1962 -18 2000 8 2025 -20 2048 -15 1996 25 2017 -23 1992 -10 2001 30 1960 45 2034 33 1983 -47 2046 19 2041 -4 1978 -6 1967 -49 1993 8 1987 -11 2009 3 1990 40 1972 -6 2029 -47 1990 3 2036 4 1981 22 2019 37 1980 -47 2003 -42 1965 -6 2007 45 2040 -45 1984 24 2048 -15 1984 -16 1992 -39 2040 -33 1984 -24 2046 28 2023 -3 1956 46 1969 0 1983 -4 2030 -50 2004 -36 1958 16 2025 -22 1957 -6 2001 -24 2014 -49 1965 16 2043 42 1966 -10 1971 -13 1996 48 1976 11 2026 -43 1982 2 1965 -50 2038 40 2024 -32 1988 3 2004 -45 2039 8 2029 -30 1974 -11 2033 29 1968 -2 2040 -8 1989 -11 1999 7 2001 37 2001 -44 1979 -30 2048 7 1998 -21 2005 49 1975 44 2031 31 1982 12 1987 35 2004 -33 2000 27 2008 34 1970 -26 2047 0 1974 35 1977 -45 1976 19 1956 48 2025 -37 1991 0 2041 -40 1976 38 2016 36 2024 6 2021 14 2005 27 1951 -38 2046 16 1976 26 2044 -44 1989 -47 2025 26 2045 43 2045 -23 2004 30 
2044 46 1962 -20 1954 7 1975 -39 1967 18 2038 4 1956 15 2010 -14 2032 -6 1999 19 2024 7 1993 -23 1961 -43 2007 23 1998 9 2027 -29 1950 29 2010 -47 1953 43 2033 -19 1977 28 2013 -36 2001 43 2008 46 2004 19 1985 6 2043 3 2014 -21 1992 7 1990 8 2020 44 1957 -40 2030 5 1996 16 2018 -5 1989 -14 2016 -11 1988 -18 2012 -3 1998 -12 1979 -41 2043 1 1978 -12 1959 -29 2048 -26 1989 -31 2026 33 1960 32 1978 14 2003 36 2012 15 2036 34 2040 -49 1986 7 1982 19 1959 42 2041 23 2037 20 2020 -24 1977 -27 2039 18 2046 2 2017 -23 2012 30 1962 28 1985 42 2023 15 2030 -30 1983 28 1967 26 1990 -11 1968 -50 2038 -11 1995 34 2005 -43 2011 5 1978 9 1952 -48 1955 27 1958 -21 2020 -36 1985 -23 1991 10 1982 -17 1999 3 1999 -25 2005 -11 2048 -14 1985 -18 2006 -5 1970 -21 2026 -26 1956 -20 2043 -50 1982 -24 1998 8 2034 28 1966 -10 2045 5 1968 -49 2001 48 2026 -9 2005 49 2036 39 2027 -45 1972 -24 2009 -49 1961 38 1991 36 1975 37 1978 12 2003 -45 2021 -46 1962 -8 1972 -8 1961 39 2009 23 1995 30 1996 -19 1983 45 1952 19 1974 -24 1992 33 1981 -1 1981 -32 1984 0 2049 -41 2030 13 1993 -27 1980 -45 1964 -10 2013 39 1975 24 1972 43 1977 -33 1962 -44 2016 -22 2029 47 1999 41 2030 -17 2023 36 2018 32 2025 20 1966 14 1986 29 2036 -20 2022 -36 2027 -46 1994 -8 1992 34 2017 1 2021 32 1966 28 1987 -22 1996 26 1991 48 1993 4 1973 -28 1981 -16 2011 45 1963 -14 1986 -50 1984 -26 1980 30 2024 42 1979 31 2030 3 2035 17 2036 30 2017 -43 1997 9 2004 -25 1999 40 1993 16 1965 -42 2043 24 2017 29 2034 -39 1952 -49 2023 26 1999 -31 1986 23 1962 -10 1960 22 2036 -30 2044 38 2014 -50 1986 0 2024 -40 1962 -15 1950 11 2019 30 1980 -16 1992 -18 1994 -40 1989 33 1999 23 1999 -38 2021 -38 2033 17 1995 -2 2034 -9 2017 -36 1956 -41 1961 1 2020 46 1991 -17 2026 2 2004 9 1976 -7 1956 -4 1981 41 2014 0 1975 -41 2005 47 1966 -47 1968 -27 1953 48 2028 32 1963 40 1982 34 2031 27 2008 1 2037 10 2000 -1 2038 -4 2044 -12 1960 -4 2014 10 2038 -42 1964 -48 1994 -47 1953 -30 1987 -24 2038 5 2027 43 1991 7 2015 21 2038 -2 1999 28 2026 -50 
1986 25 2041 -24 2029 -1 2008 18 1952 -41 1969 -50 1973 6 1956 -20 1966 -21 1967 44 1967 39 2035 16 1973 -45 2035 38 1958 22 2000 -6 2004 16 2004 16 2037 -38 2028 -47 1957 -41 1985 41 2028 -3 2014 -32 1980 -14 1960 13 2012 10 1960 -27 1983 -6 1953 8 1954 -42 1979 43 1992 -48 1976 19 1964 -11 1970 -14 2042 -10 1990 -36 1987 -8 2023 31 1959 -12 2008 -40 2033 7 2012 46 2002 -3 1992 -35 2044 17 2010 14 2018 -35 1961 26 2004 -24 2045 33 1965 -9 1970 -16 1977 40 2030 -42 2046 -30 1963 36 2019 -47 2020 -12 2026 -27 1994 21 1951 27 1999 -10 1990 36 2003 -8 1984 31 2015 -26 2015 14 1981 -20 1971 -47 2033 -4 1976 -29 2037 25 2013 33 2011 1 2000 -27 2037 31 1960 8 2048 -26 2037 -8 2039 42 1986 -38 2038 13 1984 -44 2049 -43 2012 3 1962 -39 1959 3 1979 -3 1996 -1 1983 27 1950 -43 1957 36 1951 -28 2010 44 2045 -22 2023 0 2038 37 2011 -30 2009 4 1952 47 1965 -35 2005 -35 1954 -9 2040 14 1987 -24 1978 -15 2009 22 1964 48 2003 -38 1969 -20 1983 -47 2030 13 1990 -45 2013 42 1988 -26 2017 9 2041 -43 1964 -20 2005 30 2024 25 2043 26 1993 27 2018 -41 2008 -14 2013 16 2028 44 1967 29 1973 -5 2027 -38 1954 -12 1963 -21 2008 -3 2049 -14 2022 -34 1976 -39 1976 13 2007 30 2032 -15 2007 -7 2028 -37 2012 29 2029 -7 2002 19 2046 -1 1979 0 2008 -17 1980 42 1986 28 1957 -5 1966 48 1994 43 2047 23 2024 -37 1974 -36 2022 -29 2040 -21 2004 12 1978 40 1982 -22 1984 -8 2030 6 1968 -3 1965 32 1998 -15 2039 10 2033 36 1977 36 2045 43 2045 -17 2021 38 1969 -43 2021 -7 2018 10 2008 40 2012 31 2011 28 1999 -36 1985 -18 2008 4 2040 -46 1954 33 2035 -28 1980 -3 2038 20 1959 29 1979 13 2006 8 2029 22 1962 -44 1978 37 1993 -3 1988 23 1991 39 2013 8 1955 43 1973 0 1976 -3 1963 3 2031 -15 2003 31 2002 16 1981 -44 1959 19 2023 -34 2039 4 1994 -21 1951 36 1997 11 2013 13 1950 32 2020 -12 2016 -22 2009 -38 2031 13 1986 -43 1959 28 2049 10 1954 -45 2018 -1 2008 48 2034 -41 1982 -2 1972 -11 2045 -34 1958 10 1997 31 2013 -13 2025 -19 2038 -32 2041 -21 2013 0 2034 3 2036 -23 2008 -22 2034 3 2042 41 2002 1 2043 -2 1950 
19 2041 21 2005 -16 2030 -36 2001 45 1964 33 2027 -25 2046 -5 2044 -42 1965 -37 2004 22 2029 46 1966 7 2008 -48 2016 -22 2033 -28 1999 -33 1987 11 1995 18 1969 -13 2023 9 2018 1 2015 39 2017 31 1975 44 1991 32 2045 10 2046 -35 1952 40 1950 -38 1996 -39 2031 14 2037 -48 2002 41
思路
需要排序2次:先比较年份,再在相同年份下比较温度、取最高值,也就是说需要对2个维度的值做排序。但是Hadoop只能在key上进行排序,所以气温和年份的值都得放在key里面,也就是需要创建自定义组合key。
pom依赖
<dependency> <groupId>org.apache.hadoop</groupId> <artifactId>hadoop-client</artifactId> <version>2.7.3</version> </dependency>
代码
1.自定义key
import org.apache.hadoop.io.WritableComparable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; /** 所有自定义的组合key应该实现接口WritableComparable,WritableComparable接口继承自writable和comparable这两个接 因为writable接口是可序列化的并且可比较的。WritableComparable。组合key按照年份升序按照气温降序 */ public class ComboKey implements WritableComparable<ComboKey> { private int year ; private int temp ; public int getYear() { return year; } public void setYear(int year) { this.year = year; } public int getTemp() { return temp; } public void setTemp(int temp) { this.temp = temp; } /** * 对key进行比较实现 */ public int compareTo(ComboKey o) { int y0 = o.getYear(); int t0 = o.getTemp(); if(year == y0){ //气温降序 return -(temp - t0) ; } //年份升序 else{ return year - y0 ; } } /** * 串行化过程 */ public void write(DataOutput out) throws IOException { //年份 out.writeInt(year); //气温 out.writeInt(temp); } ////反串行化的过程 public void readFields(DataInput in) throws IOException { year = in.readInt(); temp = in.readInt(); } }
2.自定义分区
import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Partitioner; //该分区类按照年份进行分区,相同的年份会进入到同一个分区中去 public class YearPartitioner extends Partitioner<ComboKey,NullWritable> { public int getPartition(ComboKey key, NullWritable nullWritable, int numPartitions) { int year = key.getYear(); return year % numPartitions; } }
3.ComboKeyComparator
import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; /** 同时完成Combokey中的first和second排序。 */ public class ComboKeyComparator extends WritableComparator { protected ComboKeyComparator() { super(ComboKey.class, true); } public int compare(WritableComparable a, WritableComparable b) { ComboKey k1 = (ComboKey) a; ComboKey k2 = (ComboKey) b; return k1.compareTo(k2); } }
4.分组函数
import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparator; /** 分组在Reduce阶段,构造一个与 Key 相对应的 Value 迭代器的时候,只要year相同就属于同一个组,放在一个Value迭代器,不同的year按照年份升序进行排序。 最后,ComboKey的year相同,但是temp不同的数据会进入一组,并且按temp降序排列。如: 2018 40 2018 38 2018 37 分组后的第一条数据就是我们需要的(也就是reduce阶段的key) */ public class YearGroupComparator extends WritableComparator { protected YearGroupComparator() { super(ComboKey.class, true); } public int compare(WritableComparable a, WritableComparable b) { ComboKey k1 = (ComboKey)a ; ComboKey k2 = (ComboKey)b ; return k1.getYear() - k2.getYear() ; } }
5.Map
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import java.io.IOException; /** * Map端,输入的(key,value)缩进长度和文本文档,输出的key是组合key,value值是空值 */ public class MaxTempMapper extends Mapper<LongWritable,Text,ComboKey,NullWritable>{ @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { System.out.println("MaxTempMapper.map"); String[] arr = value.toString().split(" "); ComboKey keyout = new ComboKey(); keyout.setYear(Integer.parseInt(arr[0])); keyout.setTemp(Integer.parseInt(arr[1])); context.write(keyout,NullWritable.get()); } }
6.reduce
/** * Reduce端,将组合key切割成key为year,value为气温的一个列表 */ public class MaxTempReducer extends Reducer<ComboKey, NullWritable, IntWritable, IntWritable>{ protected void reduce(ComboKey key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException { int year = key.getYear(); int temp = key.getTemp(); for(NullWritable v : values){ System.out.println(key.getYear() + " : " + key.getTemp()); } context.write(new IntWritable(year),new IntWritable(temp)); } }
7.APP
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import java.io.File; /** *二次排序 *求每年最高的温度,年份升序,温度求最高 * * 数据格式 2004 49 1981 -22 1981 -31 1965 -47 2017 -2 */ public class APP { public static String run_mode = "local"; //public static String run_mode = "cluster"; public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf); //本地 if(run_mode.equals("local")) { File dir = new File("c:\\out"); if (dir.exists()) { APP.delFile(dir); } conf.set("fs.defaultFS", "file:///"); //添加输入路径 FileInputFormat.addInputPath(job,new Path("C://temperature.txt")); //设置输出路径 FileOutputFormat.setOutputPath(job,new Path("C://out")); //linux集群 } else if(run_mode.equals("cluster")) { conf.set("fs.defaultFS", "hdfs://master:9000"); //添加输入路径 FileInputFormat.addInputPath(job,new Path("/temperature.txt")); //设置输出路径 FileOutputFormat.setOutputPath(job,new Path("/out")); } //设置job的各种属性 job.setJobName("WCApp"); //作业名称 job.setJarByClass(APP.class); //搜索类 //job.setInputFormatClass(FileInputFormat.class); //设置输入格式 //添加输入路径 //FileInputFormat.addInputPath(job,new Path(args[0])); //设置输出路径 //FileOutputFormat.setOutputPath(job,new Path(args[1])); job.setMapperClass(MaxTempMapper.class); //mapper类 job.setReducerClass(MaxTempReducer.class); //reducer类 job.setNumReduceTasks(1); //reduce个数 job.setMapOutputKeyClass(ComboKey.class); job.setMapOutputValueClass(NullWritable.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IntWritable.class); //设置分区类 job.setPartitionerClass(YearPartitioner.class); //设置分组对比器 job.setGroupingComparatorClass(YearGroupComparator.class); 
//设置排序对比器(好像不写也行,因为不写的话,WritableComparator的compare方法底层还是会调用自定义key ComboKey.class里的compareTo方法) job.setSortComparatorClass(ComboKeyComparator.class); job.waitForCompletion(true); } static boolean delFile(File file) { if (!file.exists()) { return false; } if (file.isDirectory()) { File[] files = file.listFiles(); for (File f : files) { delFile(f); } } return file.delete(); } }