Hadoop joins (reduce-side and map-side)

 

1. Large datasets: reduce-side join
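The classes below implement a classic reduce-side join. The mapper tags every input line with a composite ComboKey (record type, customer id, order id); CIDPartitioner routes all keys with the same customer id to the same reducer; ComboKeyComparator sorts a customer record ahead of that customer's orders; and CIDGroupComparator groups by customer id so a single reduce() call sees the customer followed by all of its orders.

The input files are not shown in the original post. Judging from how ReduceJoinMapper parses them (customer id in the first column of the customers file; order id in the first and customer id in the fourth column of the orders file), hypothetical CSV inputs might look like this (the column names are only illustrative):

customers.txt (cid,name,age):
1,tom,12
2,tomas,13

orders.txt (oid,orderno,price,cid):
1,no001,12.23,1
2,no002,12.34,2
3,no003,12.45,3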

// App.java (reduce-side join job driver)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Reduce-side join driver: runs in local mode against the local file system.
 */
public class App {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");

        Job job = Job.getInstance(conf);
        job.setJobName("ReduceJoinApp");
        job.setJarByClass(App.class);

        // input directory holding both the customers and the orders file; the output directory must not exist yet
        FileInputFormat.addInputPath(job, new Path("d:/mr/join"));
        FileOutputFormat.setOutputPath(job, new Path("d:/mr/join/out"));

        // mapper and reducer
        job.setMapperClass(ReduceJoinMapper.class);
        job.setReducerClass(ReduceJoinReducer.class);

        job.setMapOutputKeyClass(ComboKey.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // partitioner: route by customer id
        job.setPartitionerClass(CIDPartitioner.class);
        // sort comparator: customer record before its orders
        job.setSortComparatorClass(ComboKeyComparator.class);
        // grouping comparator: all records of one customer id in one reduce call
        job.setGroupingComparatorClass(CIDGroupComparator.class);

        job.waitForCompletion(true);
    }
}
// ComboKey.java
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Composite key: record type, customer id and order id. Sorts so that for a
 * given customer id the customer record comes before all of its order records.
 */
public class ComboKey implements WritableComparable<ComboKey> {
    // 0 = customer, 1 = order
    public int type;
    // customer id
    public int cid;
    // order id (only set for order records)
    public int oid;

    public int compareTo(ComboKey o) {
        int type0 = o.type;
        int cid0 = o.cid;
        int oid0 = o.oid;
        // this key is a customer record
        if (type == 0) {
            // the other key is also a customer: compare by cid
            if (type0 == 0) {
                return cid - cid0;
            }
            // the other key is an order: for the same cid the customer sorts first
            else {
                if (cid == cid0) {
                    return -1;
                } else {
                    return cid - cid0;
                }
            }
        }
        // this key is an order record
        else {
            // the other key is a customer: for the same cid the customer sorts first
            if (type0 == 0) {
                if (cid == cid0) {
                    return 1;
                } else {
                    return cid - cid0;
                }
            }
            // both keys are orders: compare by cid, then by oid
            else {
                if (cid == cid0) {
                    return oid - oid0;
                } else {
                    return cid - cid0;
                }
            }
        }
    }

    public void write(DataOutput out) throws IOException {
        out.writeInt(type);
        out.writeInt(cid);
        out.writeInt(oid);
    }

    public void readFields(DataInput in) throws IOException {
        this.type = in.readInt();
        this.cid = in.readInt();
        this.oid = in.readInt();
    }
}
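As a concrete illustration of this ordering (using the hypothetical sample data above): the keys (type=0, cid=1), (type=1, cid=1, oid=1), (type=1, cid=1, oid=2) and (type=0, cid=2) sort exactly in that sequence, so within each customer id the customer key always comes first and the order keys follow, sorted by oid. The comparator, partitioner and grouping classes below build on this ordering.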
// ComboKeyComparator.java
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Sort comparator: delegates to ComboKey.compareTo().
 */
public class ComboKeyComparator extends WritableComparator {
    protected ComboKeyComparator() {
        super(ComboKey.class, true);
    }

    public int compare(WritableComparable w1, WritableComparable w2) {
        ComboKey k1 = (ComboKey) w1;
        ComboKey k2 = (ComboKey) w2;
        return k1.compareTo(k2);
    }
}
// CIDPartitioner.java
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitioner: all records with the same customer id go to the same reducer.
 */
public class CIDPartitioner extends Partitioner<ComboKey, Text> {
    public int getPartition(ComboKey key, Text text, int numPartitions) {
        return key.cid % numPartitions;
    }
}
// CIDGroupComparator.java
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Grouping comparator: keys with the same customer id fall into one reduce()
 * call, regardless of record type or order id.
 */
public class CIDGroupComparator extends WritableComparator {
    protected CIDGroupComparator() {
        super(ComboKey.class, true);
    }

    public int compare(WritableComparable w1, WritableComparable w2) {
        ComboKey k1 = (ComboKey) w1;
        ComboKey k2 = (ComboKey) w2;
        return k1.cid - k2.cid;
    }
}
// ReduceJoinMapper.java
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * Mapper: tags each line with a ComboKey, using the file name to tell customer
 * lines from order lines.
 */
public class ReduceJoinMapper extends Mapper<LongWritable, Text, ComboKey, Text> {
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(",");
        FileSplit split = (FileSplit) context.getInputSplit();
        ComboKey k = new ComboKey();
        // customers file: customer id is the first column
        if (split.getPath().toString().contains("customers")) {
            k.type = 0;
            k.cid = Integer.parseInt(arr[0]);
        }
        // orders file: customer id is the fourth column, order id the first
        else {
            k.type = 1;
            k.cid = Integer.parseInt(arr[3]);
            k.oid = Integer.parseInt(arr[0]);
        }
        context.write(k, value);
    }
}
// ReduceJoinReducer.java
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

/**
 * Reducer: for each customer id the customer record (if present) arrives first,
 * followed by that customer's orders.
 */
public class ReduceJoinReducer extends Reducer<ComboKey, Text, Text, NullWritable> {

    protected void reduce(ComboKey key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        Iterator<Text> it = values.iterator();
        int type = key.type;
        String custInfo = null;
        // the group starts with a customer record: emit it, then one joined line per order
        if (type == 0) {
            Text custText = it.next();
            custInfo = custText.toString();
            context.write(new Text(custInfo), NullWritable.get());
            while (it.hasNext()) {
                String orderInfo = it.next().toString();
                context.write(new Text(custInfo + "," + orderInfo), NullWritable.get());
            }
        }
        // orders with no matching customer record
        else {
            while (it.hasNext()) {
                String orderInfo = it.next().toString();
                context.write(new Text("null," + orderInfo), NullWritable.get());
            }
        }
    }
}

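With the hypothetical inputs above, the job would emit one bare line per customer, one joined line per matched order, and a "null,"-prefixed line for every order whose customer id has no customer record, roughly like this (illustrative only):

1,tom,12
1,tom,12,1,no001,12.23,1
2,tomas,13
2,tomas,13,2,no002,12.34,2
null,3,no003,12.45,3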

2. Small dataset that fits in memory: map-side join
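Here the customers dataset is assumed to be small enough to fit in each mapper's memory, so the join happens entirely on the map side: the driver puts the customers file path into the job Configuration, every mapper loads the whole file into a HashMap keyed by customer id in setup(), and map() looks up the customer for each order line and emits the joined record directly. With setNumReduceTasks(0) the job is map-only, so there is no shuffle or reduce phase at all.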

// App.java (map-side join job driver)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Map-side join driver: the small customers file is handed to the mappers via
 * the job Configuration; the job is map-only.
 */
public class App {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");
        // path of the small dataset that every mapper loads into memory
        conf.set("customers.data.path", "file:///d:/mr/customers.txt");
        // create the job
        Job job = Job.getInstance(conf);
        job.setJobName("MapJoin");
        job.setJarByClass(App.class);

        // add the input path (may be called multiple times)
        FileInputFormat.addInputPath(job, new Path("d:/mr/orders.txt"));
        // set the output path: exactly one, and the directory must not exist yet
        FileOutputFormat.setOutputPath(job, new Path("d:/mr/out"));

        // set the mapper class; no reducers, so this is a map-only job
        job.setMapperClass(JoinMapper.class);
        job.setNumReduceTasks(0);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.waitForCompletion(true);
    }
}
// JoinMapper.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

/**
 * Map-side join: loads all customers into memory in setup(), then joins every
 * order line against that in-memory map.
 */
public class JoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    // all customers, keyed by customer id
    private Map<String, String> allCustomers = new HashMap<String, String>();

    /**
     * Read the customers data into memory.
     */
    protected void setup(Context context) throws IOException, InterruptedException {
        // open the customers file on the configured file system
        Configuration conf = context.getConfiguration();
        FileSystem fs = FileSystem.get(conf);
        String path = conf.get("customers.data.path");
        FSDataInputStream in = fs.open(new Path(path));
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        String line = null;
        while ((line = reader.readLine()) != null) {
            String cid = line.split(",")[0];
            allCustomers.put(cid, line);
        }
        reader.close();
    }

    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split(",");
        // customer id is the fourth column of an order line
        String cid = arr[3];
        String custInfo = allCustomers.get(cid);
        context.write(new Text(custInfo + "," + line), NullWritable.get());
    }
}

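The driver above reads the small file from a path stored in the Configuration, which works fine in local mode. When running on a cluster, one common variant (not part of the original post, only a hedged sketch) is to ship the small file with the job via the distributed cache and read it back in setup(); the rest of JoinMapper stays unchanged:

// Driver side (hypothetical HDFS path; needs import java.net.URI):
//     job.addCacheFile(new URI("hdfs:///data/mr/customers.txt"));

// Mapper side, as an alternative to the setup() above:
protected void setup(Context context) throws IOException, InterruptedException {
    // files registered with Job.addCacheFile() are exposed here
    URI[] cacheFiles = context.getCacheFiles();
    Path customersPath = new Path(cacheFiles[0]);
    FileSystem fs = customersPath.getFileSystem(context.getConfiguration());
    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(customersPath)));
    String line;
    while ((line = reader.readLine()) != null) {
        // key the in-memory map by customer id (first column)
        allCustomers.put(line.split(",")[0], line);
    }
    reader.close();
}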
 
