Hadoop in practice

The first code block below is a simple read and write of a text file on HDFS.
Keys arrive at the reducers in Hadoop's built-in sort order; the values that share a key are not ordered, so they can be sorted inside reduce.
A custom partitioner can assign key ranges to the reducers in order, which prevents any single reducer from receiving too much data.
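To make the partitioning idea concrete before the full listings: each reducer owns one range between consecutive split points, and a record goes to the reducer whose range contains its value. A minimal self-contained sketch (the split points and test values here are made up for illustration, they are not taken from the job below):

// Toy illustration of range partitioning: reducer i handles values v with SPLITS[i] <= v < SPLITS[i+1].
public class RangePartitionSketch {
    static final int[] SPLITS = {0, 120, 340, 900, 65535}; // hypothetical split points

    static int bucketOf(int v) {
        int bucket = SPLITS.length - 2;                     // fall back to the last range
        for (int i = 0; i + 1 < SPLITS.length; i++) {
            if (v >= SPLITS[i] && v < SPLITS[i + 1]) { bucket = i; break; }
        }
        return bucket;
    }

    public static void main(String[] args) {
        System.out.println(bucketOf(57));   // -> 0
        System.out.println(bucketOf(500));  // -> 2
    }
}

The full HDFS read/write example and the MapReduce job with the real partitioner follow.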
package com.company;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;


public class Main {
    public static void main(String[] args) throws Exception {
        try {
            Configuration config = new Configuration();
            FileSystem hdfs = FileSystem.get(config);
            FSDataInputStream hdfsInStream = hdfs.open(new Path("input"));
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfsInStream, "utf-8"));
            FSDataOutputStream os = hdfs.create(new Path("output"));

            // Read the file line by line, split each line on TAB and echo the fields.
            String line = br.readLine();
            while (line != null) {
                String sp[] = line.split("\t", -1);
                for (int i = 0; i < sp.length; i++) {
                    System.out.print(sp[i] + "   ");
                }
                System.out.println("");
                line = br.readLine();
            }
            br.close();
            hdfsInStream.close();

            // Write a string back to HDFS.
            os.write("outputtext".getBytes("UTF-8"));
            os.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

import java.io.IOException;
import java.util.*;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public class WordCount {

  public static class Map extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input line is "label uid\1ip" TAB "num" ('\1' is the 0x01 field separator).
        String[] parts = value.toString().split("\t", -1);
        if (parts.length != 2)
            return;
        String Label = "";
        String Ip = "";
        String Num = parts[1];
        // Zero-pad the count to 9 digits so that string comparison matches numeric order.
        int length = Num.length();
        int ned = 9 - length;
        while (ned-- > 0) {
            Num = "0" + Num;
        }
        String Uid = "";
        int dex1 = parts[0].indexOf(' ');
        int dex2 = parts[0].indexOf('\1');
        if (dex1 < 0 || dex2 < 0)   // skip malformed records
            return;
        Label = parts[0].substring(0, dex1);
        Ip = parts[0].substring(dex2 + 1);
        Uid = parts[0].substring(dex1 + 1, dex2);
        Text Word = new Text(Uid + " " + Label);
        Text Val = new Text(Num);
        context.write(Word, Val);
    }
  }

  public static class KeyPartitioner extends Partitioner<Text, Text> implements Configurable {
    // partition[0..10] holds the split points; partition[0] = 0 and partition[10] = 65535 are sentinels.
    int partition[] = new int[15];

    @Override
    public void setConf(Configuration config) { // from Configurable; runs once, before partitioning starts
        try {
            FileSystem hdfs = FileSystem.get(config);
            FSDataInputStream hdfsInStream = hdfs.open(new Path("input")); // file on HDFS
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfsInStream, "utf-8")); // read the split points

            // Each line is "value index"; store partition[index] = value.
            String line = br.readLine();
            while (line != null) {
                String sp[] = line.split(" ", -1);
                int x = Integer.parseInt(sp[0]);
                int y = Integer.parseInt(sp[1]);
                partition[y] = x;
                line = br.readLine();
            }
            partition[0] = 0;
            partition[10] = 65535;
            br.close();
            hdfsInStream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public Configuration getConf() {
        return null;
    }

    /* Old-API (JobConf) version of the same initialization, kept for reference:
    public void configure(JobConf config) {
        try {
            FileSystem hdfs = FileSystem.get(config);
            FSDataInputStream hdfsInStream = hdfs.open(new Path("/group/lwantispam/shenneng.ysn/operator/data"));
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfsInStream, "utf-8"));

            String line = br.readLine();
            while (line != null) {
                String sp[] = line.split(" ", -1);
                int x = Integer.parseInt(sp[0]);
                int y = Integer.parseInt(sp[1]);
                partition[y] = x;
                line = br.readLine();
            }
            partition[0] = 0;
            partition[10] = 65535;
            br.close();
            hdfsInStream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }*/

    public int low(int k) { // lower bound: first index whose split point is >= k
        int l = 0, r = 10;
        while (l <= r) {
            int mid = (l + r) >> 1;
            if (partition[mid] < k) {
                l = mid + 1;
            } else {
                r = mid - 1;
            }
        }
        return r + 1;
    }

    public int up(int k) { // upper bound: last index whose split point is <= k
        int l = 0, r = 10;
        while (l <= r) {
            int mid = (l + r) >> 1;
            if (partition[mid] > k) {
                r = mid - 1;
            } else {
                l = mid + 1;
            }
        }
        return l - 1;
    }

    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        int keynumber = Integer.parseInt(value.toString());
        int left = low(keynumber);
        int right = up(keynumber);
        int len = right - left + 1;
        if (len > 1) {
            // The value equals several identical split points: spread it randomly over those reducers.
            int part = (int) (Math.random() * 1000) % len + left - 1;
            return part;
        } else {
            return left - 1;
        }
    }
  }

  public static class Reduce extends Reducer<Text, Text, Text, Text> {

    @Override                                                          // guards against mistyping the override
    public void reduce(Text key, Iterable<Text> values, Context context) // signature must match Reducer
      throws IOException, InterruptedException {
        // Collect all counts for this key and sort them numerically.
        List<Integer> tips = new ArrayList<Integer>();
        for (Text value : values) {
            int tmp = Integer.parseInt(value.toString());
            tips.add(tmp);
        }
        Collections.sort(tips);
        for (int i = 0; i < tips.size(); i++) {
            System.out.print(String.valueOf(tips.get(i)) + ";"); // printed to the task logs on the cluster
            context.write(key, new Text(String.valueOf(tips.get(i))));
        }
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    Job job = new Job(conf, "wordcount");
    job.setJarByClass(WordCount.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setNumReduceTasks(10);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setPartitionerClass(KeyPartitioner.class);

    FileInputFormat.setInputPaths(job, new Path("datainfile"));
    FileOutputFormat.setOutputPath(job, new Path("dataoutfile"));

    job.waitForCompletion(true);
  }

}

Finally, the overall pipeline:

many log records of the form "url 1"
  --> word count --> "url sum" (the total count per URL)
"sum 1" records (one per URL, keyed by its total)
  --> word count --> "sum count" (how many URLs share each total; a sketch of this second pass appears after the snippet below)
  --> compute the partition points from that histogram, using the Go class below:

// Imports added so the snippet compiles.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

class Go {
    // item[x] = number of URLs whose total count is x; total = number of URLs seen.
    int[] item = new int[10009];
    int total = 0;

    public void add(int x, int y) {
        item[x] = y;
        total += y;
    }

    // Walk the histogram and emit a split point each time another tenth of the URLs has been covered.
    // Each output line is "value index", the format the KeyPartitioner above expects.
    public String print() {
        int pre = 0;
        int step = total / 10 + 1;
        int st = 1;
        String out = "";
        for (int i = 1; i < 10000; i++) {
            pre += item[i];
            while (st * step <= pre) {
                out += i + " " + st + "\n";
                st++;
            }
        }
        return out;
    }
}

// The original snippet was free-standing; a main() wrapper is added here so it compiles.
public class PartitionPointMain {
    public static void main(String[] args) throws Exception {
        Go getPartition = new Go();
        try {
            Configuration config = new Configuration();
            FileSystem hdfs = FileSystem.get(config);
            FSDataInputStream hdfsInStream = hdfs.open(new Path("input"));
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfsInStream, "utf-8"));
            FSDataOutputStream os = hdfs.create(new Path("output"));

            // Each input line is "sum<TAB>count" from the histogram word count.
            String line = br.readLine();
            while (line != null) {
                String sp[] = line.split("\t", -1);
                int x = Integer.parseInt(sp[0]);
                int y = Integer.parseInt(sp[1]);
                getPartition.add(x, y);
                line = br.readLine();
            }
            br.close();
            hdfsInStream.close();

            String out = getPartition.print();
            os.write(out.getBytes("UTF-8"));
            os.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
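The snippet above expects each input line to be "sum<TAB>count". For completeness, here is a minimal sketch of the second word-count pass assumed to produce that histogram; the class name and the paths (urlsums, sumhistogram) are illustrative, not from the original post:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical second word-count pass: for every "url<TAB>sum" record, emit (sum, 1)
// and add the ones up, yielding "sum<TAB>count" for the partition-point snippet above.
public class SumHistogram {

    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable ONE = new IntWritable(1);
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] parts = value.toString().split("\t", -1);
            if (parts.length != 2) return;          // skip malformed lines
            context.write(new Text(parts[1]), ONE); // key = the per-URL sum
        }
    }

    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int count = 0;
            for (IntWritable v : values) count += v.get();
            context.write(key, new IntWritable(count)); // written as "sum<TAB>count"
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration(), "sum-histogram");
        job.setJarByClass(SumHistogram.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("urlsums"));       // output of the first pass (illustrative path)
        FileOutputFormat.setOutputPath(job, new Path("sumhistogram")); // illustrative path
        job.waitForCompletion(true);
    }
}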

 

The main job then takes the "url sum" records:
  --> partition by sum (using the split points)
  --> sort inside each reducer
  --> URLs and their sums come out in global order.
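An aside that is not in the original post: newer Hadoop releases ship TotalOrderPartitioner and InputSampler, which automate this kind of globally ordered output when the field you range-partition on is the map output key. A rough sketch, assuming Hadoop 2.x, input records already rewritten as "sum<TAB>url" (so that KeyValueTextInputFormat yields the sum as the key), and illustrative path names:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

// Sketch only: globally sorted output via Hadoop's built-in total-order machinery.
public class TotalOrderSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "total-order-sort");
        job.setJarByClass(TotalOrderSketch.class);
        job.setNumReduceTasks(10);

        // KeyValueTextInputFormat splits each line on the first TAB, so the sum becomes the key.
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setMapperClass(Mapper.class);   // identity map
        job.setReducerClass(Reducer.class); // identity reduce
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path("sumfirst")); // illustrative path
        FileOutputFormat.setOutputPath(job, new Path("sorted"));  // illustrative path

        // Sample the input keys, write split points for the 10 reducers,
        // and let TotalOrderPartitioner route each key to its range.
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("partition.lst"));
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10));
        job.setPartitionerClass(TotalOrderPartitioner.class);

        job.waitForCompletion(true);
    }
}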

**The code for these jobs is at the top of the post.**
