Hadoop in Practice
The first program below reads and writes a text file on HDFS.
In the MapReduce job that follows it, the keys are sorted by Hadoop's built-in ordering; the values that share a key are not, so sorting them can be done inside the reduce.
The partitioner can assign key ranges to the reducers in order, which keeps any single reducer from receiving too much data.
package com.company;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class Main {
    public static void main(String[] args) throws Exception {
        try {
            Configuration config = new Configuration();
            FileSystem hdfs = FileSystem.get(config);
            FSDataInputStream hdfsInStream = hdfs.open(new Path("input"));
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfsInStream, "utf-8"));
            FSDataOutputStream os = hdfs.create(new Path("output"));

            // read the input file line by line and print its tab-separated fields
            String line = br.readLine();
            while (line != null) {
                String sp[] = line.split("\t", -1);
                for (int i = 0; i < sp.length; i++) {
                    System.out.print(sp[i] + " ");
                }
                System.out.println("");
                line = br.readLine();
            }
            br.close();
            hdfsInStream.close();

            // write a string back to HDFS
            os.write("outputtext".getBytes("UTF-8"));
            os.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}


import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;

import org.apache.hadoop.conf.*;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCount {

    public static class Map extends Mapper<LongWritable, Text, Text, Text> {

        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] parts = value.toString().split("\t", -1);
            if (parts.length != 2)
                return;

            // left-pad the count with zeros to a fixed width of 9 digits
            String num = parts[1];
            int ned = 9 - num.length();
            while (ned-- > 0) {
                num = "0" + num;
            }

            // parts[0] has the form "label uid\1ip": label before the first space, ip after the '\1'
            int dex1 = parts[0].indexOf(' ');
            int dex2 = parts[0].indexOf('\1');
            String label = parts[0].substring(0, dex1);
            String ip = parts[0].substring(dex2 + 1);
            String uid = parts[0].substring(dex1 + 1, dex2);

            context.write(new Text(uid + " " + label), new Text(num));
        }
    }

    public static class KeyPartitioner extends Partitioner<Text, Text> implements Configurable {
        int partition[] = new int[15];
        private Configuration conf;

        @Override
        public void setConf(Configuration config) { // from Configurable: runs once, before partitioning starts
            this.conf = config;
            try {
                FileSystem hdfs = FileSystem.get(config);
                FSDataInputStream hdfsInStream = hdfs.open(new Path("input")); // the split-point file on HDFS
                BufferedReader br = new BufferedReader(new InputStreamReader(hdfsInStream, "utf-8"));

                // each line is "value bucket"; fill partition[bucket] = value
                String line = br.readLine();
                while (line != null) {
                    String sp[] = line.split(" ", -1);
                    int x = Integer.parseInt(sp[0]);
                    int y = Integer.parseInt(sp[1]);
                    partition[y] = x;
                    line = br.readLine();
                }
                partition[0] = 0;
                partition[10] = 65535;
                br.close();
                hdfsInStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        @Override
        public Configuration getConf() {
            return conf;
        }

        public int low(int k) { // lower bound: index of the first split point >= k
            int l = 0, r = 10;
            while (l <= r) {
                int mid = (l + r) >> 1;
                if (partition[mid] < k) {
                    l = mid + 1;
                } else {
                    r = mid - 1;
                }
            }
            return r + 1;
        }

        public int up(int k) { // upper bound: index of the last split point <= k
            int l = 0, r = 10;
            while (l <= r) {
                int mid = (l + r) >> 1;
                if (partition[mid] > k) {
                    r = mid - 1;
                } else {
                    l = mid + 1;
                }
            }
            return l - 1;
        }

        @Override
        public int getPartition(Text key, Text value, int numPartitions) {
            int keynumber = Integer.parseInt(value.toString());
            int left = low(keynumber);
            int right = up(keynumber);
            int len = right - left + 1;
            if (len > 1) {
                // the value matches several equal split points: pick one of those buckets at random to spread the load
                return (int) (Math.random() * 1000) % len + left - 1;
            } else {
                return left - 1;
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {

        @Override // @Override guards against getting the signature wrong
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // collect all values for this key and sort them numerically
            List<Integer> tips = new ArrayList<Integer>();
            for (Text value : values) {
                tips.add(Integer.parseInt(value.toString()));
            }
            Collections.sort(tips);
            for (int i = 0; i < tips.size(); i++) {
                System.out.print(String.valueOf(tips.get(i)) + ";"); // goes to the task's stdout log on the cluster
                context.write(key, new Text(String.valueOf(tips.get(i))));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = new Job(conf, "wordcount");
        job.setJarByClass(WordCount.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);

        job.setNumReduceTasks(10);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setPartitionerClass(KeyPartitioner.class);

        FileInputFormat.setInputPaths(job, new Path("datainfile"));
        FileOutputFormat.setOutputPath(job, new Path("dataoutfile"));

        job.waitForCompletion(true);
    }
}
Finally, putting the pieces together. The raw logs contain many records of the form:
url 1 ...
A first wordcount pass turns these into:
url sum          (the total count for each url)
Each sum is then emitted again as "sum 1 ...", and a second wordcount pass gives:
sum Sum          (how many records share each sum value)
A minimal sketch of this summing wordcount pass follows.
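Both passes are the standard summing wordcount pattern. Below is a minimal sketch, assuming tab-separated input lines of the form "key<TAB>count" (with url or sum as the key); the SumCount class name and the sumin/sumout paths are illustrative, not part of the original code:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SumCount {

    public static class SumMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // each input line is "key<TAB>count"; emit (key, count)
            String[] parts = value.toString().split("\t", -1);
            if (parts.length != 2)
                return;
            context.write(new Text(parts[0]), new LongWritable(Long.parseLong(parts[1])));
        }
    }

    public static class SumReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        public void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long sum = 0;
            for (LongWritable v : values) {
                sum += v.get();                       // add up all counts for this key
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "sumcount");
        job.setJarByClass(SumCount.class);
        job.setMapperClass(SumMapper.class);
        job.setCombinerClass(SumReducer.class);       // summing is associative, so the reducer doubles as a combiner
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path("sumin"));    // illustrative paths
        FileOutputFormat.setOutputPath(job, new Path("sumout"));
        job.waitForCompletion(true);
    }
}

Run it once on the raw "url 1" records to get "url sum", then once more on the "sum 1" records to get "sum Sum".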
Then get the partition points from the "sum Sum" output --->
class Go {
    int[] item = new int[10009];
    int total = 0;

    public void add(int x, int y) {
        // x is a sum value, y is how many records have that sum
        item[x] = y;
        total += y;
    }

    public String print() {
        // walk the histogram and emit a split point at every decile of the running total
        int pre = 0;
        int step = total / 10 + 1;
        int st = 1;
        String out = "";
        for (int i = 1; i < 10000; i++) {
            pre += item[i];
            while (st * step <= pre) {
                out += i + " " + st + "\n";   // "value bucket" pair
                st++;
            }
        }
        return out;
    }
}


// driver fragment (uses the same HDFS imports as the first program above)
Go getPartition = new Go();
try {
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(config);
    FSDataInputStream hdfsInStream = hdfs.open(new Path("input"));
    BufferedReader br = new BufferedReader(new InputStreamReader(
            hdfsInStream, "utf-8"));
    FSDataOutputStream os = hdfs.create(new Path("output"));

    // feed every "sum Sum" line into the histogram
    String line = br.readLine();
    while (line != null) {
        String sp[] = line.split("\t", -1);
        int x = Integer.parseInt(sp[0]);
        int y = Integer.parseInt(sp[1]);
        getPartition.add(x, y);
        line = br.readLine();
    }
    br.close();
    hdfsInStream.close();

    // write the split points back to HDFS
    String out = getPartition.print();
    os.write(out.getBytes("UTF-8"));
    os.close();
} catch (IOException e) {
    e.printStackTrace();
}
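The file written here is a list of "value bucket" pairs, one per decile of the running total. This is the split-point file that KeyPartitioner.setConf reads back into its partition[] array, so that each of the 10 reducers in the final job receives roughly a tenth of the records.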
The last job then takes the "url sum" data as input:
url sum ---> partition on sum ---> each reduce sorts its values ---> urls and sums in sorted order.
The code for this job is the WordCount class above.