MapReduce_dedup
package MapReduce;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * MapReduce job that removes duplicate lines from the files under {@link #INPUT_PATH}.
 *
 * <p>The mapper emits every input line as a key with an empty value. The reducer
 * (also registered as the combiner) writes each key it receives exactly once,
 * ignoring the grouped values, so each distinct line is written once per reduce call.
 */
public class Dedup {
    private static final String INPUT_PATH = "hdfs://h201:9000/user/hadoop/input";
    private static final String OUTPUT_PATH = "hdfs://h201:9000/user/hadoop/output";

    /** Copies each input line (the map input value) to the output key, with an empty value. */
    public static class Map extends Mapper<Object, Text, Text, Text> {
        // Reused for every record instead of allocating a new Text per call.
        private final Text empty = new Text("");

        /**
         * Emits the input line as the output key.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of input text
         * @param context task context used to emit the (line, "") pair
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(value, empty);
        }
    }

    /** Writes each received key once, discarding its values; this is the de-duplication step. */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        // Reused for every key instead of allocating a new Text per call.
        private final Text empty = new Text("");

        /**
         * Emits the key once regardless of how many values were grouped under it.
         *
         * @param key     a distinct input line
         * @param values  the values grouped under the key (ignored)
         * @param context task context used to emit the (line, "") pair
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, empty);
        }
    }

    /**
     * Configures and runs the de-duplication job, then exits the JVM with status 0
     * when the job succeeds and 1 otherwise.
     *
     * @param args command-line arguments (unused; paths are fixed constants)
     * @throws Exception if file system access or job submission fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // The original author flagged this setting as essential.
        // NOTE(review): presumably it points the job at the jar to ship to the cluster — confirm.
        conf.set("mapred.jar", "Dedup.jar");

        // The output path must not exist when the job starts, so remove any previous run's output.
        final FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);
        fileSystem.delete(new Path(OUTPUT_PATH), true);

        // Name the job after this class (the original referenced an unrelated WordCountApp class).
        final Job job = Job.getInstance(conf, Dedup.class.getSimpleName());
        job.setJarByClass(Dedup.class);

        // Input directory.
        FileInputFormat.setInputPaths(job, INPUT_PATH);

        // Map, combine and reduce classes; the reducer doubles as the combiner.
        job.setMapperClass(Map.class);
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);

        // Output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Output directory.
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
结果
[hadoop@h201 Dedup]$ hadoop fs -cat /user/hadoop/output/part-r-00000
18/03/18 22:02:59 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
ayi mama
ayi shushu
cai wen wei
didi mama
gege jiejie didi
hello baba
hello mama
hello word
jiejie hello
mama baba jiejie gege
mama jiejie
meimei jiejie
原始数据
[hadoop@h201 Dedup]$ hadoop fs -cat /user/hadoop/input/counttext1.txt
18/03/18 22:07:08 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
hello baba
hello mama
mama jiejie
jiejie hello
[hadoop@h201 Dedup]$ hadoop fs -cat /user/hadoop/input/counttext.txt
18/03/18 22:07:18 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
hello mama
hello baba
hello word
cai wen wei
mama baba jiejie gege
gege jiejie didi
meimei jiejie
didi mama
ayi shushu
ayi mama
hello mama
hello baba
hello word
cai wen wei
mama baba jiejie gege
gege jiejie didi
meimei jiejie
didi mama
ayi shushu
ayi mama
hello mama
hello baba
hello word
cai wen wei
mama baba jiejie gege
gege jiejie didi
meimei jiejie
didi mama
ayi shushu
ayi mama
hello mama
hello baba
hello word
cai wen wei
mama baba jiejie gege
gege jiejie didi
meimei jiejie
didi mama
ayi shushu
ayi mama
hello mama
hello baba
hello word
cai wen wei
mama baba jiejie gege
gege jiejie didi
meimei jiejie
didi mama
ayi shushu
ayi mama
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 理解Rust引用及其生命周期标识(上)
· 浏览器原生「磁吸」效果!Anchor Positioning 锚点定位神器解析
· 没有源码,如何修改代码逻辑?
· 一个奇形怪状的面试题:Bean中的CHM要不要加volatile?
· [.NET]调用本地 Deepseek 模型
· .NET 10 首个预览版发布,跨平台开发与性能全面提升
· 全程使用 AI 从 0 到 1 写了个小工具
· 快收藏!一个技巧从此不再搞混缓存穿透和缓存击穿
· AI 插件第二弹,更强更好用
· Blazor Hybrid适配到HarmonyOS系统