WordCount: using the built-in Hadoop jar and implementing it in Java
Option 1: run the WordCount example that ships with Hadoop directly from its examples jar.
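A minimal sketch of that invocation, assuming Hadoop 2.7.6 installed under /usr/local/soft/hadoop-2.7.6 (the install path and the /wordcount input directory are taken from later in this post; the /output directory is an example and must not already exist):

hadoop jar /usr/local/soft/hadoop-2.7.6/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.6.jar wordcount /wordcount /output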
Option 2: implement WordCount in Java.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Demo1 {

    // Map class
    // The first k/v type pair of the generics describes the input format,
    // the second k/v pair describes the output format.
    public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Input is read one line at a time; the LongWritable key comes first because it is the byte offset of the line
            String line = value.toString();
            // Emit the line content together with a count of 1, converting both to Writable types
            // (here the whole line is treated as one word; the later demos split the line)
            context.write(new Text(line), new LongWritable(1));
        }
    }

    // Reduce class
    // Receives the data emitted by the map side
    public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        /**
         * The reduce method is called once per key.
         * By default there is a single reduce task.
         * key: one word
         * values: all values emitted on the map side for this key
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0;
            // After the shuffle, the values for one key look like (key, 1, 1, 1, 1), so iterate over them
            for (LongWritable value : values) {
                // LongWritable wraps a long; get() returns the primitive value
                sum += value.get();
            }
            // Wrap the long sum back into a LongWritable before writing it out
            context.write(key, new LongWritable(sum));
        }
    }

    // MapReduce program entry point
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Create a job
        Job job = Job.getInstance();
        // Name the job
        job.setJobName("first run submitted from my own jar");
        // Specify the class that contains this main method
        job.setJarByClass(Demo1.class);
        // Specify the map class
        job.setMapperClass(map.class);
        // Specify the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Specify the reduce class
        job.setReducerClass(reduce.class);
        // Specify the reduce output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Specify the input path (an HDFS path)
        Path in = new Path("/wordcount");
        FileInputFormat.addInputPath(job, in);
        // Specify the output path
        Path out = new Path("/output1");
        // If the output path already exists, delete it
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);   // true allows deleting a non-empty directory
        }
        FileOutputFormat.setOutputPath(job, out);
        // Start the job
        job.waitForCompletion(true);
        /**
         * Submitting the job:
         * 1. Package the project with Maven's package goal and upload it to the server.
         * 2. Run it, e.g.: hadoop jar hadoop-mapreduce-examples-2.7.6.jar com.shujia.hadoop.Demo01WordCount /word /output
         */
        System.out.println("wordcount finished successfully");
    }
}
After finishing the code, package the project (with Maven's package goal) and upload the resulting jar via Xftp to
/usr/local/soft/hadoop-2.7.6/share/hadoop/mapreduce
Then run the jar on the cluster with the hadoop jar command.
The fully qualified class name to pass is the class containing the main method; its package path can be looked up in IDEA.
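For example, assuming the Maven build produced a jar named hadoop-demo-1.0.jar and Demo1 lives in the com.shujia.hadoop package (the jar name is a placeholder and the package is taken from the comment in the code above; adjust both to your project):

hadoop jar hadoop-demo-1.0.jar com.shujia.hadoop.Demo1

Demo1 hardcodes its input (/wordcount) and output (/output1) paths in main, so no extra arguments are needed.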
Splitting each line on commas
Only the map stage needs to change:
public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split each line on commas and emit every field with a count of 1
        String s = value.toString();
        String[] split = s.split(",");
        for (String s1 : split) {
            context.write(new Text(s1), new LongWritable(1));
        }
    }
}
Summing the ages per class (clazz) in students.txt; only the map is shown here:
public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String s = value.toString();
        String[] split = s.split(",");
        // Field 2 is the age, field 4 is the class (clazz)
        String s1 = split[2];
        LongWritable age = new LongWritable(Integer.valueOf(s1));
        String s2 = split[4];
        Text clazz = new Text(s2);
        // Emit (clazz, age); the reduce side sums the ages per class
        context.write(clazz, age);
    }
}
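The summing reducer from Demo1 can be reused as-is, since the shuffle now groups the ages by class. For completeness, a sketch of that reducer (assumed; it is not part of the original listing):

public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long sum = 0;
        // key is the class name; values are the ages of every student in that class
        for (LongWritable value : values) {
            sum += value.get();
        }
        // Emit (class, total age)
        context.write(key, new LongWritable(sum));
    }
}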
Counting the number of male and female students in students.txt
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo4 {

    public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Field 3 is the gender; emit (gender, 1)
            String s = value.toString().split(",")[3];
            context.write(new Text(s), new LongWritable(1));
        }
    }

    public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            // Sum the 1s for each gender
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("count of male and female students");
        job.setJarByClass(Demo4.class);
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        Path in = new Path("/data/students.txt");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output4");
        // Delete the output directory if it already exists
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        job.waitForCompletion(true);
        System.out.println("demo 4 finished");
    }
}
Filtering all records of male students out of students.txt. There is no reduce stage because no aggregation is needed.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo5 {

    public static class map extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Field 3 is the gender; keep only male ("男") records and emit the whole line as the key
            String s = value.toString().split(",")[3];
            if (s.equals("男")) {
                context.write(value, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("filter male students, no reduce");
        job.setJarByClass(Demo5.class);
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        Path in = new Path("/data/students.txt");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output5");
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        job.waitForCompletion(true);
    }
}
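One caveat (not addressed in the original code): even if no reducer class is set, Hadoop still runs a single default identity reduce task, so the records pass through an unnecessary shuffle. To make the job truly map-only, the reduce task count can be set to zero in main, e.g.:

// With 0 reduce tasks, the map output is written straight to the output directory
job.setNumReduceTasks(0);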
Joining two tables (student records and score records):
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;

public class Demo6 {

    public static class map extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Ask the context for the input split to find out which file this record came from
            InputSplit is = context.getInputSplit();
            // InputSplit is abstract and has no path accessor, so cast it to FileSplit
            FileSplit fileSplit = (FileSplit) is;
            // Get the file path of the split (the full path, not just the file name)
            String s = fileSplit.getPath().toString();
            if (s.contains("students")) {
                // Tag student records with "*"
                String s1 = "*" + value.toString();
                String id = value.toString().split(",")[0];
                context.write(new Text(id), new Text(s1));
            } else {
                // Tag score records with "#"
                String s1 = "#" + value.toString();
                String id = value.toString().split(",")[0];
                context.write(new Text(id), new Text(s1));
            }
        }
    }

    public static class reduce extends Reducer<Text, Text, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // key is the student id; values holds every record with that id:
            // one student record and six score records. Collect the scores into a list
            // so each of the six scores can be appended to the student record in turn.
            String st = "";
            ArrayList<String> sc = new ArrayList<String>();
            for (Text value : values) {
                String s = value.toString();
                if (s.startsWith("*")) {
                    // The tag still sits at index 0, so strip it with substring(1)
                    st = s.substring(1);
                } else {
                    sc.add(s.substring(1));
                }
            }
            // Join the two tables: one output line per score record
            for (String s : sc) {
                String s1 = s.split(",")[2];
                String end = st + "," + s1;
                context.write(new Text(end), NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("join two files");
        job.setJarByClass(Demo6.class);
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        Path in = new Path("/datajava");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output6");
        FileOutputFormat.setOutputPath(job, out);
        job.waitForCompletion(true);
        System.out.println("demo 6 finished");
    }
}
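Note that, unlike the other demos, Demo6 does not delete an existing output directory, so a second run fails if /output6 already exists. The same guard used in the other programs could be added before FileOutputFormat.setOutputPath (this also requires the FileSystem and Configuration imports):

FileSystem fs = FileSystem.get(new Configuration());
if (fs.exists(out)) {
    fs.delete(out, true);
}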
Counting by gender with a combiner
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Demo8 {

    public static class map extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Field 3 is the gender; emit (gender, 1)
            String sex = value.toString().split(",")[3];
            context.write(new Text(sex), new LongWritable(1));
        }
    }

    // Combiner: runs on the map side and pre-aggregates the counts before the shuffle
    public static class combine extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static class reduce extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
            long sum = 0L;
            for (LongWritable value : values) {
                sum += value.get();
            }
            context.write(key, new LongWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        job.setJobName("count by gender with a combiner");
        job.setJarByClass(Demo8.class);
        job.setMapperClass(map.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Register the combiner
        job.setCombinerClass(combine.class);
        job.setReducerClass(reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        Path in = new Path("/data/students.txt");
        FileInputFormat.addInputPath(job, in);
        Path out = new Path("/output8");
        FileSystem fs = FileSystem.get(new Configuration());
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        job.waitForCompletion(true);
        System.out.println("demo 8 finished");
    }
}
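Because the combine and reduce classes above perform exactly the same summation, a common simplification (not used in the original) is to drop the separate combine class and register the reducer itself as the combiner; this is safe here because summing is associative and commutative:

// Pre-aggregate the per-gender counts on the map side using the reduce class itself
job.setCombinerClass(reduce.class);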