WordCount Example in Practice
Java Code
WordCountMapper class
package com.guodaxia.mapreduce.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // 1. Define the output k-v (word -> count); the objects are reused
    //    instead of being re-created for every record
    private final Text k = new Text();
    private final IntWritable v = new IntWritable(1); // each occurrence counts as 1

    // 2. Override map() with the business logic
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1. Read one line from the input split
        String line = value.toString();
        // 2. Split the line into words
        String[] words = line.split(" ");
        // 3. Emit (word, 1) for each word; v is never modified
        for (String word : words) {
            k.set(word);
            context.write(k, v);
        }
    }
}
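For example, the input line hello world hello produces the three pairs (hello, 1), (world, 1), (hello, 1). Splitting on a single space assumes the test data is space-delimited; input separated by tabs or multiple spaces would need a different pattern such as split("\\s+").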
WordCountReducer class
package com.guodaxia.mapreduce.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private int sumTemp;
    private final IntWritable v = new IntWritable(); // overwritten with the sum for each key

    // Business logic
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // 1. Sum up the occurrences of the same word
        sumTemp = 0;
        for (IntWritable count : values) {
            sumTemp += count.get();
        }
        v.set(sumTemp);
        // 2. Emit (word, total count)
        context.write(key, v);
    }
}
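Between the map and reduce phases, the framework's shuffle groups the mapper output by key, so for the example line above reduce() is called with (hello, [1, 1]) and (world, [1]) and emits (hello, 2) and (world, 1).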
WordCountDriver class
package com.guodaxia.mapreduce.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1. Get the configuration and the Job object
        Configuration con = new Configuration();
        Job job = Job.getInstance(con);
        // 2. Set the jar that contains the Driver class
        job.setJarByClass(WordCountDriver.class);
        // 3. Set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 4. Set the Mapper output k-v types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Set the final output k-v types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. Submit the job and wait for it to finish
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
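A quick local test is also possible before touching the cluster: with no cluster configuration on the classpath, hadoop-client falls back to the local job runner, so WordCountDriver.main can be run straight from the IDE with the input and output paths as program arguments (for example D:\input D:\output on Windows; these paths are just placeholders). The same rule applies locally as on the cluster: the output directory must not already exist.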
pom.xml (Maven configuration)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>guodaxia</groupId>
    <artifactId>MapReduceDemo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.30</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- maven-assembly-plugin is a build plugin, not a dependency -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.2.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
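The hadoop-client dependency provides everything needed to compile and run the job; junit and slf4j-log4j12 support testing and logging. The maven-assembly-plugin, bound to the package phase, additionally produces MapReduceDemo-1.0-SNAPSHOT-jar-with-dependencies.jar, a fat jar bundling all dependencies. Since the cluster nodes already ship the Hadoop libraries, the plain jar is the one used in the packaging step below.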
resources --> log4j.properties
log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
Packaging with Maven
- Open the Maven panel on the right --> Lifecycle --> package, and wait for packaging to finish
- Copy MapReduceDemo-1.0-SNAPSHOT.jar from the project's target directory to the desktop and rename it wc.jar (a command-line alternative follows below)
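If you prefer the command line to the IDE, a minimal equivalent is the following, run from the project root (the desktop path is an assumption that depends on your OS):

mvn clean package
cp target/MapReduceDemo-1.0-SNAPSHOT.jar ~/Desktop/wc.jar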
Running on the Hadoop Cluster
Use Xshell to copy wc.jar to /opt/module/hadoop-3.1.3 on the Hadoop cluster;
Start the cluster;
Create an input directory with test data under the current path and upload it to HDFS: hadoop fs -put ./input /input (the leading slash matters, so the HDFS path matches the /input argument passed to the job below);
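A minimal sketch of creating the test data before the upload, assuming the hypothetical file name word.txt and sample content:

mkdir ./input
echo "hello world hello hadoop" > ./input/word.txt
hadoop fs -put ./input /input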
Copy the fully qualified class name of the Driver class (open WordCountDriver and right-click inside the code);
Run the job (the /output path must not already exist on the cluster!)
hadoop jar wc.jar <fully qualified Driver class> /input /output
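With the package used in this project, the concrete command and a quick check of the result look like this (part-r-00000 is Hadoop's default name for the first reducer's output file):

hadoop jar wc.jar com.guodaxia.mapreduce.wordcount.WordCountDriver /input /output
hadoop fs -cat /output/part-r-00000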