实验5
MapReduce初级编程实践
1.实验目的
(1)通过实验掌握基本的MapReduce编程方法;
(2)掌握用MapReduce解决一些常见的数据处理问题,包括数据去重、数据排序和数据挖掘等。
2.实验平台
(1)操作系统:Linux(建议Ubuntu16.04或Ubuntu18.04)
(2)Hadoop版本:3.1.3
3.实验步骤
(一)编程实现文件合并和去重操作
对于两个输入文件,即文件A和文件B,请编写MapReduce程序,对两个文件进行合并,并剔除其中重复的内容,得到一个新的输出文件C。下面是输入文件和输出文件的一个样例供参考。
输入文件A的样例如下:
20170101 x 20170102 y 20170103 x 20170104 y 20170105 z 20170106 x |
输入文件B的样例如下:
20170101 y 20170102 y 20170103 x 20170104 z 20170105 y |
根据输入文件A和B合并得到的输出文件C的样例如下:
20170101 x 20170101 y 20170102 y 20170103 x 20170104 y 20170104 z 20170105 y 20170105 z 20170106 x |
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <version>3.8.1</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- <version>2.8.5</version>
- </dependency>
- <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-hdfs</artifactId>
- <version>2.8.5</version>
- </dependency>
- <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-core -->
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-mapreduce-client-core</artifactId>
- <version>2.8.5</version>
- </dependency>
- <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
- <version>2.8.5</version>
- </dependency>
- <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-yarn-api -->
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-yarn-api</artifactId>
- <version>2.8.5</version>
- </dependency>
- <!--ssh2架包-->
- <dependency>
- <groupId>ch.ethz.ganymed</groupId>
- <artifactId>ganymed-ssh2</artifactId>
- <version>build210</version>
- </dependency>
- <dependency>
- <groupId>log4j</groupId>
- <artifactId>log4j</artifactId>
- <version>1.2.17</version>
- </dependency>
- <dependency>
- <groupId>org.apache.hbase</groupId>
- <artifactId>hbase-client</artifactId>
- <version>2.1.0</version>
- </dependency>
- import java.io.IOException;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.util.GenericOptionsParser;
- public class Merge {
- //重载map函数,直接将输入中的value复制到输出数据的key上
- public static class Map extends Mapper<Object, Text, Text, Text>{
- private static Text text = new Text();
- public void map(Object key, Text value, Context context) throws IOException,InterruptedException{
- text = value;
- context.write(text, new Text(""));
- }
- }
- //重载reduce函数,直接将输入中的key复制到输出数据的key上
- public static class Reduce extends Reducer<Text, Text, Text, Text>{
- public void reduce(Text key, Iterable<Text> values, Context context ) throws IOException,InterruptedException{
- context.write(key, new Text(""));
- }
- }
- public static void main(String[] args) throws Exception{
- Configuration conf = new Configuration();
- conf.set("fs.defaultFS","hdfs://node1:8020");
- String[] otherArgs = new String[]{"/input","/output"}; /* 直接设置输入参数 */
- Job job = Job.getInstance(conf,"Merge and duplicate removal");
- job.setJarByClass(Merge.class);
- job.setMapperClass(Map.class);
- job.setCombinerClass(Reduce.class);
- job.setReducerClass(Reduce.class);
- job.setOutputKeyClass(Text.class);
- job.setOutputValueClass(Text.class);
- FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
- FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
- System.exit(job.waitForCompletion(true) ? 0 : 1);
- }
- }
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 25岁的心里话
· 闲置电脑爆改个人服务器(超详细) #公网映射 #Vmware虚拟网络编辑器
· 基于 Docker 搭建 FRP 内网穿透开源项目(很简单哒)
· 零经验选手,Compose 一天开发一款小游戏!
· 一起来玩mcp_server_sqlite,让AI帮你做增删改查!!