实验1:HADOOP实验-HDFS与MAPREDUCE操作
一、实验目的
1、利用虚拟机搭建集群部署hadoop
2、HDFS文件操作以及文件接口编程;
3、MAPREDUCE并行程序开发、发布与调用。
二、实验内容
一.虚拟机集群搭建部署hadoop
利用VMware、CentOS 7、Xshell(或SecureCRT)等软件搭建集群并部署Hadoop,具体操作参照
https://www.bilibili.com/video/BV1Kf4y1z7Nw?p=1
二、HDFS文件操作
在分布式文件系统上验证HDFS文件命令,如下。
hadoop fs [genericOptions]
[-ls <path>] //显示目标路径当前目录下的所有文件
[-lsr <path>] //递归显示目标路径下的所有目录及文件(深度优先)
[-du <path>] //以字节为单位显示目录中所有文件的大小,或该文件的大小(如果path为文件)
[-dus <path>] //以字节为单位显示目标文件大小(用于查看文件夹大小)
[-count [-q] <path>] //将目录的大小、包含文件(包括文件夹)个数的信息输出到屏幕(标准stdout)
[-mv <src> <dst>] //把文件或目录移动到目标路径,这个命令允许同时移动多个文件,但是只允许移动到一个目标路径中,参数中的最后一个文件夹即为目标路径
[-cp <src> <dst>] //复制文件或目录到目标路径,这个命令允许同时复制多个文件,如果复制多个文件,目标路径必须是文件夹
[-rm [-skipTrash] <path>] //删除文件,这个命令不能删除文件夹
[-rmr [-skipTrash] <path>] //删除文件夹及其下的所有文件
[-expunge]
[-put <localsrc> ... <dst>] //从本地文件系统上传文件到HDFS中
[-copyFromLocal <localsrc> ... <dst>] //与put相同
[-moveFromLocal <localsrc> ... <dst>] //与put相同,但是文件上传之后会从本地文件系统中移除
[-get [-ignoreCrc] [-crc] <src> <localdst>] //复制文件到本地文件系统。这个命令可以选择是否忽视校验和,忽视校验和的下载主要用于挽救那些已经发生错误的文件
[-getmerge <src> <localdst> [addnl]] //将源目录中的所有文件进行排序并写入目标文件中,文件之间以换行符分隔
[-cat <src>] //在终端显示(标准输出stdout)文件中的内容,类似Linux系统中的cat
[-text <src>] //将源文件输出为文本格式显示到终端上,通过这个命令可以查看TextRecordInputStream(SequenceFile等)或zip文件
[-copyToLocal [-ignoreCrc] [-crc] <src> <localdst>] //与get相同
[-moveToLocal [-crc] <src> <localdst>]
[-mkdir <path>] //创建文件夹
[-setrep [-R] [-w] <rep> <path/file>] //改变一个文件的副本个数。参数-R可以递归地对该目录下的所有文件做统一操作
[-touchz <path>] //类似Linux中的touch,创建一个空文件
[-test -[ezd] <path>] //检查路径:-e 判断是否存在,-z 判断文件长度是否为0,-d 判断是否为目录;结果通过命令的返回值给出
[-stat [format] <path>] //以指定格式返回路径的信息
[-tail [-f] <file>] //在终端上显示(标准输出stdout)文件的最后1KB内容。-f选项的行为与Linux中一致,会持续监测新添加到文件中的内容,这在查看日志文件时会显得非常方便。
[-chmod [-R] <MODE[,MODE]...| OCTALMODE> PATH...] //改变文件的权限,只有文件的所有者或者是超级用户才能使用这个命令。-R可以递归地改变文件夹内的所有文件的权限
[-chown [-R] [OWNER][:[GROUP]] PATH...] //改变文件的拥有者,-R可以递归地改变文件夹内所有文件的拥有者。同样,这个命令只有超级用户才能使用
[-chgrp [-R] GROUP PATH...] //改变文件所属的组,-R可以递归地改变文件夹内所有文件所属的组。这个命令必须是超级用户才能使用
[-help [cmd]] //这是命令的帮助信息
2.1 HDFS接口编程
调用HDFS文件接口实现对分布式文件系统中文件的访问,如创建、修改、删除等。
参考代码:
package mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.junit.After;
import org.junit.Before;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
/**
 * JUnit 5 (Jupiter) walkthrough of the basic HDFS client operations: mkdir, upload,
 * download, delete, rename and file listing.
 *
 * <p>Every test runs against a fresh {@link FileSystem} handle that is opened in
 * {@link #init()} and released in {@link #close()}. The lifecycle methods use the Jupiter
 * annotations on purpose: the tests are annotated with Jupiter's {@code @Test}, and the
 * Jupiter engine does not honour JUnit 4's {@code org.junit.After}, so a JUnit 4 tear-down
 * would never run and the connection would leak.
 */
public class HdfsClient {
    /** URI of the cluster the tests connect to. */
    private static final String CLUSTER_URI = "hdfs://node01:8020";

    /** User name the client passes to the cluster. */
    private static final String CLUSTER_USER = "hadoop";

    private FileSystem fs;

    /**
     * Opens the connection shared by the test that is about to run.
     *
     * @throws URISyntaxException   if {@link #CLUSTER_URI} is malformed
     * @throws IOException          if the file system cannot be obtained
     * @throws InterruptedException if the call is interrupted while obtaining it
     */
    @BeforeEach
    public void init() throws URISyntaxException, IOException, InterruptedException {
        fs = FileSystem.get(new URI(CLUSTER_URI), new Configuration(), CLUSTER_USER);
    }

    /**
     * Releases the connection after each test.
     *
     * @throws IOException if closing the file system fails
     */
    @AfterEach
    public void close() throws IOException {
        // init() may have failed before assigning the field; tear-down still runs then.
        if (fs != null) {
            fs.close();
            fs = null;
        }
    }

    /** Creates the directory {@code /std/wmd}, including missing parents. */
    @Test
    public void testMkdir() throws URISyntaxException, IOException, InterruptedException {
        fs.mkdirs(new Path("/std/wmd"));
        System.out.println("创建成功");
    }

    /** Uploads a local file to HDFS. */
    @Test
    public void testPut() throws IOException, URISyntaxException, InterruptedException {
        // Arguments: delSrc (delete the local source afterwards), overwrite
        // (replace an existing destination), source path, destination path.
        fs.copyFromLocalFile(false, false, new Path("E:\\input.txt"), new Path("/wmd/input.txt"));
        System.out.println("上传成功");
    }

    /** Downloads a file from HDFS to the local disk. */
    @Test
    public void testGet() throws IOException, URISyntaxException, InterruptedException {
        // Arguments: delSrc, source path, local destination, useRawLocalFileSystem
        // (true = write through the raw local file system, so no .crc side files).
        fs.copyToLocalFile(false, new Path("hdfs://node01/wmd/input.txt"), new Path("D:\\"), true);
        System.out.println("下载成功");
    }

    /** Deletes {@code /std} and everything below it. */
    @Test
    public void testRm() throws IOException, URISyntaxException, InterruptedException {
        // Second argument is "recursive": it must be true to remove a non-empty
        // directory; false is sufficient for a single file.
        fs.delete(new Path("hdfs://node01/std"), true);
        System.out.println("删除成功");
    }

    /** Moves {@code /wmd/wmdym.txt} to the root directory under the name {@code wmd.txt}. */
    @Test
    public void testMv() throws IOException, URISyntaxException, InterruptedException {
        // rename() covers both moving and renaming, for files and directories alike.
        // Its boolean result is not checked here.
        fs.rename(new Path("/wmd/wmdym.txt"), new Path("/wmd.txt"));
        System.out.println("移动成功");
    }

    /** Prints the metadata and block locations of every file under {@code /}. */
    @Test
    public void fileDetail() throws IOException, URISyntaxException, InterruptedException {
        // true = descend into sub-directories.
        RemoteIterator<LocatedFileStatus> files = fs.listFiles(new Path("/"), true);
        while (files.hasNext()) {
            LocatedFileStatus status = files.next();
            System.out.println("=============" + status.getPath() + "==============");
            System.out.println(status.getLen());
            System.out.println(status.getPermission());
            System.out.println(status.getOwner());
            System.out.println(status.getGroup());
            System.out.println(status.getModificationTime());
            System.out.println(status.getReplication());
            System.out.println(status.getBlockSize());
            System.out.println(status.getPath().getName());
            BlockLocation[] blockLocations = status.getBlockLocations();
            System.out.println(Arrays.toString(blockLocations));
        }
    }
}
三、MAPREDUCE并行程序开发
3.1 求每年最高气温
原始数据如下:
2014010114
2014010216
2014010317
2014010410
2014010506
2012010609
2012010732
2012010812
2012010919
2012011023
2001010116
2001010212
2001010310
2001010411
2001010529
2013010619
2013010722
2013010812
2013010929
2013011023
2008010105
2008010216
2008010337
2008010414
2008010516
2007010619
2007010712
2007010812
2007010999
2007011023
2010010114
2010010216
2010010317
2010010410
2010010506
2015010649
2015010722
2015010812
2015010999
2015011023
参考代码
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Temperature {
/**
* 四个泛型类型分别代表:
* KeyIn Mapper的输入数据的Key,这里是每行文字的起始位置(0,11,...)
* ValueIn Mapper的输入数据的Value,这里是每行文字
* KeyOut Mapper的输出数据的Key,这里是每行文字中的“年份”
* ValueOut Mapper的输出数据的Value,这里是每行文字中的“气温”
*/
static class TempMapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
@Override
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// 打印样本: Before Mapper: 0, 2000010115
System.out.print("Before Mapper: " + key + ", " + value);
String line = value.toString();
String year = line.substring(0, 4);
int temperature = Integer.parseInt(line.substring(8));
context.write(new Text(year), new IntWritable(temperature));
// 打印样本: After Mapper:2000, 15
System.out.println(
"======" +
"After Mapper:" + new Text(year) + ", " + new IntWritable(temperature));
}
}
/**
* 四个泛型类型分别代表:
* KeyIn Reducer的输入数据的Key,这里是每行文字中的“年份”
* ValueIn Reducer的输入数据的Value,这里是每行文字中的“气温”
* KeyOut Reducer的输出数据的Key,这里是不重复的“年份”
* ValueOut Reducer的输出数据的Value,这里是这一年中的“最高气温”
static class TempReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
@Override
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int maxValue = Integer.MIN_VALUE;
StringBuffer sb = new StringBuffer();
//取values的最大值
for (IntWritable value : values) {
maxValue = Math.max(maxValue, value.get());
sb.append(value).append(", ");
}
// 打印样本: Before Reduce: 2000, 15, 23, 99, 12, 22,
System.out.print("Before Reduce: " + key + ", " + sb.toString());
context.write(key, new IntWritable(maxValue));
// 打印样本: After Reduce: 2000, 99
System.out.println(
"======" +
"After Reduce: " + key + ", " + maxValue);
}
}
public static void main(String[] args) throws Exception {
//输入路径
String dst = "hdfs://localhost:9000/intput.txt";
//输出路径,必须是不存在的,空文件加也不行。
String dstOut = "hdfs://localhost:9000/output";
Configuration hadoopConfig = new Configuration();
hadoopConfig.set("fs.hdfs.impl",
org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()
);
hadoopConfig.set("fs.file.impl",
org.apache.hadoop.fs.LocalFileSystem.class.getName()
);
Job job = new Job(hadoopConfig);
//如果需要打成jar运行,需要下面这句
//job.setJarByClass(NewMaxTemperature.class);
//job执行作业时输入和输出文件的路径
FileInputFormat.addInputPath(job, new Path(dst));
FileOutputFormat.setOutputPath(job, new Path(dstOut));
//指定自定义的Mapper和Reducer作为两个阶段的任务处理类
job.setMapperClass(TempMapper.class);
job.setReducerClass(TempReducer.class);
//设置最后输出结果的Key和Value的类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//执行job,直到完成
job.waitForCompletion(true);
System.out.println("Finished");
}
}
将程序发布为jar包,并上传到hadoop平台运行。
package mapreduce;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Cluster version of the max-temperature job: finds the highest temperature recorded for
 * each year.
 *
 * <p>Each input line is expected to look like {@code yyyyMMddTT}: the first four characters
 * are the year and everything from index 8 onwards is the temperature.
 */
public class Temperature {
    /** Index of the first temperature character within an input line. */
    private static final int TEMPERATURE_OFFSET = 8;

    /** Emits one (year, temperature) pair per input line. */
    static class TempMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString().trim();
            if (line.length() <= TEMPERATURE_OFFSET) {
                // Blank or truncated line (e.g. a trailing newline at the end of the
                // file): there is no temperature field to read, so skip it.
                return;
            }
            System.out.print("Before Mapper: " + key + ", " + value);
            Text year = new Text(line.substring(0, 4));
            IntWritable temperature =
                    new IntWritable(Integer.parseInt(line.substring(TEMPERATURE_OFFSET)));
            context.write(year, temperature);
            System.out.println("======" + "After Mapper:" + year + ", " + temperature);
        }
    }

    /** Reduces all temperatures of one year to their maximum. */
    static class TempReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int maxValue = Integer.MIN_VALUE;
            // Collected only for the debug line below.
            StringBuilder seen = new StringBuilder();
            for (IntWritable value : values) {
                maxValue = Math.max(maxValue, value.get());
                seen.append(value).append(", ");
            }
            System.out.print("Before Reduce: " + key + ", " + seen.toString());
            context.write(key, new IntWritable(maxValue));
            System.out.println("======" + "After Reduce: " + key + ", " + maxValue);
        }
    }

    /**
     * Configures and runs the job, exiting with status 1 if the job fails.
     *
     * @param args unused; input and output locations are hard-coded below
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        String dst = "hdfs://node01:8020/wmd/input.txt";
        // The output directory must not exist when the job starts.
        String dstOut = "hdfs://node01:8020/wmd/output";

        Configuration hadoopConfig = new Configuration();
        hadoopConfig.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        hadoopConfig.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

        // Job.getInstance replaces the deprecated Job constructor.
        Job job = Job.getInstance(hadoopConfig);
        // Tells Hadoop which jar to ship to the task nodes; without it the mapper and
        // reducer classes cannot be loaded when the job runs on the cluster.
        job.setJarByClass(Temperature.class);

        FileInputFormat.addInputPath(job, new Path(dst));
        FileOutputFormat.setOutputPath(job, new Path(dstOut));
        job.setMapperClass(TempMapper.class);
        job.setReducerClass(TempReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        boolean succeeded = job.waitForCompletion(true);
        System.out.println("Finished");
        if (!succeeded) {
            // Report a failed job to the calling shell instead of exiting with 0.
            System.exit(1);
        }
    }
}
3.2 词频统计
maven建立quick-start工程。
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <!-- Project coordinates -->
  <groupId>cn.edu.bupt.wcy</groupId>
  <artifactId>wordcount</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>wordcount</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Single place to set the version shared by all Hadoop artifacts below. -->
    <hadoop.version>2.7.1</hadoop.version>
  </properties>

  <dependencies>
    <!-- Test-only dependency -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>

    <!-- Hadoop libraries, all on the same version -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
  </dependencies>
</project>
3个java代码,mapper、reducer、runner主类:
mapper:
package cn.edu.bupt.wcy.wordcount;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Word-count mapper: splits every input line on spaces and emits {@code (word, 1)} for
 * each token.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    /** Constant count written for every token. */
    private static final LongWritable ONE = new LongWritable(1);

    /** Reused output key, so that no new object is allocated per token. */
    private final Text word = new Text();

    /**
     * Emits one {@code (token, 1)} pair for every space-separated token of the line.
     *
     * @param key     the input key supplied by the input format (not used)
     * @param value   the text of the line
     * @param context sink for the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context)
            throws IOException, InterruptedException {
        // StringUtils.split treats adjacent separators as one, so repeated spaces do
        // not produce empty tokens.
        String[] tokens = StringUtils.split(value.toString(), " ");
        for (String token : tokens) {
            word.set(token);
            context.write(word, ONE);
        }
    }
}
reducer:
package cn.edu.bupt.wcy.wordcount;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Word-count reducer: adds up all partial counts of one word and emits the total.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    /**
     * Sums the counts received for a word.
     *
     * @param arg0    the word
     * @param arg1    the partial counts emitted for that word
     * @param context sink for the {@code (word, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text arg0, Iterable<LongWritable> arg1,
            Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        // Accumulate in a long: the values are longs, and "int += long" would silently
        // narrow the result and overflow for counts above Integer.MAX_VALUE.
        long sum = 0L;
        for (LongWritable num : arg1) {
            sum += num.get();
        }
        context.write(arg0, new LongWritable(sum));
    }
}
runner:
package cn.edu.bupt.wcy.wordcount;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/**
 * Driver of the word-count job; the input and output paths come from the command line.
 */
public class WordCountRunner {
    /**
     * Configures and runs the job. Exits with status 2 when the paths are missing and
     * with status 1 when the job fails.
     *
     * <p>Two argument layouts are accepted, because {@code hadoop jar} passes different
     * arguments depending on whether the jar's manifest names a main class:
     * <ul>
     *   <li>{@code <input> <output>} - two arguments</li>
     *   <li>{@code <mainClass> <input> <output>} - three or more arguments; the leading
     *       token is skipped, as the original {@code args[1]}/{@code args[2]} did</li>
     * </ul>
     *
     * @param args command-line arguments as described above
     * @throws IOException            if the job cannot be configured or submitted
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            System.err.println("Usage: WordCountRunner <input path> <output path>");
            System.exit(2);
        }
        // Skip a leading main-class token when one is present.
        int first = args.length >= 3 ? 1 : 0;
        String inputPath = args[first];
        String outputPath = args[first + 1];

        Configuration conf = new Configuration();
        // Job.getInstance replaces the deprecated Job constructor.
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountRunner.class);
        job.setJobName("wordcount");

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        if (!job.waitForCompletion(true)) {
            // Report a failed job to the calling shell instead of exiting with 0.
            System.exit(1);
        }
    }
}
打包成jar包后,放到集群上运行。先在集群上新建一个文件夹:
hdfs dfs -mkdir /input_wordcount 再放入单词文件,比如:
hello world
I like playing basketball
hello java。。。
运行 hadoop jar WordCount.jar(jar包) cn.edu.bupt.wcy.wordcount.WordCountRunner(主类,带包名的类需写全限定类名) /input_wordcount /output_wordcount
运行完成后,查看:
hdfs dfs -ls /output_wordcount。已经生成了结果,再cat一下查看内容即可。
源代码:
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Word-count mapper: splits every input line on spaces and emits {@code (word, 1)} for
 * each token.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    /** Constant count written for every token. */
    private static final LongWritable ONE = new LongWritable(1);

    /** Reused output key, so that no new object is allocated per token. */
    private final Text word = new Text();

    /**
     * Emits one {@code (token, 1)} pair for every space-separated token of the line.
     *
     * @param key     the input key supplied by the input format (not used)
     * @param value   the text of the line
     * @param context sink for the emitted pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context)
            throws IOException, InterruptedException {
        // StringUtils.split treats adjacent separators as one, so repeated spaces do
        // not produce empty tokens.
        String[] tokens = StringUtils.split(value.toString(), " ");
        for (String token : tokens) {
            word.set(token);
            context.write(word, ONE);
        }
    }
}
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Word-count reducer: adds up all partial counts of one word and emits the total.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    /**
     * Sums the counts received for a word.
     *
     * @param arg0    the word
     * @param arg1    the partial counts emitted for that word
     * @param context sink for the {@code (word, total)} pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text arg0, Iterable<LongWritable> arg1,
            Reducer<Text, LongWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
        // Accumulate in a long: the values are longs, and "int += long" would silently
        // narrow the result and overflow for counts above Integer.MAX_VALUE.
        long sum = 0L;
        for (LongWritable num : arg1) {
            sum += num.get();
        }
        context.write(arg0, new LongWritable(sum));
    }
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.log4j.BasicConfigurator;
/**
 * Driver of the word-count job with fixed HDFS input and output locations.
 */
public class WordCountRunner {
    /**
     * Configures and runs the job, exiting with status 1 if the job fails.
     *
     * @param args unused; input and output locations are hard-coded below
     * @throws IOException            if the job cannot be configured or submitted
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
        // Install log4j's basic console configuration before any Hadoop class logs.
        BasicConfigurator.configure();

        Configuration conf = new Configuration();
        // Job.getInstance replaces the deprecated Job constructor.
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountRunner.class);
        job.setJobName("wordcount");

        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // Input path.
        FileInputFormat.addInputPath(job, new Path("hdfs://node01:8020/user/zzy/input_wordcount.txt"));
        // Output path.
        FileOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/user/zzy/output_wordcount"));

        if (!job.waitForCompletion(true)) {
            // Report a failed job to the calling shell instead of exiting with 0.
            System.exit(1);
        }
    }
}
运行结果: