Hadoop Day 4
1. Homework
Count the traffic consumed by each phone number: upload traffic, download traffic, and total traffic.
1.1. Encapsulating the serializable bean
package com.hotdas.mr.bean;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class FlowBean implements Writable {

    // Fields
    private long upflow;
    private long downflow;
    private long sumflow;

    // No-arg constructor, required by the framework for deserialization
    public FlowBean() { }

    public FlowBean(long upflow, long downflow) {
        this.upflow = upflow;
        this.downflow = downflow;
        this.sumflow = upflow + downflow;
    }

    // Serialization
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upflow);
        out.writeLong(downflow);
        out.writeLong(sumflow);
    }

    // Deserialization -- the field order must match write()
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upflow = in.readLong();
        this.downflow = in.readLong();
        this.sumflow = in.readLong();
    }

    public long getUpflow() { return upflow; }

    public void setUpflow(long upflow) { this.upflow = upflow; }

    public long getDownflow() { return downflow; }

    public void setDownflow(long downflow) { this.downflow = downflow; }

    public long getSumflow() { return sumflow; }

    public void setSumflow(long sumflow) { this.sumflow = sumflow; }

    @Override
    public String toString() {
        return upflow + "\t" + downflow + "\t" + sumflow;
    }
}
1.2. Mapper implementation
package com.hotdas.mr;

import com.hotdas.mr.bean.FlowBean;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

    FlowBean flowBean = null;

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Read one line of input
        String line = value.toString();
        // Split on tab
        String[] split = line.split("\t");
        // Phone number
        String phoneNum = split[1];
        // Upload and download traffic
        long upflow = Long.parseLong(split[split.length - 3]);
        long downflow = Long.parseLong(split[split.length - 2]);
        flowBean = new FlowBean(upflow, downflow);
        // Emit to the reducer
        context.write(new Text(phoneNum), flowBean);
    }
}
1.3. Reducer implementation
package com.hotdas.mr;

import com.hotdas.mr.bean.FlowBean;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class FlowCountReduce extends Reducer<Text, FlowBean, Text, FlowBean> {

    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
        long sum_upflow = 0;
        long sum_downflow = 0;
        // Accumulate the upload and download traffic for this phone number
        for (FlowBean flowBean : values) {
            sum_upflow += flowBean.getUpflow();
            sum_downflow += flowBean.getDownflow();
        }
        // The total traffic is computed inside the FlowBean constructor
        FlowBean flowBean = new FlowBean(sum_upflow, sum_downflow);
        // Emit
        context.write(key, flowBean);
    }
}
1.4. Driver implementation
package com.hotdas.mr;

import com.hotdas.mr.bean.FlowBean;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowCountDriver {

    public static void main(String[] args) throws Exception {
        // Cluster configuration
        Configuration configuration = new Configuration();
        // Build the job
        Job job = Job.getInstance(configuration);
        // Set the jar by class
        job.setJarByClass(FlowCountDriver.class);
        // Set the mapper and reducer classes
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReduce.class);
        // Map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        // Reduce output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // Input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
1.5. Debugging
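A minimal local-debugging sketch, assuming the placeholder paths below: with no cluster configuration on the classpath, the job falls back to the local job runner, so the driver can be started and stepped through directly in the IDE.

package com.hotdas.mr;

// Hypothetical helper for local debugging; the input/output paths are placeholders.
public class FlowCountLocalDebug {
    public static void main(String[] args) throws Exception {
        // Runs FlowCountDriver against the local filesystem via the local job runner.
        FlowCountDriver.main(new String[]{"d:/flow/input", "d:/flow/output"});
    }
}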
2. Summary: how a custom bean implements serialization
1) It must implement Writable.
2) Override the serialization method write().
3) Override the deserialization method readFields().
4) The fields must be serialized and deserialized in exactly the same order.
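A minimal skeleton of this pattern (the class and field names are placeholders):

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class MyBean implements Writable {
    private long a;
    private long b;

    public MyBean() { }  // the framework needs a no-arg constructor for deserialization

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(a);  // written in the order a, then b
        out.writeLong(b);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        a = in.readLong(); // read back in exactly the same order: a, then b
        b = in.readLong();
    }
}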
3. MapReduce framework internals
3.1. Job submission flow
3.1.1. Source-code walkthrough
job.waitForCompletion(true);
  // submit
  submit();
    // establish the connection
    connect();
      // create the proxy used to submit the job
      new Cluster(getConfiguration())
        // initialize the cluster -- local or remote
        initialize(jobTrackAddr, conf);
    // submit the job through the submitter
    submitter.submitJobInternal(Job.this, cluster);
      // create the staging path for submission to the cluster
      Path jobStagingArea = JobSubmissionFiles.getStagingDir(cluster, conf);
      // get a unique job ID
      JobID jobId = submitClient.getNewJobID();
      // build the submission directory for this job
      Path submitJobDir = new Path(jobStagingArea, jobId.toString());
      // copy the jar to the cluster
      copyAndConfigureFiles(job, submitJobDir);
        // upload the job files through the uploader
        rUploader.uploadFiles(job, jobSubmitDir);
      // count the number of map tasks
      int maps = writeSplits(job, submitJobDir);
        // compute the splits and write the split metadata files
        maps = writeNewSplits(job, jobSubmitDir);
          // get the splits
          input.getSplits(job);
      // write the job's real content (the xml description) into the staging path
      writeConf(conf, submitJobFile);
        conf.writeXml(out);
      // submit the job and return its status
      submitClient.submitJob(jobId, submitJobDir.toString(), job.getCredentials());
3.1.2. Flow diagram
3.2. MapReduce workflow
Summary:
The flow in the diagram above is the complete MapReduce workflow; the shuffle covers steps 7 through 16 and is explained in detail later.
1) The MapTask collects the k,v pairs produced by the map() method and puts them into an in-memory buffer.
2) The buffer repeatedly spills to disk, which may produce many spill files.
3) The spill files are eventually merged into one large file.
4) While spilling, the data is merged and the partition method is called to partition it (see the sketch after this list).
5) Each ReduceTask fetches, from every MapTask, the partition data that matches its own partition number.
6) The ReduceTask merges, groups, and sorts the fetched data into one large file; at that point the shuffle is finished.
7) The ReduceTask writes the result to its destination through OutputFormat.
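To make step 4 concrete: by default the partition number comes from HashPartitioner. A sketch of the equivalent logic for the flow-count job (the class name is illustrative, not part of the homework code):

import com.hotdas.mr.bean.FlowBean;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Equivalent to the default HashPartitioner: records with the same key always land in the
// same partition, and each ReduceTask pulls exactly one partition number from every MapTask.
public class FlowHashPartitioner extends Partitioner<Text, FlowBean> {
    @Override
    public int getPartition(Text key, FlowBean value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}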
Question: if a large number of small files has to go through a MapReduce job, does that hurt efficiency?
Yes. The number of small files determines the number of MapTasks, and launching that many MapTasks costs a great deal of time; more partitions also mean more ReduceTasks. The result is a large waste of cluster resources and lower overall efficiency.
Solution: combine the small files while reading the data (see CombineFileInputFormat below), which reduces the number of MapTasks and ReduceTasks and therefore improves efficiency.
The shuffle buffer defaults to 100 MB and can be changed with the parameter mapreduce.task.io.sort.mb.
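For example, the buffer could be enlarged in the driver before the job is created (200 MB is only an illustrative value):

Configuration conf = new Configuration();
// Raise the map-side sort buffer from the default 100 MB to 200 MB
conf.setInt("mapreduce.task.io.sort.mb", 200);
Job job = Job.getInstance(conf);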
3.3. InputFormat
InputFormat has several subclass implementations:
3.3.1. FileInputFormat
FileInputFormat's default splitting mechanism:
(1) Files are split simply by their length.
(2) The split size defaults to the HDFS block size.
(3) Splitting does not look at the data set as a whole; each file is split on its own.
FileInputFormat source analysis:
The split size is computed as:
Math.max(minSize, Math.min(maxSize, blockSize))
With the defaults this works out to the block size: a file that fills a block gets 128 MB splits, and a file smaller than one block forms a single split of its own length.
The number of splits is determined by these values:
mapreduce.input.fileinputformat.split.minsize = 0 (default)
mapreduce.input.fileinputformat.split.maxsize = Long.MAX_VALUE (default)
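Both values can also be overridden per job in the driver; a sketch (the 64 MB value is only an illustration):

// With maxsize below the block size, the formula
// Math.max(minSize, Math.min(maxSize, blockSize)) returns maxSize,
// so each 128 MB block is cut into several smaller splits.
FileInputFormat.setMinInputSplitSize(job, 1);
FileInputFormat.setMaxInputSplitSize(job, 64 * 1024 * 1024); // 64 MB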
3.3.2. CombineFileInputFormat
Purpose: an optimization that solves the problem of very many small files.
Set the input format in the driver. CombineFileInputFormat itself is abstract, so its text-oriented subclass CombineTextInputFormat is the class actually set on the job:
job.setInputFormatClass(CombineTextInputFormat.class);
CombineTextInputFormat.setMaxInputSplitSize(job, 134217728); // 128 MB
CombineTextInputFormat.setMinInputSplitSize(job, 0);         // 0 MB
3.3.3. KeyValueTextInputFormat
Requirement: for each distinct first word, count how many lines of the input file start with that word.
1) Input data:
banzhang ni hao
xihuan hadoop banzhang dc
banzhang ni hao
xihuan hadoop banzhang dc
2) Expected result:
banzhang 2
xihuan 2
Implementation:
KVCountMapper:
package com.hotdas.mr;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class KVCountMapper extends Mapper<Text, Text, Text, LongWritable> {

    final Text k = new Text();
    final LongWritable v = new LongWritable();

    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        // e.g. "banzhang ni hao" -- take the first word of the key
        String[] split = key.toString().split(" ");
        k.set(split[0]);
        v.set(1);
        // Emit <first word, 1>
        context.write(k, v);
    }
}
KVCountReduce:
package com.hotdas.mr;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class KVCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

    LongWritable v = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0;
        // Sum the counts for this first word
        for (LongWritable longWritable : values) {
            count += longWritable.get();
        }
        v.set(count);
        context.write(key, v);
    }
}
KVCountDriver:
package com.hotdas.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueLineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class KVCountDriver {

    public static void main(String[] args) throws Exception {
        // Cluster configuration
        Configuration configuration = new Configuration();
        // Set the key/value separator before the Job is created, because Job.getInstance
        // copies the Configuration. Since "***" never occurs in the data, the whole line
        // becomes the key and the value is empty; the mapper then takes the key's first word.
        configuration.set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, "***");
        // Build the job
        Job job = Job.getInstance(configuration);
        // Set the jar by class
        job.setJarByClass(KVCountDriver.class);
        // Set the mapper and reducer classes
        job.setMapperClass(KVCountMapper.class);
        job.setReducerClass(KVCountReduce.class);
        // Map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Reduce output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Use KeyValueTextInputFormat as the InputFormat
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        // Input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
3.3.4. NLineInputFormat
Requirement: the number of splits is determined by a fixed number of lines per input file; for example, every three lines form one split.
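A minimal driver fragment for this requirement, assuming the same job setup pattern as the drivers above:

import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

// In the driver, before submitting the job:
NLineInputFormat.setNumLinesPerSplit(job, 3);    // every 3 lines of each input file form one split
job.setInputFormatClass(NLineInputFormat.class);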