HDFS small file analysis

Export the NameNode metadata file (fsimage) and convert it to CSV format, with fields separated by commas.

hdfs dfsadmin -fetchImage ./  # download the latest fsimage from the NameNode to the local directory

export HADOOP_OPTS="-Xmx30720m"  # my fsimage is about 30 GB, so give the parser a 30 GB heap; the heap size is passed via HADOOP_OPTS, not as an oiv flag
hdfs oiv -i fsimage_0000000000243832876 -o fsimage.csv -p Delimited -delimiter ","  # parse the image locally with the hdfs offline image viewer
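Before creating anything in Hive, it may help to sanity-check the parsed output first (a minimal sketch; the column order shown matches the table definition below, and whether a header row is emitted depends on the Hadoop version):

wc -l fsimage.csv    # roughly one row per inode
head -n 3 fsimage.csv    # expect: path,replication,modificationtime,accesstime,preferredblocksize,blockscount,filesize,nsquota,dsquota,permission,username,groupname

If the first line turns out to be a header row, either strip it or add 'skip.header.line.count'='1' to the table properties below.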

# Create the Hive table

CREATE TABLE temp_dev_db.fsimage_info_csv(
  path string,
  replication int,
  modificationtime string,
  accesstime string,
  preferredblocksize bigint,
  blockscount int,
  filesize bigint,
  nsquota string,
  dsquota string,
  permission string,
  username string,
  groupname string)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'field.delim'=',',
  'serialization.format'=',')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

# Upload the parsed file into the directory backing the Hive table

hdfs dfs -put fsimage.csv hdfs://ns/xxxxxx
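Alternatively, if the CSV was uploaded somewhere other than the table's own directory, a LOAD DATA statement moves it into the table (a sketch; the path below reuses the same placeholder as above):

LOAD DATA INPATH 'hdfs://ns/xxxxxx/fsimage.csv' INTO TABLE temp_dev_db.fsimage_info_csv;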

# Count the files smaller than 10 MB, grouped by directory path

select concat('/',split(path,'/')[1],'/',split(path,'/')[2],'/',split(path,'/')[3],'/',split(path,'/')[4],'/',split(path,'/')[5]) as dir_path,
       count(1) as small_file_num
from temp_dev_db.fsimage_info_csv
where path like '/apps/dcp/hive%' and dsquota != -1 and filesize < 1024*1024*10
group by concat('/',split(path,'/')[1],'/',split(path,'/')[2],'/',split(path,'/')[3],'/',split(path,'/')[4],'/',split(path,'/')[5])
order by count(1) desc
limit 100;
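A variant of the same query that also sums the bytes held in small files per directory can help decide which paths to tackle first (a sketch on the same table and filters):

select concat('/',split(path,'/')[1],'/',split(path,'/')[2],'/',split(path,'/')[3],'/',split(path,'/')[4],'/',split(path,'/')[5]) as dir_path,
       count(1) as small_file_num,
       sum(filesize) as small_file_bytes
from temp_dev_db.fsimage_info_csv
where path like '/apps/dcp/hive%' and dsquota != -1 and filesize < 1024*1024*10
group by concat('/',split(path,'/')[1],'/',split(path,'/')[2],'/',split(path,'/')[3],'/',split(path,'/')[4],'/',split(path,'/')[5])
order by small_file_num desc
limit 100;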

# Export the result to the local machine and start cleaning up the small files
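One way to pull the result down for follow-up is to run the query from the shell and redirect the output (a minimal sketch; small_file_dirs.csv is a hypothetical file name and the query is the grouping query above):

hive -e "<the grouping query above>" > small_file_dirs.csv    # redirect the query output to a local file

Typical remediation afterwards is to compact the offending tables or partitions, for example by rewriting them with Hive's small-file merge settings (hive.merge.mapfiles / hive.merge.smallfiles.avgsize) enabled; the right approach depends on how each directory is produced.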

 
