HDFS small file analysis
Dump the NameNode metadata file (fsimage) and convert it to CSV with comma-separated fields
hdfs dfsadmin -fetchImage ./ # fetch the latest fsimage from the NameNode to the local directory
export HADOOP_CLIENT_OPTS="-Xmx30720m" # the JVM heap cannot be passed as an oiv flag; my image was 30 GB, so I gave the parser a 30 GB heap
hdfs oiv -i fsimage_0000000000243832876 -o fsimage.csv -p Delimited -delimiter "," # parse the image locally with the offline image viewer
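# For reference, the Delimited output begins with a header line followed by one row per inode;
# the sample row below is illustrative only, and the exact columns can vary by Hadoop version:
# Path,Replication,ModificationTime,AccessTime,PreferredBlockSize,BlocksCount,FileSize,NSQUOTA,DSQUOTA,Permission,UserName,GroupName
# /user/demo/part-00000,3,2021-06-01 10:00,2021-06-01 10:00,134217728,1,524288,-1,-1,-rw-r--r--,hadoop,supergroup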
# Create a Hive table over the parsed CSV
CREATE TABLE temp_dev_db.fsimage_info_csv (
  path string,
  replication int,
  modificationtime string,
  accesstime string,
  preferredblocksize bigint,
  blockscount int,
  filesize bigint,
  nsquota string,
  dsquota string,
  permission string,
  username string,
  groupname string)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'field.delim'=',',
  'serialization.format'=',')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
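# Note: the Delimited output's header line will otherwise be loaded as a data row.
# Assuming Hive 0.13+, one way to skip it is a table property:
ALTER TABLE temp_dev_db.fsimage_info_csv SET TBLPROPERTIES ('skip.header.line.count'='1');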
# Put the parsed CSV into the Hive table's directory
hdfs dfs -put fsimage.csv hdfs://ns/xxxxxx
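# Quick sanity check that the load worked (the rows returned depend on your image):
SELECT path, filesize, blockscount FROM temp_dev_db.fsimage_info_csv LIMIT 5;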
# Count files smaller than 10 MB, grouped by directory path (top 5 levels)
SELECT concat('/', split(path,'/')[1], '/', split(path,'/')[2], '/', split(path,'/')[3], '/', split(path,'/')[4], '/', split(path,'/')[5]) AS dir_path,
       count(1) AS small_file_num
FROM temp_dev_db.fsimage_info_csv
WHERE filesize < 10 * 1024 * 1024   -- smaller than 10 MB
  AND permission NOT LIKE 'd%'      -- exclude directories (their Permission field starts with 'd')
GROUP BY concat('/', split(path,'/')[1], '/', split(path,'/')[2], '/', split(path,'/')[3], '/', split(path,'/')[4], '/', split(path,'/')[5])
ORDER BY small_file_num DESC;
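# Optionally, to help prioritize, report total files and bytes per directory alongside the
# small-file count; a sketch using the same 5-level grouping as above:
SELECT concat('/', split(path,'/')[1], '/', split(path,'/')[2], '/', split(path,'/')[3], '/', split(path,'/')[4], '/', split(path,'/')[5]) AS dir_path,
       count(1) AS total_files,
       sum(if(filesize < 10 * 1024 * 1024, 1, 0)) AS small_file_num,
       sum(filesize) AS total_bytes
FROM temp_dev_db.fsimage_info_csv
WHERE permission NOT LIKE 'd%'
GROUP BY concat('/', split(path,'/')[1], '/', split(path,'/')[2], '/', split(path,'/')[3], '/', split(path,'/')[4], '/', split(path,'/')[5])
ORDER BY small_file_num DESC
LIMIT 100;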
# Export the results locally and start remediating the small-file problem
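# One way to export the result from Hive (the local output directory below is a placeholder):
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/small_file_report'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT concat('/', split(path,'/')[1], '/', split(path,'/')[2], '/', split(path,'/')[3], '/', split(path,'/')[4], '/', split(path,'/')[5]) AS dir_path,
       count(1) AS small_file_num
FROM temp_dev_db.fsimage_info_csv
WHERE filesize < 10 * 1024 * 1024
  AND permission NOT LIKE 'd%'
GROUP BY concat('/', split(path,'/')[1], '/', split(path,'/')[2], '/', split(path,'/')[3], '/', split(path,'/')[4], '/', split(path,'/')[5]);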