内核态实现各个磁盘分区可用空间的计算
问题描述:
在项目中,要在内核层实现系统当前状态下每个可用磁盘分区的空间利用率的计算,目前已经可以遍历得到系统中的每个磁盘分区,格式如下:
sda
sda1
sda2
sda5
sdb
sdb1,
假设上述信息为前提条件X,那么如何根据X获得每个磁盘分区的空间利用率呢?
磁盘分区结构struct gendisk、struct hd_struct只有分区起始扇区,占用扇区数,磁盘读写统计等信息,并没有分区使用率信息,该信息只有当分区挂载到系统某一目录下并产生出
struct super_block超级块结构后,有super_operations->statfs函数来计算,比如对于ext2文件系统,该函数为ext2_statfs,所以在通过mount /dev/sdb1 /media/Kingston
将硬件设备挂载到系统前,无法得到分区使用率信息,由于fdisk -l命令根据hd_struct信息返回数据,所以当插入usb即可感知[注:插入usb后,会将该设备对应的struct device信息插入到block_classes块设备类链表中];但是df 命令读取的是/etc/mtab信息,而该文件描述的是当前挂载到系统内部的设备信息,所以只有当调用mount后,df、/etc/mtab才能显示
出新设备信息,但是由于引入了命名空间,导致proc系统中,每进程都有自己的mounts文件,该文件描述了进程所在的命名空间的挂载的设备信息,所以可以利用/proc/self/mounts文件的信息生成方式来实现从X->各个分区使用率计算,思路如下:
1首先将/proc/self/mounts中的路径信息存为二维数组[真实设备路径,挂载位置]
由于通过/proc/self/mounts获得的设备路径可能为真实设备的符号链接,比如设备路径为/dev/disk/by-uuid/8376060c-3840-4df2-9f2e-91a07bc5c2dd,但是其内容如下
root@ubuntu:/proc/self# ls -al /dev/disk/by-uuid/8376060c-3840-4df2-9f2e-91a07bc5c2dd
lrwxrwxrwx 1 root root 10 2012-04-11 18:10 /dev/disk/by-uuid/8376060c-3840-4df2-9f2e-91a07bc5c2dd -> ../../sda1
所以对于设备名称,需要转化为真实路径。
2 根据前提条件X,确定每个分区在系统中的挂载位置路径,知道了挂载位置路径,就能确定该分区在系统中对应的super_block结构地址,从而调用super_block->statfs获取内存使用情况
3 在内核态中编写模块实现可用空间的计算。
-----------------------------------------------
下面描述/proc/self/mounts的形成方式,在此基础上抽取出[真实设备路径,挂载位置]二元序列对
1 proc/mounts文件的操作在base.c文件中,具体情况为:
/* file_operations backing /proc/<pid>/mounts (fs/proc/base.c): reads and
 * seeks go through the generic seq_file helpers; open/release/poll are
 * the mounts-specific hooks. */
static const struct file_operations proc_mounts_operations = {
.open = mounts_open,
.read = seq_read,
.llseek = seq_lseek,
.release = mounts_release,
.poll = mounts_poll,
};
mounts_open函数主要用于填写struct proc_mounts*结构,该结构定义如下:
/* Per-open state for a /proc/<pid>/mounts file. */
struct proc_mounts {
struct seq_file m; /* must be the first element */
struct mnt_namespace *ns; /* mount namespace being listed */
struct path root; /* fs root of the target task */
int event; /* ns->event snapshot taken at open time */
};
函数目的:
首先获得并填写struct proc_mounts*结构体[mounts_open_common函数中],实现思路如下:
1 首先根据proc_inode获得进程的struct pid,然后获得当前进程struct task_struct
2 获得该进程的struct nsproxy结构,该结构包含了进程所属的所有命名空间信息
3 获得mnt命名空间指针,注:当前共有uts、ipc、mnt、pid、user共5个命名空间
4 获得进程所在根目录的struct path结构,fs_struct->root确定
5 填写该proc_mounts结构体,并且和struct seq_file建立关联,即seq_file->private=proc_mounts
函数代码:
/*
 * Shared open() helper for the /proc/<pid>/mounts family: resolve the
 * target task from the proc inode, pin its mount namespace and fs root,
 * allocate a proc_mounts and wire it up as the seq_file's private data.
 * (Verbatim from fs/proc/base.c of a 2.6.3x kernel; the error-unwind
 * tail was elided by the original author at "...".)
 */
static int mounts_open_common(struct inode *inode, struct file *file,const struct seq_operations *op)
{
struct task_struct *task = get_proc_task(inode);
struct nsproxy *nsp;
struct mnt_namespace *ns = NULL;
struct path root;
struct proc_mounts *p;
int ret = -EINVAL;
if (task)
{
rcu_read_lock();
nsp = task_nsproxy(task); /* nsproxy bundles all of the task's namespaces */
if (nsp)
{
ns = nsp->mnt_ns; /* the mount namespace we will iterate over */
if (ns)
get_mnt_ns(ns); /* take a reference so it outlives the open file */
}
rcu_read_unlock();
if (ns && get_fs_path(task, &root, 1) == 0) /* task's fs_struct->root */
ret = 0;
put_task_struct(task);
}
if (!ns)
goto err;
if (ret)
goto err_put_ns;
ret = -ENOMEM;
p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
if (!p)
goto err_put_path;
file->private_data = &p->m; /* seq_open() below reuses the embedded seq_file */
ret = seq_open(file, op);
if (ret)
goto err_free;
p->m.private = p; /* back-pointer so seq callbacks can recover proc_mounts */
p->ns = ns;
p->root = root;
p->event = ns->event; /* namespace event count at open time */
return 0;
...
}
2 /proc/mounts读写的函数
在fs/namespace.c文件中
/* seq_operations backing /proc/<pid>/mounts (fs/namespace.c): one
 * show_vfsmnt() line per vfsmount in the namespace list. */
const struct seq_operations mounts_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_vfsmnt
};
各个函数实现如下:
/* seq_file .start: take namespace_sem for reading and position the
 * iterator at *pos within the namespace's mount list. */
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_mounts *p = m->private;
down_read(&namespace_sem);
return seq_list_start(&p->ns->list, *pos);
// every vfsmount mounted in this task's namespace is linked on its struct mnt_namespace "list"
}
/* seq_file .next: advance to the next vfsmount on the namespace list. */
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct proc_mounts *p = m->private;
return seq_list_next(v, &p->ns->list, pos);
}
/* seq_file .stop: drop the lock taken in m_start(). */
static void m_stop(struct seq_file *m, void *v)
{
up_read(&namespace_sem);
}
其中seq_file操作如下
/* Walk @head and return the node at position @pos, or NULL when @pos is
 * past the end of the list. */
struct list_head *seq_list_start(struct list_head *head, loff_t pos)
{
struct list_head *lh;
list_for_each(lh, head)
if (pos-- == 0)
return lh;
return NULL;
}
/* Step from node @v to its successor, bumping *ppos; returns NULL when
 * the walk wraps back to the list head (end of list). */
struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos)
{
struct list_head *lh;
lh = ((struct list_head *)v)->next;
++*ppos;
return lh == head ? NULL : lh;
}
对每个挂载信息,最终的显示结果格式如下:
root@ubuntu:/proc# cat /proc/self/mounts
/dev/disk/by-uuid/8376060c-3840-4df2-9f2e-91a07bc5c2dd / ext3 rw,relatime,errors=remount-ro,data=ordered 0 0
显示函数如下:
/*
 * seq_file .show: emit one /proc/self/mounts line for a vfsmount, in the
 * form "<devname> <mountpoint> <fstype> <options> 0 0".
 */
static int show_vfsmnt(struct seq_file *m, void *v)
{
struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
int err = 0;
struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); // device file path -- may be a symlink path, so it must be resolved to the real path before matching against precondition X
seq_putc(m, ' ');
seq_path(m, &mnt_path, " \t\n\\");// the real path of the mount point this device is mounted on
seq_putc(m, ' ');
show_type(m, mnt->mnt_sb);
seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
err = show_sb_opts(m, mnt->mnt_sb);
if (err)
goto out;
show_mnt_opts(m, mnt);
if (mnt->mnt_sb->s_op->show_options)
err = mnt->mnt_sb->s_op->show_options(m, mnt);
seq_puts(m, " 0 0\n");
out:
return err;
}
-----------------------------------------------
下面是实现过程中的一点参考和积累,仅供分享:
0 关于/etc/mtab,/etc/fstab,/proc/pid/mounts文件区别:
现在的 Linux 系统里一般都有这么三个文件:/etc/fstab,/etc/mtab,和 /proc/mounts,比较容易让人迷惑。简单解释一下。
/etc/fstab 是只读不写的,它提供的是系统上挂载设备的静态信息,比如 mount -a 就会挂载 /etc/fstab 里面指定的文件系统。
/etc/mtab 是供 mount/umount 进行读写的,是相对动态的。读的话,比如你在挂载一个文件系统时缺少一个参数,它就会自动去/etc/mtab 或者 /etc/fstab 里去查,如果找到的话,只要一个参数也够。写的话,比如你umount了一个文件系统,umount 就会删掉/etc/mtab 里面的相关记录。
看似上面的这两个文件已经够用了,但是新的情况出现了。Linux 内核引入了一个 mount namespace,是给container用的。因为这个的出现,Linux 不得不引入 /proc/mounts。为什么呢?因为记录 mount 信息的 /etc/mtab 是全局的,也就是说,就算你的某个进程有自己的 namespace,但只要还和外面共享同一个 /etc/mtab,那么,里面进行umount/mount操作的信息也会被记录到/etc/mtab里,外面也会看到!凌乱了!由此可见,我们不能有全局的mtab,肿么办呢?/proc/mounts 出来了,有人可能觉得它也是全局的啊!可你仔细看一下的话会发现,它其实是到 /proc/self/mounts 的一个符号链接!如此以来,container 里面的 /proc/mounts 和外面的当然就不会一样了!聪明啊!
所以,/etc/mtab 已经过时了,应该被抛弃,或者直接符号链接到/proc/mounts。同理,查看系统上挂载的文件系统的话,直接调用无参数的mount也是不妥的,因为那样也是读 /etc/mtab。我们应该使用 util-linux-ng 提供的一个新命令: findmnt,它是读的 /proc/self/mountinfo。
该部分内容参考文献:http://wangcong.org/blog/archives/1511,作者:王聪
1 用户态实现各个磁盘分区可用空间的计算非常简单
#include <stdio.h>
#include <mntent.h>
#include <string.h>
#include <sys/vfs.h>

/* Unit thresholds for human-readable sizes. */
static const unsigned long long G = 1024*1024*1024ull;
static const unsigned long long M = 1024*1024;
static const unsigned long long K = 1024;
/* Shared result buffer: kscale() returns a pointer into this static
 * storage, so each result must be copied out before the next call
 * (see the strcpy() calls in main()). Not reentrant. */
static char str[20];
/*
 * Format a size of @b blocks of @bs bytes each as a human-readable
 * string ("x.xx GB" / "MB" / "K" / "B").
 * Returns a pointer to the static buffer above.
 */
char* kscale(unsigned long b, unsigned long bs)
{
    unsigned long long size = b * (unsigned long long)bs;
    if (size > G)
    {
        sprintf(str, "%0.2f GB", size/(G*1.0));
        return str;
    }
    else if (size > M)
    {
        sprintf(str, "%0.2f MB", size/(1.0*M));
        return str;
    }
    else if (size > K)
    {
        sprintf(str, "%0.2f K", size/(1.0*K));
        return str;
    }
    else
    {
        sprintf(str, "%0.2f B", size*1.0);
        return str;
    }
}
/*
 * df(1)-style listing: walk /etc/mtab with getmntent(), statfs() each
 * mount point and print size / used / available plus a rounded usage
 * percentage.  Fixes over the pasted original: the "- " diff prefixes
 * are removed and every "/n" in the format strings is restored to the
 * intended "\n" newline escape, so the program compiles and prints
 * proper lines.
 */
int main(int argc, char *argv[])
{
    FILE *mount_table;
    struct mntent *mount_entry;
    struct statfs s;
    unsigned long blocks_used;
    unsigned blocks_percent_used;
    const char *disp_units_hdr = NULL;

    mount_table = setmntent("/etc/mtab", "r");
    if (!mount_table)
    {
        fprintf(stderr, "set mount entry error\n");
        return -1;
    }
    disp_units_hdr = " Size";
    printf("Filesystem %-15sUsed Available %s Mounted on\n",
           disp_units_hdr, "Use%");
    while (1) {
        const char *device;
        const char *mount_point;
        if (mount_table) {
            mount_entry = getmntent(mount_table);
            if (!mount_entry) {
                endmntent(mount_table);
                break;
            }
        }
        else
            continue;
        device = mount_entry->mnt_fsname;
        mount_point = mount_entry->mnt_dir;
        //fprintf(stderr, "mount info: device=%s mountpoint=%s\n", device, mount_point);
        if (statfs(mount_point, &s) != 0)
        {
            fprintf(stderr, "statfs failed!\n");
            continue;
        }
        if ((s.f_blocks > 0) || !mount_table)
        {
            blocks_used = s.f_blocks - s.f_bfree;
            blocks_percent_used = 0;
            if (blocks_used + s.f_bavail)
            {
                /* round-to-nearest percentage */
                blocks_percent_used = (blocks_used * 100ULL
                        + (blocks_used + s.f_bavail)/2
                        ) / (blocks_used + s.f_bavail);
            }
            /* GNU coreutils 6.10 skips certain mounts, try to be compatible. */
            if (strcmp(device, "rootfs") == 0)
                continue;
            /* "+ 1" skips the leading '\n' so the first column has no blank line;
             * devices wider than 20 chars push the columns to the next line. */
            if (printf("\n%-20s" + 1, device) > 20)
                printf("\n%-20s", "");
            char s1[20];
            char s2[20];
            char s3[20];
            /* kscale() returns a static buffer, so each result is copied out
             * before the next call. */
            strcpy(s1, kscale(s.f_blocks, s.f_bsize));
            strcpy(s2, kscale(s.f_blocks - s.f_bfree, s.f_bsize));
            strcpy(s3, kscale(s.f_bavail, s.f_bsize));
            printf(" %9s %9s %9s %3u%% %s\n",
                   s1,
                   s2,
                   s3,
                   blocks_percent_used, mount_point);
        }
    }
    return 0;
}
上述代码源于:http://blog.csdn.net/fjb2080/article/details/5990355,作者:非空静渡
实现效果如下
/tmp/tmp$ ./a.out
Filesystem Size Used Available Use% Mounted on
/dev/sda7 9.39 GB 6.45 GB 2.46 GB 72% /
none 1.59 GB 300.00 K 1.59 GB 0% /dev
none 1.60 GB 1020.00 K 1.59 GB 0% /dev/shm
none 1.60 GB 296.00 K 1.59 GB 0% /var/run
none 1.60 GB 0.00 B 1.60 GB 0% /var/lock
none 1.60 GB 0.00 B 1.60 GB 0% /lib/init/rw
/dev/sda6 112.62 GB 86.67 GB 25.95 GB 77% /media/work_
/dev/sda9 25.38 GB 22.10 GB 1.99 GB 92% /home
2 根据软连接文件获得真实文件的文件名
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/fs.h> // BDEVNAME_SIZE 32
/*
 * Demo: open a symlinked device path and print the resolved name.
 * filp_open() follows the symlink, so d_name is the real file name
 * (e.g. "sda1"), and i_nlink is its hard-link count.
 */
static int __init in(void)
{
	mm_segment_t oldFs;
	struct file* fp;

	/* Allow filp_open() to take a kernel-space path argument. */
	oldFs = get_fs();
	set_fs(get_ds());
	fp = filp_open("/dev/disk/by-uuid/8376060c-3840-4df2-9f2e-91a07bc5c2dd",O_RDONLY,0444);
	/* BUG fix: filp_open() returns an ERR_PTR on failure, never NULL,
	 * so "if (!fp)" could never detect the error. */
	if (IS_ERR(fp))
	{
		printk("device is not existed!\n");
		set_fs(oldFs); /* BUG fix: restore the address limit on the error path too */
		return 0;
	}
	printk("name:%s,nlink:%d\n",fp->f_dentry->d_name.name,fp->f_dentry->d_inode->i_nlink);
	filp_close(fp,NULL); /* BUG fix: the opened file was never closed */
	set_fs(oldFs);
	return 0;
}
/* Module teardown hook -- this demo module has nothing to release. */
static void __exit out(void)
{
}
module_init(in);
module_exit(out);
实现效果如下:
[13233.222709] name:sda1,nlink:1
根据软连接的特性,当通过filp_open打开软连接文件时,最终打开的是原文件,所以文件名为sda1,另外通过验证发现:
inode->nlink表示的是该文件硬链接的连接数目,对于软连接为1,如果有硬链接,计数递增;软连接由于在连接文件中保存了原始文件的绝对路径,所以可以跨越文件系统存在,也可以和目录连接;但是硬链接中两个文件都指向存储同一内容的同一个inode结构,由于inode局部于文件系统,所以不能跨文件系统,而且只能文件到文件。
关于软连接和硬链接,详细内容参考:http://blog.csdn.net/xiajun07061225/article/details/7163249,作者:江南烟雨
3 根据磁盘分区挂载点路径确定该区分使用率的内核模块
#include <linux/init.h>
#include <linux/mount.h>
#include <linux/unistd.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/fs.h> // BDEVNAME_SIZE 32
#include <linux/statfs.h>
#include <linux/err.h>
/*
 * Demo: open a mount point path, reach its superblock through
 * file->f_vfsmnt and call super_operations->statfs to print the
 * partition's block usage (the kernel-side equivalent of df).
 */
static int __init in(void)
{
	struct kstatfs statfs;
	mm_segment_t oldFs;
	struct super_block* sb;
	struct vfsmount *mnt;
	struct file* fp;

	/* Allow filp_open() to take a kernel-space path argument. */
	oldFs = get_fs();
	set_fs(get_ds());
	memset(&statfs,0,sizeof(statfs));
	fp = filp_open("/media/Kingston",O_RDONLY,0444);
	/* BUG fix: "IS_ERROR_VALUE(unsigned long fp)" was not valid C.
	 * filp_open() returns an ERR_PTR on failure, i.e. a pointer in
	 * [-MAX_ERRNO, -1] when cast to unsigned long; IS_ERR() tests that. */
	if (IS_ERR(fp))
	{
		printk("device is not existed!\n");
		set_fs(oldFs); /* BUG fix: restore the address limit on the error path too */
		return 0;
	}
	mnt = fp->f_vfsmnt;
	sb = mnt->mnt_sb;
	sb->s_op->statfs(sb->s_root,&statfs);
	printk("bsize:%lu,blocks:%llu,bfree:%llu,bavail:%llu,name:%s\n",statfs.f_bsize,statfs.f_blocks,statfs.f_bfree,statfs.f_bavail,mnt->mnt_devname);
	filp_close(fp,NULL); /* BUG fix: "file_close" is not a kernel API; filp_close() is */
	set_fs(oldFs);
	return 0;
}
/* Module teardown hook -- this demo module has nothing to release. */
static void __exit out(void)
{
}
module_init(in);
module_exit(out);
/*
when we add a 4G usb ,and mount it at mountpoint /media/Kingston,dmesg -c as follows:
[ 7965.941656] bsize:4096,blocks:990060,bfree:254523,bavail:254523,name:/dev/sdb1
when we get the father mountpoint info, information like this: the /dev/sda1 is 20G and mounted at /
[ 7814.157607] bsize:4096,blocks:4933597,bfree:4325228,bavail:4074615,name:/dev/disk/by-uuid/8376060c-3840-4df2-9f2e-91a07bc5c2dd
*/
4 整体实现
#include <linux/init.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/nsproxy.h> //struct nsproxy
#include <linux/mnt_namespace.h> // struct mnt_namespace,struct proc_mounts
#include <linux/path.h> //struct path
#include <linux/mount.h> //struct vfsmount
#include <linux/uaccess.h> //set_fs ,get_fs and so on
#include <linux/unistd.h>
#include <linux/statfs.h> //struct kstatfs
#include <linux/fs.h> //filp_open
#include <linux/path.h> //strut path,path_get,path_put
#define MAX_PATH_LEN 120
/*
 * Copy @name (@nameLen bytes) just before *bufEnd, moving *bufEnd and
 * *bufLen back by @nameLen -- the buffer is filled from the end toward
 * the beginning, as in the kernel's own d_path() helper.
 *
 * Returns 0 on success, -1 when the buffer is exhausted.  (BUG fix: the
 * original returned 0 in both cases, so callers could never detect
 * truncation.)  Once *bufLen goes negative every later call fails too,
 * matching the kernel prepend() behaviour.
 */
static int prepend(char** bufEnd,int* bufLen,const char* name,int nameLen)
{
	*bufLen -= nameLen;
	if(*bufLen < 0)
		return -1; /* out of room: report failure instead of silent success */
	*bufEnd -= nameLen;
	memcpy(*bufEnd,name,nameLen);
	return 0;
}
/*
 * Build the absolute path of the mount point described by @path,
 * relative to @root, into @buffer.
 * @buffer: destination; the path is assembled from the END toward the
 *          beginning, component by component.
 * @buflen: free space remaining in @buffer.
 * Returns a pointer INTO @buffer at the start of the finished path
 * (not necessarily @buffer itself).
 * This mirrors the kernel's __d_path() walk: climb d_parent links
 * inside a filesystem, and hop from a mount's root dentry to
 * mnt_mountpoint/mnt_parent to cross mount boundaries.
 */
static char* get_mountPointPath(const struct path *path, struct path *root,
char *buffer, int buflen)
{
struct dentry* dentry = path->dentry;
struct vfsmount* vfsmnt = path->mnt;
char* end = buffer + buflen;
char* retval;
struct dentry* parent;
prepend(&end, &buflen, "\0", 1); /* NUL terminator goes in first */
retval = end - 1;
*retval = '/'; /* so a bare root resolves to "/" */
for(;;)
{
//printk("[name-----------:dentry:%s,new fs root:%s,father fs last dir:%s]\n",dentry->d_name.name,vfsmnt->mnt_root->d_name.name,vfsmnt->mnt_mountpoint->d_name.name);
if(dentry == root->dentry && vfsmnt == root->mnt) //at /
break;
if(dentry == vfsmnt->mnt_root || dentry == dentry->d_parent) //at father mountpoint
{
if(vfsmnt->mnt_parent == vfsmnt) //global_root
{
retval +=1;
prepend(&end,&buflen,dentry->d_name.name,dentry->d_name.len);
root->mnt = vfsmnt;
root->dentry = dentry;
break;
}
/* crossed a mount boundary: continue from where this mount hangs */
dentry = vfsmnt->mnt_mountpoint;
vfsmnt = vfsmnt->mnt_parent;
continue;
}
parent = dentry->d_parent;
prepend(&end, &buflen, dentry->d_name.name,dentry->d_name.len);
prepend(&end, &buflen, "/",1);
retval = end;
dentry = parent;
}
return retval;
}
static char pairArray[MAX_PATH_LEN*10]; /* concatenated "<device,mountpoint>" records */
/*
 * Build pairArray as a concatenation of "<realDevicePath,mountPointPath>"
 * records, one per vfsmount in the current task's mount namespace
 * (mnt_namespace->list -- the same list /proc/self/mounts iterates; see
 * mounts_open_common()).  Device names from vfsmount->mnt_devname may be
 * symlinks (e.g. /dev/disk/by-uuid/...), so each one is opened with
 * filp_open(), which resolves the link, and the real dentry name is used.
 */
void getNamePair(void)
{
	struct nsproxy* nsp;
	struct mnt_namespace* ns;
	struct list_head* head;
	struct list_head* iter;
	struct vfsmount* mnt;
	char deviceFilePath[MAX_PATH_LEN];
	char mountPointPath[MAX_PATH_LEN];
	char pair[2*MAX_PATH_LEN];
	mm_segment_t oldFs;
	struct file* fp;
	struct path root;
	struct path tmp;
	char * src,*dst;

	nsp = current->nsproxy;
	ns = nsp->mnt_ns;
	oldFs = get_fs();
	set_fs(get_ds()); /* filp_open() below takes kernel-space paths */
	head = &ns->list;
	memset(pairArray,0,sizeof(pairArray));
	for (iter=head->next;iter != head; iter=iter->next)
	{
		mnt = list_entry(iter,struct vfsmount,mnt_list);
		if( !mnt || !mnt->mnt_devname)
			continue;
		/* 1: resolve the (possibly symlinked) device name to the real file name */
		snprintf(deviceFilePath,MAX_PATH_LEN,"%s",mnt->mnt_devname);
		fp = filp_open(deviceFilePath,O_RDONLY,0444);
		if(IS_ERR_VALUE((unsigned long)fp))
			continue;
		snprintf(deviceFilePath,MAX_PATH_LEN,"%s",fp->f_dentry->d_name.name);
		filp_close(fp,NULL);
		/* 2: rebuild the mount point path relative to the task's fs root */
		root = current->fs->root;
		tmp.dentry = mnt->mnt_root;
		tmp.mnt = mnt;
		path_get(&root);
		dst = get_mountPointPath(&tmp,&root,mountPointPath,MAX_PATH_LEN-1);
		/* shift the path to the front of the buffer; src <= dst, so a
		 * forward copy over the overlap is safe */
		src = mountPointPath;
		while(*dst)
			*src++ = *dst++;
		*src = '\0';
		path_put(&root);
		/* 3: append one "<device,mountpoint>" record */
		snprintf(pair,sizeof(pair),"<%s,%s>",deviceFilePath,mountPointPath);
		/* BUG fix: the unconditional strcat() could overflow pairArray once
		 * enough mounts were present; skip records that no longer fit. */
		if (strlen(pairArray) + strlen(pair) < sizeof(pairArray))
			strcat(pairArray,pair);
	}
	set_fs(oldFs);
	printk("total pairArray:%s\n",pairArray);
}
static int __init in(void)
{
char* mountPointPtr;
char* mountPointEnd;
char name[MAX_PATH_LEN];
struct kstatfs statfs;
mm_segment_t oldFs;
struct super_block* sb;
struct vfsmount* mnt;
struct file* fp;
getNamePair();
oldFs = get_fs();
set_fs(get_ds());
snprintf(name,MAX_PATH_LEN,"%s","sdb1");
if( (mountPointPtr = strstr(pairArray,name)) )
{
mountPointPtr += strlen(name) + 1;
mountPointEnd = strstr(mountPointPtr,">") -1;
memcpy(name,mountPointPtr,mountPointEnd-mountPointPtr);
name[mountPointEnd-mountPointPtr] = '\0';
fp = filp_open(name,O_RDONLY,0444);
if(IS_ERROR_VALUE(unsigned long fp))
{
return 0;
}
mnt = fp->f_vfsmnt;
sb = mnt->mnt_sb;
sb->s_op->statfs(sb->s_root,&statfs);
printk("bsize:%lu,blocks:%llu,bfree:%llu,bavail:%llu,name:%s\n",statfs.f_bsize,statfs.f_blocks,statfs.f_bfree,statfs.f_bavail,mnt->mnt_devname);
filp_close(fp,NULL);
}
snprintf(name,MAX_PATH_LEN,"%s","sda1");
if( (mountPointPtr = strstr(pairArray,name)) )
{
mountPointPtr += strlen(name) + 1;
mountPointEnd = strstr(mountPointPtr,">") -1;
memcpy(name,mountPointPtr,mountPointEnd-mountPointPtr);
name[mountPointEnd-mountPointPtr] = '\0';
fp = filp_open(name,O_RDONLY,0444);
if(IS_ERROR_VALUE(unsigned long fp))
{
return 0;
}
mnt = fp->f_vfsmnt;
sb = mnt->mnt_sb;
sb->s_op->statfs(sb->s_root,&statfs);
printk("bsize:%lu,blocks:%llu,bfree:%llu,bavail:%llu,name:%s\n",statfs.f_bsize,statfs.f_blocks,statfs.f_bfree,statfs.f_bavail,mnt->mnt_devname);
filp_close(fp,NULL);
}
return 0;
}
/* Module teardown hook -- this demo module has nothing to release. */
static void __exit out(void)
{
}
module_init(in);
module_exit(out);
结果输出:
[ 7965.941656] bsize:4096,blocks:990060,bfree:254523,bavail:254523,name:/dev/sdb1
[ 7814.157607] bsize:4096,blocks:4933597,bfree:4325228,bavail:4074615,name:/dev/disk/by-uuid/8376060c-3840-4df2-9f2e-91a07bc5c2dd