从根文件系统制作看loop设备
一、引出
在Linux系统下,通过 dd + losetup + mkfs 来制作根文件系统已经是一种常规方法。由于这个文件系统是借助loop设备在一个普通文件上创建的,所以loop设备实际上是把一个文件看做一个块设备。这是一个相对比较高难度的适配,因为上层使用的是块设备驱动的接口,而底层最终要落到对一个文件的操作上,此事还是有一些挑战性的。
二、实现
1、内核中对loop设备的注册
linux-2.6.21\drivers\block\loop.c
关于loop设备
linux-2.6.21\drivers\block\loop.c:loop_init
if (register_blkdev(LOOP_MAJOR, "loop"))
return -EIO;
#define LOOP_SET_FD 0x4C00
#define LOOP_SET_STATUS64 0x4C04
这里注册的是loop这一类块设备,也就是为它登记了自己的主设备号(LOOP_MAJOR,即7)。我们从内核的文档中可以知道,loop有自己的设备号。在内核说明文档linux-2.6.21\Documentation\devices.txt中有如下说明
  7 block       Loopback devices
                  0 = /dev/loop0        First loop device
                  1 = /dev/loop1        Second loop device
                    ...
The loop devices are used to mount filesystems not
associated with block devices. The binding to the
loop devices is handled by mount(8) or losetup(8).
作为一种虚拟设备,内核将会一次性在内核中分配指定个数的虚拟磁盘,这个参数可以在系统启动的时候通过启动参数max_loop进行设置。然后在loop的初始化函数中创建这样指定个数的磁盘,这些就是我们用户态可以识别的磁盘个数。
loop_dev = kmalloc(max_loop * sizeof(struct loop_device), GFP_KERNEL);
if (!loop_dev)
goto out_mem1;
memset(loop_dev, 0, max_loop * sizeof(struct loop_device));
disks = kmalloc(max_loop * sizeof(struct gendisk *), GFP_KERNEL);
if (!disks)
goto out_mem2;
for (i = 0; i < max_loop; i++) {
disks[i] = alloc_disk(1);
if (!disks[i])
goto out_mem3;
}
2、磁盘操作实体
上面只是分配的一个磁盘,注意,分配的不是设备描述符,而直接就是磁盘,也就是创建的是一个虚拟设备。这个磁盘其实本身并没有做任何实质性的操作,因为从执行的函数可以看到,它大部分情况下使用的都是通用而非定制接口,所以分配的磁盘也就是通用的磁盘,这个没有加上自己定制的接口,那么一定无法完成这个设备的虚拟。
这个虚拟磁盘可以认为是一个中转,也就是“铁打的营盘流水的兵”。这个磁盘虽然是创建了,但是它并没有实际的内容,并且即使它有了实际的内容,这些内容在通常情况下也会变化,例如在制作img之后马上被卸载。
这个设备真正的点睛之笔是通过ioctl中的LOOP_SET_FD操作来完成的。这是一个比较另类的做法,也就是后备文件并不是通过通常的open之类的接口绑定到设备上,而是通过对设备文件的ioctl来完成,所以还是比较独特的。
static int lo_ioctl(struct inode * inode, struct file * file,
unsigned int cmd, unsigned long arg)
case LOOP_SET_FD:
err = loop_set_fd(lo, file, inode->i_bdev, arg);
/*
* set queue make_request_fn, and add limits based on lower level
* device
*/
blk_queue_make_request(lo->lo_queue, loop_make_request);
lo->lo_queue->queuedata = lo;
lo->lo_queue->unplug_fn = loop_unplug;
set_capacity(disks[lo->lo_number], size);
bd_set_size(bdev, size << 9);
set_blocksize(bdev, lo_blocksize);
lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
lo->lo_number);
这里就有两个比较关键的操作,其中第一个最为关键,就是设置一个磁盘特有的make_request接口。这也就是说,每个不同的disk都可以定义自己的make_request,而其它模块希望操作这个具体磁盘的时候,只要向这个队列发送请求原语就可以了,至于具体怎么实现,那就是具体磁盘自己的事情了。这样就很好地解耦了系统中的不同模块,这也就是接口单一的原则。这个接口事实上是作为一个block设备对其它模块做的承诺,这样它才像是一个block设备。
然后第二个就是一个不太必须但是比较特别的东西,就是在每个设备设置了文件描述符之后,都会创建一个对应的内核线程,由这个线程来完成操作。
这个内核线程我们在losetup之后可以看到内核中的确是有这个线程的。
例如,在挂载了loop文件之后,我们可以看到系统中有一个名为loop4的内核线程
tsecer 18014 0.0 6.4 77968 66284 pts/1 S+ 07:02 0:00 gdb vmlinux
root 27397 0.0 0.0 0 0 ? S< 23:10 0:00 [loop4]
root 27580 0.0 0.1 55048 2000 ? S 23:45 0:00 /usr/libexec/fp
tsecer 27584 0.0 0.0 4692 992 pts/0 R+ 23:45 0:00 ps aux
而其实这个loop_make_request的接口也比较简单,它就是把具体的设备操作转换为真正的文件对应的操作接口。例如write转换为后备文件(backing file)的write接口,从而完成中间层的转发。
3、用户态线程的操作
util-linux-ng-2.16.2\mount\lomount.c
#define _PATH_DEV_LOOP "/dev/loop"
#define NLOOPS_DEFAULT 8 /* /dev/loop[0-7] */
static int
looplist_open_dev(struct looplist *ll, int lnum)
{
struct stat st;
int used;
int fd;
/* create a full device path */
snprintf(ll->name, sizeof(ll->name),
ll->flag & LLFLG_SUBDIR ?
_PATH_DEV_LOOP "/%d" :
_PATH_DEV "loop%d",
lnum);
looplist_next(struct looplist *ll)
/* B) Classic way, try first eight loop devices (default number
* of loop devices). This is enough for 99% of all cases.
*/
if (ll->flag & LLFLG_DFLT) {
for (++ll->ncur; ll->ncur < NLOOPS_DEFAULT; ll->ncur++) {
fd = looplist_open_dev(ll, ll->ncur);
if (fd != -1)
return fd;
}
ll->flag &= ~LLFLG_DFLT;
}
然后执行
res = set_loop(device, file, off, slimit, encryption, pfd, &ro);--->>>
if (ioctl(fd, LOOP_SET_FD, ffd) < 0) {
从而向内核注册了对应的文件。
4、strace输出
[tsecer@Harry linux-2.6.21]$ su -c "strace mount -t ext2 -o loop /home/tsecer/KernelDebug/helloword.c /dev/"
Password:
execve("/bin/mount", ["mount", "-t", "ext2", "-o", "loop", "/home/tsecer/KernelDebug/hellowo"..., "/dev/"], [/* 44 vars */]) = 0
brk(0) = 0x24c7000
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb7842000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY) = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=85504, ...}) = 0
mmap2(NULL, 85504, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb782d000
close(3) = 0
open("/lib/libblkid.so.1", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\20$}\0004\0\0\0"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=85520, ...}) = 0
mmap2(NULL, 86652, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0xfb2000
mmap2(0xfc6000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x13) = 0xfc6000
close(3) = 0
open("/lib/libuuid.so.1", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0`.\306\0004\0\0\0"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=16112, ...}) = 0
mmap2(NULL, 17072, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0xdff000
mmap2(0xe03000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3) = 0xe03000
close(3) = 0
open("/lib/libselinux.so.1", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0`\201?\0004\0\0\0"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=118316, ...}) = 0
mmap2(NULL, 121848, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x56d000
mmap2(0x589000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1b) = 0x589000
close(3) = 0
open("/lib/libsepol.so.1", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\340\257V\0004\0\0\0"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=242288, ...}) = 0
mmap2(NULL, 244992, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x9c9000
mmap2(0xa04000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x3a) = 0xa04000
close(3) = 0
open("/lib/libc.so.6", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\3\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\20\r\"\0004\0\0\0"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=1831904, ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb782c000
mmap2(NULL, 1542504, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x110000
mprotect(0x282000, 4096, PROT_NONE) = 0
mmap2(0x283000, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x172) = 0x283000
mmap2(0x286000, 10600, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x286000
close(3) = 0
open("/lib/libdl.so.2", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\3\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0`Z8\0004\0\0\0"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=20480, ...}) = 0
mmap2(NULL, 16500, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0xcf8000
mmap2(0xcfb000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2) = 0xcfb000
close(3) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb782b000
set_thread_area({entry_number:-1 -> 6, base_addr:0xb782b750, limit:1048575, seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0, useable:1}) = 0
mprotect(0xcfb000, 4096, PROT_READ) = 0
mprotect(0x283000, 8192, PROT_READ) = 0
mprotect(0x589000, 4096, PROT_READ) = 0
mprotect(0x9af000, 4096, PROT_READ) = 0
munmap(0xb782d000, 85504) = 0
statfs64("/selinux", 84, {f_type=0xf97cff8c, f_bsize=4096, f_blocks=0, f_bfree=0, f_bavail=0, f_files=0, f_ffree=0, f_fsid={0, 0}, f_namelen=255, f_frsize=4096}) = 0
brk(0) = 0x24c7000
brk(0x24e8000) = 0x24e8000
open("/usr/lib/locale/locale-archive", O_RDONLY|O_LARGEFILE) = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=98765760, ...}) = 0
mmap2(NULL, 2097152, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb762b000
close(3) = 0
umask(022) = 02
open("/dev/null", O_RDWR|O_LARGEFILE) = 3
close(3) = 0
getuid32() = 0
geteuid32() = 0
readlink("/home", 0xbff60c5b, 4096) = -1 EINVAL (Invalid argument)
readlink("/home/tsecer", 0xbff60c5b, 4096) = -1 EINVAL (Invalid argument)
readlink("/home/tsecer/KernelDebug", 0xbff60c5b, 4096) = -1 EINVAL (Invalid argument)
readlink("/home/tsecer/KernelDebug/helloword.c", 0xbff60c5b, 4096) = -1 EINVAL (Invalid argument)
readlink("/dev", 0xbff60adb, 4096) = -1 EINVAL (Invalid argument)
umask(077) = 022
open("/etc/mtab", O_RDONLY|O_LARGEFILE) = 3
umask(022) = 077
fstat64(3, {st_mode=S_IFREG|0644, st_size=470, ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb7841000
read(3, "/dev/mapper/vg_harry-lv_root / e"..., 4096) = 470
read(3, "", 4096) = 0
close(3) = 0
munmap(0xb7841000, 4096) = 0
stat64("/dev/", {st_mode=S_IFDIR|0755, st_size=4040, ...}) = 0
stat64("/dev/loop", 0xbff619d0) = -1 ENOENT (No such file or directory)
open("/dev/loop0", O_RDONLY|O_LARGEFILE) = 3
fstat64(3, {st_mode=S_IFBLK|0660, st_rdev=makedev(7, 0), ...}) = 0
ioctl(3, 0x4c03, 0xbff5f8b0) = -1 ENXIO (No such device or address)
close(3) = 0
open("/home/tsecer/KernelDebug/helloword.c", O_RDWR|O_LARGEFILE) = 3
open("/dev/loop0", O_RDWR|O_LARGEFILE) = 4
readlink("/home", 0xbff608cb, 4096) = -1 EINVAL (Invalid argument)
readlink("/home/tsecer", 0xbff608cb, 4096) = -1 EINVAL (Invalid argument)
readlink("/home/tsecer/KernelDebug", 0xbff608cb, 4096) = -1 EINVAL (Invalid argument)
readlink("/home/tsecer/KernelDebug/helloword.c", 0xbff608cb, 4096) = -1 EINVAL (Invalid argument)
ioctl(4, 0x4c00, 0x3) = 0
close(3) = 0
ioctl(4, 0x4c04, 0xbff61988) = 0
ioctl(4, 0x4c05, 0xbff61788) = 0
stat64("/sbin/mount.ext2", 0xbff619f8) = -1 ENOENT (No such file or directory)
rt_sigprocmask(SIG_BLOCK, ~[TRAP SEGV RTMIN RT_1], NULL, 8) = 0
stat64("/sbin/mount.ext2", 0xbff619b8) = -1 ENOENT (No such file or directory)
这里进行了真正的mount操作,也就是在这里进行了mount系统调用,而之前已经通过对设备文件的ioctl,设置了回环文件使用的真正文件的位置。也就是当执行mount之前,用户态的mount已经进行了loop文件的打开和设置,然后在mount中就作为一个普通的设备文件进行操作了。
mount("/dev/loop0", "/dev/", "ext2", MS_MGC_VAL, NULL) = -1 EINVAL (Invalid argument)
rt_sigprocmask(SIG_UNBLOCK, ~[TRAP SEGV RTMIN RT_1], NULL, 8) = 0
open("/usr/share/locale/locale.alias", O_RDONLY) = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=2512, ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb7841000
read(3, "# Locale name alias data base.\n#"..., 4096) = 2512
read(3, "", 4096) = 0
close(3) = 0
munmap(0xb7841000, 4096) = 0
open("/usr/share/locale/en_US.UTF-8/LC_MESSAGES/util-linux-ng.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US.utf8/LC_MESSAGES/util-linux-ng.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en_US/LC_MESSAGES/util-linux-ng.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.UTF-8/LC_MESSAGES/util-linux-ng.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en.utf8/LC_MESSAGES/util-linux-ng.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
open("/usr/share/locale/en/LC_MESSAGES/util-linux-ng.mo", O_RDONLY) = -1 ENOENT (No such file or directory)
write(2, "mount: wrong fs type, bad option"..., 121mount: wrong fs type, bad option, bad superblock on /dev/loop0,
missing codepage or helper program, or other error) = 121
write(2, "\n", 1
) = 1
stat64("/dev/loop0", {st_mode=S_IFBLK|0660, st_rdev=makedev(7, 0), ...}) = 0
open("/dev/loop0", O_RDONLY|O_NONBLOCK|O_LARGEFILE) = 3
uname({sys="Linux", node="Harry", ...}) = 0
ioctl(3, BLKGETSIZE64, 0xbff61cb8) = 0
write(2, " (could this be the IDE de"..., 111 (could this be the IDE device where you in fact use
ide-scsi so that sr0 or sda or so is needed?)) = 111
write(2, "\n", 1
) = 1
close(3) = 0
write(2, " In some cases useful info"..., 85 In some cases useful info is found in syslog - try
dmesg | tail or so
) = 85
write(2, "\n", 1
) = 1
exit_group(32) = ?
[tsecer@Harry linux-2.6.21]$