1 题面
编写类似cp(1)的程序,它复制包含空洞的文件,但是不将字节0写到输出文件中去。
2 基本思路
- 首先要搞清楚空洞的性质以判断一个文件是否有空洞,以及空洞的位置
- 知道了空洞的位置之后,读到源文件中的空洞部分时,在目标文件中lseek相应的长度
3 创建空洞文件,同时探索空洞性质
交替lseek
和write
,逐渐增大间隔长度。比较文件的大小和实际占用的block数目
- 测试源码
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
int holesize[]={1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32*1024};
int filesize = 64*1024;
int main()
{
int i = 0;
int count = 0;
int ret = 0, fd = 0;
char filename[32]={0};
unsigned char buf[32*1024]={0};
memset(buf, 1, 32*1024);
for (; i< sizeof(holesize)/ sizeof(int); ++i) {
count = 0;
memset(filename, 0, 32);
sprintf(filename, "%s%d", "holesize", holesize[i]);
fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if(fd < 0) {
printf("open file fail\n");
return -1;
}
while(count < filesize) {
ret = lseek(fd, holesize[i], SEEK_CUR);
if(ret < 0) {
printf("lseek fail\n");
return -1;
}
int remain = holesize[i];
while(remain) {
ret = write(fd, buf, remain);
if(ret < 0 ) {
perror("write fail\n");
return -1;
}
remain -= ret;
}
count += holesize[i] * 2;
}
close(fd);
}
return 0;
}
- MAC OSX 10.1.4.6测试结果
^_^$ ll -s
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize1
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize1024
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize128
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize16
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize16384
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize2
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize2048
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize256
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize32
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize32768
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize4
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize4096
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize512
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize64
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize8
128 -rw-r--r-- 1 chenzf staff 65536 12 28 20:08 holesize8192
Mac OSX上创建不了空洞文件,因为默认的文件系统是HFS +,不支持稀疏文件
- Ubuntu18 4.15.0-60-generic测试结果
^_^$ ll -s
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize1
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize1024
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize128
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize16
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize16384
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize2
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize2048
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize256
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize32
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize32768
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize4
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize4096
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize512
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize64
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize8
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize8192
4KB以上才实际创建空洞。
因为在linux的文件系统中,磁盘分配的最小物理单元为簇。(即使文件大小不足以占用满一簇,该簇空余的磁盘存储仍旧是该文件的)
所以可以根据这个性质,判断文件是否是空洞文件。有空洞的文件,用文件大小计算的block数至少比实际占用的block数大1个簇的block数
如何可移植地获取簇的大小
pagesize = sysconf(_SC_PAGESIZE);
初步实现功能
- 源码
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/errno.h>
int my_cp(const char *from, const char *to)
{
int fd1 = -1, fd2 = -1;
int rev = -1;
unsigned char *buffer = NULL;
unsigned char *start_pos = NULL;
long pagesize = 0;
long long blocks, blksize, size;
int read_num, write_num, remain_num, current_pos = 0, last_zero = -1, last_nonzero = -1, have_holes = 0;
struct stat st;
fd1 = open(from, O_RDONLY);
if(-1 == fd1){
perror("open file1 faild");
goto err;
}
if(fstat(fd1, &st) !=0) {
perror("fstat: ");
goto err;
}
else{
#ifdef _SC_PAGESIZE
pagesize = sysconf(_SC_PAGESIZE);
if (pagesize < 0) {
if (errno != 0) {
if (errno == EINVAL) {
fputs(" (not supported)\n", stdout);
pagesize = st.st_blksize;
}
else {
perror("sysconf error");
goto err;
}
} else {
fputs(" (no limit)\n", stdout);
pagesize = st.st_blksize;
}
}
printf("pagesize: %ld\n", pagesize);
#else
pagesize = st.st_blksize;
#endif
blocks = st.st_blocks;
blksize = st.st_blksize;
size = st.st_size;
printf("st.st_blocks: %lld\n", blocks);
printf("st.st_blksize: %lld\n", blksize);
printf("st.st_size: %lld\n", size);
/*块大小512,在不同平台上可能不兼容*/
if(S_ISREG(st.st_mode) && (size / pagesize + (size%pagesize?1:0)) * pagesize > 512 * blocks) {
have_holes = 1;
printf("%s is a sparse-block file!\n", from);
} else{
have_holes = 0;
printf("%s is not a sparse-block file!\n", from);
}
}
fd2 = open(to, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if ( -1 == fd2) {
perror ("open file2 faild");
goto err;
}
buffer = malloc(pagesize);
if(buffer == NULL) {
perror ("malloc fail");
goto err;
}
memset(buffer, '\0', pagesize);
while((read_num = read(fd1, buffer, pagesize)) > 0) {
/* 源文件有空洞 */
if(have_holes){
last_zero = -1;
last_nonzero = -1;
for(current_pos = 0; current_pos < read_num; current_pos++){
/* 逐字节判断,效率较低*/
if(buffer[current_pos] == 0){
if(last_nonzero > last_zero){
remain_num = last_nonzero - last_zero;
start_pos = buffer + last_zero + 1;
while(remain_num){
write_num = write(fd2, start_pos, remain_num);
if ( -1 == write_num){
perror( "write file2 error");
goto err;
}
remain_num -= write_num;
start_pos += write_num;
}
}
last_zero = current_pos;
}
else{
if(last_zero > last_nonzero){
remain_num = last_zero - last_nonzero;
if(-1 == lseek(fd2, remain_num, SEEK_CUR)){
perror("lseek file2 fail");
goto err;
}
}
last_nonzero = current_pos;
}
}
/* 处理最后剩余数据*/
remain_num = (last_nonzero > last_zero)?(last_nonzero - last_zero):(last_zero - last_nonzero);
start_pos = buffer + current_pos - remain_num;
if(last_nonzero > last_zero){
while(remain_num){
write_num = write(fd2, start_pos, remain_num);
if ( -1 == write_num){
perror( "write file2 error");
goto err;
}
remain_num -= write_num;
start_pos += write_num;
}
}
else{
if(-1 == lseek(fd2, remain_num, SEEK_CUR)){
perror("lseek file2 fail");
goto err;
}
}
}
/* 源文件无空洞 */
else {
remain_num = read_num;
start_pos = buffer;
while(remain_num){
write_num = write(fd2, start_pos, remain_num);
if ( -1 == write_num){
perror( "write file2 error");
goto err;
}
remain_num -= write_num;
start_pos += write_num;
}
}
}
if(-1 == read_num) {
perror("read file1 error");
goto err;
}
rev = 0;
err:
if(buffer) free(buffer);
close(fd1);
close(fd2);
return rev;
}
int main(int argc, char *argv[])
{
if(argc < 3) {
printf("Usage: %s file1 file2\n", argv[0]);
return -1;
}
my_cp(argv[1], argv[2]);
return 0;
}
- 测试结果
^_^$ ./my_cp holesize2048 holesize2048.cp
pagesize: 4096
st.st_blocks: 128
st.st_blksize: 4096
st.st_size: 65536
holesize2048 is not a sparse-block file!
chen@ubuntu18:~/study/apue.3e/exercises/4
^_^$ ./my_cp holesize4096 holesize4096.cp
pagesize: 4096
st.st_blocks: 72
st.st_blksize: 4096
st.st_size: 65536
holesize4096 is a sparse-block file!
^_^$ ll -s
total 1708
64 -rw-r--r-- 1 chen chen 65536 1月 6 17:27 holesize2048
64 -rw-r--r-- 1 chen chen 65536 1月 6 17:27 holesize2048.cp
36 -rw-r--r-- 1 chen chen 65536 1月 6 17:27 holesize4096
32 -rw-r--r-- 1 chen chen 65536 1月 6 17:27 holesize4096.cp
空洞文件可以正常拷贝
尝试优化程序
上面的程序仅在判断文件是否含有空洞时利用的空洞的最小限制。而在实际读写时并没有利用该性质。
这样较短的0字节也会当成是空洞,导致系统调用次数的增加,性能的降低
要优化性能,必须进一步探究空洞的性质。在什么样的情况下才创建空洞(不实际占用磁盘空间的块)?
- 测试程序源码
此程序创建了3个文件:
- 文件1先`write`了1K的非零数据,然后`lseek` 7K-1字节。循环2次。
- 文件2先`write`了1K的非零数据,然后`lseek` 7K字节。循环2次
- 文件3先`write`了1K的非零数据,然后`lseek` 7K+1字节。循环2次
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
int holesize[]={4096};
int filesize = 64*1024;
int main()
{
int i = 0;
int count = 0;
int ret = 0, fd1 = 0, fd2 = 0, fd3 = 0;
char filename1[32]={0};
char filename2[32]={0};
char filename3[32]={0};
unsigned char buf[32*1024]={0};
memset(buf, 1, 32*1024);
for (; i< sizeof(holesize)/ sizeof(int); ++i) {
count = 0;
memset(filename1, 0, 32);
memset(filename2, 0, 32);
memset(filename3, 0, 32);
sprintf(filename1, "%s%d-1", "holesize", holesize[i]);
sprintf(filename2, "%s%d-2", "holesize", holesize[i]);
sprintf(filename3, "%s%d-3", "holesize", holesize[i]);
fd1 = open(filename1, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
fd2 = open(filename2, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
fd3 = open(filename3, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if(fd1 < 0 || fd2 < 0 || fd3 < 0) {
printf("open file fail\n");
return -1;
}
count = 0;
while(count < 2) {
int remain = holesize[i] * 1 / 4;
while(remain) {
ret = write(fd1, buf, remain);
if(ret < 0 ) {
perror("write fail\n");
return -1;
}
remain -= ret;
}
ret = lseek(fd1, holesize[i] * 7 / 4 - 1, SEEK_CUR);
if(ret < 0) {
printf("lseek fail\n");
return -1;
}
++count;
}
count = 0;
while(count < 2) {
int remain = holesize[i] * 1 / 4;
while(remain) {
ret = write(fd2, buf, remain);
if(ret < 0 ) {
perror("write fail\n");
return -1;
}
remain -= ret;
}
ret = lseek(fd2, holesize[i] * 7 / 4, SEEK_CUR);
if(ret < 0) {
printf("lseek fail\n");
return -1;
}
++count;
}
count = 0;
while(count < 2) {
int remain = holesize[i] * 1 / 4;
while(remain) {
ret = write(fd3, buf, remain);
if(ret < 0 ) {
perror("write fail\n");
return -1;
}
remain -= ret;
}
ret = lseek(fd3, holesize[i] * 7 / 4 + 1, SEEK_CUR);
if(ret < 0) {
printf("lseek fail\n");
return -1;
}
++count;
}
close(fd1);
close(fd2);
close(fd3);
}
return 0;
}
- 测试结果
^_^$ ll -s
12 -rw-r--r-- 1 chen chen 9215 1月 6 15:07 holesize4096-1
8 -rw-r--r-- 1 chen chen 9216 1月 6 15:07 holesize4096-2
8 -rw-r--r-- 1 chen chen 9217 1月 6 15:07 holesize4096-3
可见空洞必须从一页的起始位置开始计算,并且等于或超过pagesize,才不占用实际磁盘空间
优化后程序
- 源码
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/errno.h>
ssize_t read_ex(int fd, void *buf, size_t nbyte){
size_t read_remain = nbyte;
unsigned char *read_start = (unsigned char*)buf;
ssize_t read_num = -1;
ssize_t total_num = 0;
while(read_remain) {
read_num = read(fd, read_start, read_remain);
if(-1 == read_num){
return -1;
}
else if(0 == read_num){
break;
}
else{
read_remain -= read_num;
read_start += read_num;
total_num += read_num;
}
}
return total_num;
}
ssize_t write_ex(int fd, const void *buf, size_t nbyte){
size_t write_remain = nbyte;
unsigned char *write_start = (unsigned char*)buf;
ssize_t write_num = -1;
ssize_t total_num = 0;
while(write_remain) {
write_num = write(fd, write_start, write_remain);
if(-1 == write_num){
return -1;
}
else{
write_remain -= write_num;
write_start += write_num;
total_num += write_num;
}
}
return total_num;
}
int my_cp(const char *from, const char *to)
{
int fd1 = -1, fd2 = -1;
int rev = -1;
unsigned char *buffer = NULL, *buffer_zero = NULL;
long pagesize = 0;
long long blocks, blksize, size;
int read_num, write_num, write_remain, have_holes = 0;
struct stat st;
fd1 = open(from, O_RDONLY);
if(-1 == fd1){
perror("open file1 faild");
goto err;
}
if(fstat(fd1, &st) !=0) {
perror("fstat: ");
goto err;
}
else{
#ifdef _SC_PAGESIZE
pagesize = sysconf(_SC_PAGESIZE);
if (pagesize < 0) {
if (errno != 0) {
if (errno == EINVAL) {
fputs(" (not supported)\n", stdout);
pagesize = st.st_blksize;
}
else {
perror("sysconf error");
goto err;
}
} else {
fputs(" (no limit)\n", stdout);
pagesize = st.st_blksize;
}
}
printf("pagesize: %ld\n", pagesize);
#else
pagesize = st.st_blksize;
#endif
blocks = st.st_blocks;
blksize = st.st_blksize;
size = st.st_size;
printf("st.st_blocks: %lld\n", blocks);
printf("st.st_blksize: %lld\n", blksize);
printf("st.st_size: %lld\n", size);
/*块大小512,在不同平台上可能不兼容*/
if(S_ISREG(st.st_mode) && (size / pagesize + (size%pagesize?1:0)) * pagesize > 512 * blocks) {
have_holes = 1;
printf("%s is a sparse-block file!\n", from);
} else{
have_holes = 0;
printf("%s is not a sparse-block file!\n", from);
}
}
buffer = malloc(pagesize);
buffer_zero = malloc(pagesize);
if(buffer == NULL || buffer_zero == NULL) {
perror ("malloc fail");
goto err;
}
memset(buffer, '\0', pagesize);
memset(buffer_zero, '\0', pagesize);
fd2 = open(to, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
if (-1 == fd2) {
perror ("open file2 faild");
goto err;
}
while((read_num = read_ex(fd1, buffer, pagesize)) > 0) {
/* 读取到空洞 */
if(have_holes && !memcmp(buffer_zero, buffer, read_num)){
if(-1 == lseek(fd2, read_num, SEEK_CUR)){
perror("lseek file2 fail");
goto err;
}
}
/* 非空洞 */
else{
write_num = write_ex(fd2, buffer, read_num);
if (-1 == write_num){
perror( "write file2 error");
goto err;
}
}
}
if(-1 == read_num){
perror("read file1 error");
goto err;
}
rev = 0;
err:
if(buffer) free(buffer);
if(buffer_zero) free(buffer_zero);
close(fd1);
close(fd2);
return rev;
}
int main(int argc, char *argv[])
{
if(argc < 3) {
printf("Usage: %s file1 file2\n", argv[0]);
return -1;
}
my_cp(argv[1], argv[2]);
return 0;
}
- 对比测试
构造一个文件,除了开头一个空洞,其余数据为0x00,0x01的100000次重复
用优化前的程序拷贝该文件10000次,大约2000s
用优化后的程序拷贝该文件10000次,大约30s