猫猫哥

  博客园 :: 首页 :: 博问 :: 闪存 :: 新随笔 :: 联系 :: 订阅 订阅 :: 管理 ::

1 题面

编写类似cp(1)的程序,它复制包含空洞的文件,但是不将字节0写到输出文件中去。

2 基本思路

  • 首先要搞清楚空洞的性质以判断一个文件是否有空洞,以及空洞的位置
  • 知道了空洞的位置之后,读到源文件中的空洞部分时,在目标文件中lseek相应的长度

3 创建空洞文件,同时探索空洞性质

交替调用 lseek 和 write,并逐渐增大间隔长度;然后比较文件的大小和实际占用的 block 数目。

  • 测试源码
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>

/* Lengths in bytes tried by the test program, used both as the lseek() gap
 * and as the size of the data chunk written after it: every power of two
 * from 1 B up to 32 KiB. */
int holesize[] = {
    1,    2,    4,     8,
    16,   32,   64,    128,
    256,  512,  1024,  2048,
    4096, 8192, 16384, 32 * 1024
};

/* Logical size in bytes that each generated test file is grown to (64 KiB). */
int filesize = 64 * 1024;

/*
 * Create one test file per entry of holesize[], named "holesize<N>".
 *
 * Each file is built by alternating an lseek() over N bytes (a candidate
 * hole) with a write() of N bytes of 0x01 until `filesize` bytes have been
 * covered. Comparing each file's size with its allocated block count
 * (e.g. with `ls -ls`) then shows how long a gap must be to become a hole.
 *
 * Returns 0 on success and -1 as soon as any file operation fails.
 */
int main(void)
{
    char filename[32];
    unsigned char buf[32*1024];
    size_t i;

    memset(buf, 1, sizeof(buf));
    for (i = 0; i < sizeof(holesize) / sizeof(holesize[0]); ++i) {
        int count = 0;
        int fd;

        snprintf(filename, sizeof(filename), "%s%d", "holesize", holesize[i]);
        fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
        if (fd < 0) {
            perror("open file fail");
            return -1;
        }
        while (count < filesize) {
            const unsigned char *p = buf;
            size_t remain = (size_t)holesize[i];

            /* Skip ahead without writing: this is the candidate hole. */
            if (lseek(fd, holesize[i], SEEK_CUR) == (off_t)-1) {
                perror("lseek fail");
                close(fd);
                return -1;
            }
            /* write() may be short, so loop until the whole chunk is out. */
            while (remain > 0) {
                ssize_t ret = write(fd, p, remain);
                if (ret <= 0) {
                    /* ret == 0 would otherwise make this loop spin forever. */
                    if (ret < 0)
                        perror("write fail");
                    else
                        fputs("write fail: no progress\n", stderr);
                    close(fd);
                    return -1;
                }
                p += ret;
                remain -= (size_t)ret;
            }
            count += holesize[i] * 2;
        }
        if (close(fd) != 0) {
            perror("close fail");
            return -1;
        }
    }
    return 0;
}
  • macOS 10.14.6 测试结果
^_^$ ll -s
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize1
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize1024
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize128
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize16
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize16384
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize2
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize2048
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize256
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize32
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize32768
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize4
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize4096
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize512
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize64
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize8
128 -rw-r--r--   1 chenzf  staff  65536 12 28 20:08 holesize8192

Mac OSX上创建不了空洞文件,因为默认的文件系统是HFS+,不支持稀疏文件。

  • Ubuntu18 4.15.0-60-generic测试结果
^_^$ ll -s
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize1
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize1024
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize128
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize16
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize16384
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize2
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize2048
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize256
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize32
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize32768
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize4
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize4096
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize512
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize64
64 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize8
32 -rw-r--r-- 1 chen chen 65536 12月 25 00:08 holesize8192

间隔长度达到4KB(含)以上才实际创建空洞。
因为在Linux的文件系统中,磁盘空间分配的最小单元为簇(块):即使文件数据不足以占满一簇,该簇剩余的磁盘空间仍旧归该文件所有,所以小于一簇的间隔无法形成真正的空洞。

所以可以根据这个性质,判断文件是否是空洞文件。有空洞的文件,用文件大小计算的block数至少比实际占用的block数大1个簇的block数

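如何可移植地获取簇的大小

pagesize = sysconf(_SC_PAGESIZE);

注意:sysconf(_SC_PAGESIZE) 返回的是内存页的大小,并不保证等于文件系统的簇(块)大小;在本文的测试环境中两者都是4096字节(见后面程序输出的 pagesize 与 st.st_blksize),所以这里用它来近似。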

初步实现功能

  • 源码
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/errno.h>

/*
 * Write exactly `len` bytes from `data` to `fd`, retrying after short writes
 * and after EINTR. Returns 0 on success, -1 on error with errno set.
 */
static int cp_write_all(int fd, const unsigned char *data, size_t len)
{
    while (len > 0) {
        ssize_t n = write(fd, data, len);
        if (n < 0) {
            if (errno == EINTR)
                continue;
            return -1;
        }
        if (n == 0) {
            /* No progress at all: fail instead of looping forever. */
            errno = EIO;
            return -1;
        }
        data += n;
        len -= (size_t)n;
    }
    return 0;
}

/*
 * Copy the file `from` to `to`, turning runs of zero bytes into holes.
 *
 * The source is treated as sparse when it is a regular file whose size,
 * rounded up to a whole number of pages, exceeds the space reported by
 * st_blocks. For a sparse source copied to a regular file, every run of
 * zero bytes inside a read buffer is skipped in the destination with
 * lseek() instead of being written; everything else is written verbatim.
 * In all other cases the data is copied byte for byte.
 *
 * Diagnostic information about the source is printed to stdout.
 *
 * Returns 0 on success, -1 on failure (a message is printed with perror()).
 */
int my_cp(const char *from, const char *to)
{
    int fd1 = -1, fd2 = -1;
    int rev = -1;
    unsigned char *buffer = NULL;
    long pagesize = 0;
    long long blocks, blksize, size;
    ssize_t read_num;
    int have_holes = 0;     /* source looks sparse */
    int make_holes = 0;     /* source looks sparse AND destination can hold holes */
    int pending_hole = 0;   /* last operation on fd2 was an lseek() over zeros */
    struct stat st, st_to;

    fd1 = open(from, O_RDONLY);
    if (-1 == fd1) {
        perror("open file1 faild");
        goto err;
    }

    if (fstat(fd1, &st) != 0) {
        perror("fstat: ");
        goto err;
    }

#ifdef _SC_PAGESIZE
    /* sysconf() reports "no limit" as -1 with errno untouched, so errno has
     * to be cleared first to tell that case apart from a real error. */
    errno = 0;
    pagesize = sysconf(_SC_PAGESIZE);
    if (pagesize < 0) {
        if (errno == 0) {
            fputs(" (no limit)\n", stdout);
        } else if (errno == EINVAL) {
            fputs(" (not supported)\n", stdout);
        } else {
            perror("sysconf error");
            goto err;
        }
        pagesize = st.st_blksize;
    }
    printf("pagesize: %ld\n", pagesize);
#else
    pagesize = st.st_blksize;
#endif
    /* Guard the divisions and the allocation below against a bogus value. */
    if (pagesize <= 0)
        pagesize = 4096;

    blocks = st.st_blocks;
    blksize = st.st_blksize;
    size = st.st_size;
    printf("st.st_blocks: %lld\n", blocks);
    printf("st.st_blksize: %lld\n", blksize);
    printf("st.st_size: %lld\n", size);
    /* Assumes st_blocks counts 512-byte units; this may not hold on every platform. */
    if (S_ISREG(st.st_mode) && (size / pagesize + (size % pagesize ? 1 : 0)) * pagesize > 512 * blocks) {
        have_holes = 1;
        printf("%s is a sparse-block file!\n", from);
    } else {
        have_holes = 0;
        printf("%s is not a sparse-block file!\n", from);
    }

    fd2 = open(to, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
    if (-1 == fd2) {
        perror("open file2 faild");
        goto err;
    }
    if (fstat(fd2, &st_to) != 0) {
        perror("fstat: ");
        goto err;
    }
    /* Seeking over data only makes sense when the destination is a regular file. */
    make_holes = have_holes && S_ISREG(st_to.st_mode);

    buffer = malloc((size_t)pagesize);
    if (buffer == NULL) {
        perror("malloc fail");
        goto err;
    }

    for (;;) {
        size_t len, pos;

        read_num = read(fd1, buffer, (size_t)pagesize);
        if (read_num < 0) {
            if (errno == EINTR)
                continue;
            perror("read file1 error");
            goto err;
        }
        if (read_num == 0)
            break;
        len = (size_t)read_num;

        if (!make_holes) {
            /* Plain copy. */
            if (cp_write_all(fd2, buffer, len) != 0) {
                perror("write file2 error");
                goto err;
            }
            continue;
        }

        /* Byte-by-byte scan (slow): split the buffer into maximal runs of
         * zero / non-zero bytes, seeking over the former and writing the
         * latter. */
        pos = 0;
        while (pos < len) {
            const size_t run_start = pos;
            const int zero_run = (buffer[pos] == 0);

            do {
                ++pos;
            } while (pos < len && (buffer[pos] == 0) == zero_run);

            if (zero_run) {
                if (lseek(fd2, (off_t)(pos - run_start), SEEK_CUR) == (off_t)-1) {
                    perror("lseek file2 fail");
                    goto err;
                }
                pending_hole = 1;
            } else {
                if (cp_write_all(fd2, buffer + run_start, pos - run_start) != 0) {
                    perror("write file2 error");
                    goto err;
                }
                pending_hole = 0;
            }
        }
    }

    /* An lseek() past the end does not grow the file by itself. If the source
     * ended in zeros, extend the destination to the current offset so that
     * both files have the same size. */
    if (pending_hole) {
        off_t end = lseek(fd2, 0, SEEK_CUR);
        if (end == (off_t)-1 || ftruncate(fd2, end) != 0) {
            perror("ftruncate file2 fail");
            goto err;
        }
    }
    rev = 0;
err:
    free(buffer);
    if (fd1 != -1)
        close(fd1);
    /* A failing close() on the written file can mean lost data. */
    if (fd2 != -1 && close(fd2) != 0 && rev == 0) {
        perror("close file2 error");
        rev = -1;
    }
    return rev;
}

/*
 * Entry point: copy argv[1] to argv[2] with my_cp().
 *
 * Returns 0 when the copy succeeds and -1 on a usage error or when the copy
 * fails, so that scripts can detect a failed copy from the exit status.
 */
int main(int argc, char *argv[])
{
    if (argc < 3) {
        fprintf(stderr, "Usage: %s file1 file2\n", argc > 0 ? argv[0] : "my_cp");
        return -1;
    }
    /* Propagate the copy result instead of always reporting success. */
    return my_cp(argv[1], argv[2]) == 0 ? 0 : -1;
}
  • 测试结果
^_^$ ./my_cp holesize2048 holesize2048.cp
pagesize: 4096
st.st_blocks: 128
st.st_blksize: 4096
st.st_size: 65536
holesize2048 is not a sparse-block file!
chen@ubuntu18:~/study/apue.3e/exercises/4
^_^$ ./my_cp holesize4096 holesize4096.cp
pagesize: 4096
st.st_blocks: 72
st.st_blksize: 4096
st.st_size: 65536
holesize4096 is a sparse-block file!

^_^$ ll -s
total 1708
64 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize2048
64 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize2048.cp
36 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize4096
32 -rw-r--r-- 1 chen chen  65536 1月   6 17:27 holesize4096.cp

空洞文件可以正常拷贝

尝试优化程序

上面的程序仅在判断文件是否含有空洞时利用了空洞的最小长度限制,而在实际读写时并没有利用该性质。

这样较短的0字节也会当成是空洞,导致系统调用次数的增加,性能的降低

要优化性能,必须进一步探究空洞的性质。在什么样的情况下才创建空洞(不实际占用磁盘空间的块)?

  • 测试程序源码

此程序创建了3个文件:

- 文件1先`write`了1K的非零数据,然后`lseek` 7K-1字节。循环2次。
- 文件2先`write`了1K的非零数据,然后`lseek` 7K字节。循环2次
- 文件3先`write`了1K的非零数据,然后`lseek` 7K+1字节。循环2次
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>

/* Base length in bytes from which the test derives its data chunk (1/4 of it)
 * and its lseek() gaps (7/4 of it, minus/plus one byte). */
int holesize[] = { 4 * 1024 };

/* 64 KiB; not referenced by this program's main(). */
int filesize = 64 * 1024;

/*
 * Repeat `rounds` times: write `data_len` bytes taken from `buf` to `fd`,
 * then lseek() forward by `hole_len` bytes without writing.
 *
 * `buf` holds `buf_len` identical filler bytes, so data longer than the
 * buffer is written in several pieces that all start at `buf`.
 * Returns 0 on success, -1 on failure (a message is printed).
 */
static int write_then_skip(int fd, const unsigned char *buf, size_t buf_len,
                           int data_len, int hole_len, int rounds)
{
    int round;

    for (round = 0; round < rounds; ++round) {
        size_t remain = data_len > 0 ? (size_t)data_len : 0;

        while (remain > 0) {
            size_t chunk = remain < buf_len ? remain : buf_len;
            ssize_t ret = write(fd, buf, chunk);
            if (ret <= 0) {
                /* ret == 0 would otherwise make this loop spin forever. */
                if (ret < 0)
                    perror("write fail");
                else
                    fputs("write fail: no progress\n", stderr);
                return -1;
            }
            remain -= (size_t)ret;
        }
        if (lseek(fd, hole_len, SEEK_CUR) == (off_t)-1) {
            perror("lseek fail");
            return -1;
        }
    }
    return 0;
}

/*
 * For every base length N in holesize[], create three files named
 * "holesize<N>-1", "-2" and "-3". Each file gets two rounds of
 * "write N/4 bytes of 0x01, then lseek forward", where the lseek distance is
 * N*7/4 - 1, N*7/4 and N*7/4 + 1 bytes respectively. This probes whether a
 * gap must start on a block boundary and span a whole block to become a hole.
 *
 * Returns 0 on success and -1 as soon as any file operation fails.
 */
int main(void)
{
    unsigned char buf[32*1024];
    size_t i;

    memset(buf, 1, sizeof(buf));
    for (i = 0; i < sizeof(holesize) / sizeof(holesize[0]); ++i) {
        const int data_len = holesize[i] * 1 / 4;
        const int hole_base = holesize[i] * 7 / 4;
        int variant;

        for (variant = 1; variant <= 3; ++variant) {
            char filename[32];
            int fd, rc;

            snprintf(filename, sizeof(filename), "%s%d-%d", "holesize", holesize[i], variant);
            fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
            if (fd < 0) {
                perror("open file fail");
                return -1;
            }
            /* variant 1, 2, 3 -> gap of hole_base - 1, hole_base, hole_base + 1 */
            rc = write_then_skip(fd, buf, sizeof(buf), data_len, hole_base + variant - 2, 2);
            if (close(fd) != 0 && rc == 0) {
                perror("close fail");
                rc = -1;
            }
            if (rc != 0)
                return -1;
        }
    }
    return 0;
}
  • 测试结果
^_^$ ll -s
12 -rw-r--r-- 1 chen chen  9215 1月   6 15:07 holesize4096-1
 8 -rw-r--r-- 1 chen chen  9216 1月   6 15:07 holesize4096-2
 8 -rw-r--r-- 1 chen chen  9217 1月   6 15:07 holesize4096-3

可见空洞必须从一页的起始位置开始计算,并且等于或超过pagesize,才不占用实际磁盘空间

优化后程序

  • 源码
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/errno.h>

/*
 * Read up to `nbyte` bytes from `fd` into `buf`, looping over short reads.
 *
 * Keeps calling read() until the buffer is full or end-of-file is reached;
 * a read interrupted by a signal (EINTR) is retried rather than reported.
 *
 * Returns the number of bytes stored in `buf`, which is less than `nbyte`
 * only when end-of-file was reached, or -1 on error with errno set by read().
 */
ssize_t read_ex(int fd, void *buf, size_t nbyte){
    unsigned char *cursor = buf;
    size_t left = nbyte;

    while (left > 0) {
        ssize_t got = read(fd, cursor, left);
        if (got < 0) {
            if (errno == EINTR)
                continue;
            return -1;
        }
        if (got == 0)       /* end of file */
            break;
        cursor += got;
        left -= (size_t)got;
    }
    return (ssize_t)(nbyte - left);
}

/*
 * Write exactly `nbyte` bytes from `buf` to `fd`, looping over short writes.
 *
 * A write interrupted by a signal (EINTR) is retried. A write() that makes
 * no progress at all is reported as an error (errno = EIO) instead of being
 * retried forever.
 *
 * Returns `nbyte` on success, or -1 on error with errno set.
 */
ssize_t write_ex(int fd, const void *buf, size_t nbyte){
    const unsigned char *cursor = buf;
    size_t left = nbyte;

    while (left > 0) {
        ssize_t put = write(fd, cursor, left);
        if (put < 0) {
            if (errno == EINTR)
                continue;
            return -1;
        }
        if (put == 0) {
            errno = EIO;
            return -1;
        }
        cursor += put;
        left -= (size_t)put;
    }
    return (ssize_t)nbyte;
}
/*
 * Copy the file `from` to `to`, turning all-zero blocks into holes.
 *
 * The source is treated as sparse when it is a regular file whose size,
 * rounded up to a whole number of pages, exceeds the space reported by
 * st_blocks. For a sparse source copied to a regular file, the data is
 * processed one page-sized block at a time: a block consisting only of zero
 * bytes is skipped in the destination with lseek(), any other block is
 * written unchanged. In all other cases every block is written.
 *
 * Diagnostic information about the source is printed to stdout.
 *
 * Returns 0 on success, -1 on failure (a message is printed with perror()).
 */
int my_cp(const char *from, const char *to)
{
    int fd1 = -1, fd2 = -1;
    int rev = -1;
    unsigned char *buffer = NULL, *buffer_zero = NULL;
    long pagesize = 0;
    long long blocks, blksize, size;
    ssize_t read_num;
    int have_holes = 0;     /* source looks sparse */
    int make_holes = 0;     /* source looks sparse AND destination can hold holes */
    int pending_hole = 0;   /* last operation on fd2 was an lseek() over zeros */
    struct stat st, st_to;

    fd1 = open(from, O_RDONLY);
    if (-1 == fd1) {
        perror("open file1 faild");
        goto err;
    }

    if (fstat(fd1, &st) != 0) {
        perror("fstat: ");
        goto err;
    }

#ifdef _SC_PAGESIZE
    /* sysconf() reports "no limit" as -1 with errno untouched, so errno has
     * to be cleared first to tell that case apart from a real error. */
    errno = 0;
    pagesize = sysconf(_SC_PAGESIZE);
    if (pagesize < 0) {
        if (errno == 0) {
            fputs(" (no limit)\n", stdout);
        } else if (errno == EINVAL) {
            fputs(" (not supported)\n", stdout);
        } else {
            perror("sysconf error");
            goto err;
        }
        pagesize = st.st_blksize;
    }
    printf("pagesize: %ld\n", pagesize);
#else
    pagesize = st.st_blksize;
#endif
    /* Guard the divisions and the allocations below against a bogus value. */
    if (pagesize <= 0)
        pagesize = 4096;

    blocks = st.st_blocks;
    blksize = st.st_blksize;
    size = st.st_size;
    printf("st.st_blocks: %lld\n", blocks);
    printf("st.st_blksize: %lld\n", blksize);
    printf("st.st_size: %lld\n", size);
    /* Assumes st_blocks counts 512-byte units; this may not hold on every platform. */
    if (S_ISREG(st.st_mode) && (size / pagesize + (size % pagesize ? 1 : 0)) * pagesize > 512 * blocks) {
        have_holes = 1;
        printf("%s is a sparse-block file!\n", from);
    } else {
        have_holes = 0;
        printf("%s is not a sparse-block file!\n", from);
    }

    buffer = malloc((size_t)pagesize);
    /* All-zero reference block that each block read is compared against. */
    buffer_zero = calloc(1, (size_t)pagesize);
    if (buffer == NULL || buffer_zero == NULL) {
        perror("malloc fail");
        goto err;
    }

    fd2 = open(to, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
    if (-1 == fd2) {
        perror("open file2 faild");
        goto err;
    }
    if (fstat(fd2, &st_to) != 0) {
        perror("fstat: ");
        goto err;
    }
    /* Seeking over data only makes sense when the destination is a regular file. */
    make_holes = have_holes && S_ISREG(st_to.st_mode);

    while ((read_num = read_ex(fd1, buffer, (size_t)pagesize)) > 0) {
        if (make_holes && memcmp(buffer_zero, buffer, (size_t)read_num) == 0) {
            /* Whole block is zero: leave a hole instead of writing it. */
            if (lseek(fd2, (off_t)read_num, SEEK_CUR) == (off_t)-1) {
                perror("lseek file2 fail");
                goto err;
            }
            pending_hole = 1;
        } else {
            /* Block contains data: copy it as is. */
            if (write_ex(fd2, buffer, (size_t)read_num) == -1) {
                perror("write file2 error");
                goto err;
            }
            pending_hole = 0;
        }
    }
    if (-1 == read_num) {
        perror("read file1 error");
        goto err;
    }

    /* An lseek() past the end does not grow the file by itself. If the source
     * ended in zero blocks, extend the destination to the current offset so
     * that both files have the same size. */
    if (pending_hole) {
        off_t end = lseek(fd2, 0, SEEK_CUR);
        if (end == (off_t)-1 || ftruncate(fd2, end) != 0) {
            perror("ftruncate file2 fail");
            goto err;
        }
    }
    rev = 0;
err:
    free(buffer);
    free(buffer_zero);
    if (fd1 != -1)
        close(fd1);
    /* A failing close() on the written file can mean lost data. */
    if (fd2 != -1 && close(fd2) != 0 && rev == 0) {
        perror("close file2 error");
        rev = -1;
    }
    return rev;
}

/*
 * Entry point: copy argv[1] to argv[2] with my_cp().
 *
 * Returns 0 when the copy succeeds and -1 on a usage error or when the copy
 * fails, so that scripts can detect a failed copy from the exit status.
 */
int main(int argc, char *argv[])
{
    if (argc < 3) {
        fprintf(stderr, "Usage: %s file1 file2\n", argc > 0 ? argv[0] : "my_cp");
        return -1;
    }
    /* Propagate the copy result instead of always reporting success. */
    return my_cp(argv[1], argv[2]) == 0 ? 0 : -1;
}
  • 对比测试

构造一个文件,除了开头一个空洞,其余数据为0x00,0x01的100000次重复

用优化前的程序拷贝该文件10000次,大约2000s

用优化后的程序拷贝该文件10000次,大约30s

posted on 2020-01-06 18:33  猫猫哥  阅读(429)  评论(0)  编辑  收藏  举报