/dev/mem同步写不能使用msync的MS_SYNC选项探究
问题
做了个测试板子的程序,里面有一项写铁电的功能,要求写入之后立即断电,重启后校验数据准确性;铁电设计是通过内存地址直接映射的,于是,使用mmap直接映射了/dev/mem文件,自然地写入之后使用msync进行同步,最后使用munmap解映射;
然而,当我运行这段程序的时候,发现msync的MS_SYNC选项进行同步的时候会返回错误,错误码是EINVAL;这就奇怪了;
查原因
1. 查看MAN手册,如下:只有当地址不是页大小(PAGESIZE)的整数倍,或者flags参数不合法时,才会返回EINVAL;
EINVAL addr is not a multiple of PAGESIZE; or any bit other than MS_ASYNC | MS_INVALIDATE | MS_SYNC is set in flags; or both MS_SYNC and MS_ASYNC are set in flags.
反复验证,发现地址没问题,而且将MS_SYNC换成MS_ASYNC就没问题了,所以怀疑是内核不支持这个同步选项;为了求证,查看内核代码:
2. sys_msync这个系统调用,在校验参数时,如果不合法会返回-EINVAL,这点如上述MAN手册所描述;
1 asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 2 { 3 unsigned long end; 4 struct mm_struct *mm = current->mm; 5 struct vm_area_struct *vma; 6 int unmapped_error = 0; 7 int error = -EINVAL; 8 9 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) 10 goto out; 11 if (start & ~PAGE_MASK) 12 goto out; 13 if ((flags & MS_ASYNC) && (flags & MS_SYNC)) 14 goto out; 15 .... 16 }
3. 继续往下看代码,有这么一段:如果带有MS_SYNC标记,并且该映射有对应的文件、又是共享映射(VM_SHARED),就会执行do_fsync();do_fsync()出错时,其错误码会作为msync的结果返回;
1 asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 2 { 3 ... 4 if ((flags & MS_SYNC) && file && 5 (vma->vm_flags & VM_SHARED)) { 6 get_file(file); 7 up_read(&mm->mmap_sem); 8 error = do_fsync(file, 0); 9 fput(file); 10 if (error || start >= end) 11 goto out; 12 down_read(&mm->mmap_sem); 13 vma = find_vma(mm, start); 14 } else { 15 if (start >= end) { 16 error = 0; 17 goto out_unlock; 18 } 19 vma = vma->vm_next; 20 } 21 } 22 out_unlock: 23 up_read(&mm->mmap_sem); 24 out: 25 return error ? : unmapped_error; 26 }
4. 在do_fsync函数中,会先检查file->f_op以及其中的fsync函数指针,只要有一个为空就返回-EINVAL;因此基本上可以确定,正是因为该文件没有实现file_operations里面的fsync函数,msync才返回了EINVAL;
1 long do_fsync(struct file *file, int datasync) 2 { 3 int ret; 4 int err; 5 struct address_space *mapping = file->f_mapping; 6 7 if (!file->f_op || !file->f_op->fsync) { 8 /* Why? We can still call filemap_fdatawrite */ 9 ret = -EINVAL; 10 goto out; 11 } 12 13 ret = filemap_fdatawrite(mapping); 14 15 /* 16 * We need to protect against concurrent writers, which could cause 17 * livelocks in fsync_buffers_list(). 18 */ 19 mutex_lock(&mapping->host->i_mutex); 20 err = file->f_op->fsync(file, file->f_path.dentry, datasync); 21 if (!ret) 22 ret = err; 23 mutex_unlock(&mapping->host->i_mutex); 24 err = filemap_fdatawait(mapping); 25 if (!ret) 26 ret = err; 27 out: 28 return ret; 29 }
5. 我们来看看内存设备是在什么时候初始化的,如下代码:chr_dev_init函数先通过register_chrdev以主设备号MEM_MAJOR注册memory_fops,再遍历devlist逐个调用device_create创建设备,其中包括/dev/mem;
1 static int __init chr_dev_init(void) 2 { 3 int i; 4 int err; 5 6 err = bdi_init(&zero_bdi); 7 if (err) 8 return err; 9 10 if (register_chrdev(MEM_MAJOR,"mem",&memory_fops)) 11 printk("unable to get major %d for memory devs\n", MEM_MAJOR); 12 13 mem_class = class_create(THIS_MODULE, "mem"); 14 for (i = 0; i < ARRAY_SIZE(devlist); i++) 15 device_create(mem_class, NULL, 16 MKDEV(MEM_MAJOR, devlist[i].minor), 17 devlist[i].name); 18 19 return 0; 20 }
6. 这个/dev/mem对应着一个操作函数,如下代码中的mem_fops:
1 static const struct { 2 unsigned int minor; 3 char *name; 4 umode_t mode; 5 const struct file_operations *fops; 6 } devlist[] = { /* list of minor devices */ 7 {1, "mem", S_IRUSR | S_IWUSR | S_IRGRP, &mem_fops}, 8 {2, "kmem", S_IRUSR | S_IWUSR | S_IRGRP, &kmem_fops}, 9 {3, "null", S_IRUGO | S_IWUGO, &null_fops}, 10 #ifdef CONFIG_DEVPORT 11 {4, "port", S_IRUSR | S_IWUSR | S_IRGRP, &port_fops}, 12 #endif 13 {5, "zero", S_IRUGO | S_IWUGO, &zero_fops}, 14 {7, "full", S_IRUGO | S_IWUGO, &full_fops}, 15 {8, "random", S_IRUGO | S_IWUSR, &random_fops}, 16 {9, "urandom", S_IRUGO | S_IWUSR, &urandom_fops}, 17 {11,"kmsg", S_IRUGO | S_IWUSR, &kmsg_fops}, 18 #ifdef CONFIG_CRASH_DUMP 19 {12,"oldmem", S_IRUSR | S_IWUSR | S_IRGRP, &oldmem_fops}, 20 #endif 21 };
7. 看看这个mem_fops的实现,如下,可见其并没有实现fsync函数;
1 static const struct file_operations mem_fops = { 2 .llseek = memory_lseek, 3 .read = read_mem, 4 .write = write_mem, 5 .mmap = mmap_mem, 6 .open = open_mem, 7 .get_unmapped_area = get_unmapped_area_mem, 8 };
到这,问题总算水落石出了;
8. 再来看看mmap函数的实现,里面调用了这个函数phys_mem_access_prot;
1 static int mmap_mem(struct file * file, struct vm_area_struct * vma) 2 { 3 size_t size = vma->vm_end - vma->vm_start; 4 5 if (!valid_mmap_phys_addr_range(vma->vm_pgoff, size)) 6 return -EINVAL; 7 8 if (!private_mapping_ok(vma)) 9 return -ENOSYS; 10 11 vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff, 12 size, 13 vma->vm_page_prot); 14 15 /* Remap-pfn-range will mark the range VM_IO and VM_RESERVED */ 16 if (remap_pfn_range(vma, 17 vma->vm_start, 18 vma->vm_pgoff, 19 size, 20 vma->vm_page_prot)) 21 return -EAGAIN; 22 return 0; 23 }
9. 上面提到的phys_mem_access_prot函数如下,它通过uncached_access判断是否需要以非缓存(noncached)方式访问,需要的话就用pgprot_noncached修改页保护属性;
1 #ifndef __HAVE_PHYS_MEM_ACCESS_PROT 2 static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 3 unsigned long size, pgprot_t vma_prot) 4 { 5 #ifdef pgprot_noncached 6 unsigned long offset = pfn << PAGE_SHIFT; 7 8 if (uncached_access(file, offset)) 9 return pgprot_noncached(vma_prot); 10 #endif 11 return vma_prot; 12 } 13 #endif
10. 进入uncached_access非缓存访问函数,可见在多数架构分支中,只要文件打开时带有O_SYNC标记就返回1,即采用非缓存方式映射(ia64分支忽略O_SYNC,MIPS分支则交给__uncached_access判断);
1 static inline int uncached_access(struct file *file, unsigned long addr) 2 { 3 #if defined(__i386__) && !defined(__arch_um__) 4 /* 5 * On the PPro and successors, the MTRRs are used to set 6 * memory types for physical addresses outside main memory, 7 * so blindly setting PCD or PWT on those pages is wrong. 8 * For Pentiums and earlier, the surround logic should disable 9 * caching for the high addresses through the KEN pin, but 10 * we maintain the tradition of paranoia in this code. 11 */ 12 if (file->f_flags & O_SYNC) 13 return 1; 14 return !( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || 15 test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || 16 test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || 17 test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability) ) 18 && addr >= __pa(high_memory); 19 #elif defined(__x86_64__) && !defined(__arch_um__) 20 /* 21 * This is broken because it can generate memory type aliases, 22 * which can cause cache corruptions 23 * But it is only available for root and we have to be bug-to-bug 24 * compatible with i386. 25 */ 26 if (file->f_flags & O_SYNC) 27 return 1; 28 /* same behaviour as i386. PAT always set to cached and MTRRs control the 29 caching behaviour. 30 Hopefully a full PAT implementation will fix that soon. */ 31 return 0; 32 #elif defined(CONFIG_IA64) 33 /* 34 * On ia64, we ignore O_SYNC because we cannot tolerate memory attribute aliases. 35 */ 36 return !(efi_mem_attributes(addr) & EFI_MEMORY_WB); 37 #elif defined(CONFIG_MIPS) 38 { 39 extern int __uncached_access(struct file *file, 40 unsigned long addr); 41 42 return __uncached_access(file, addr); 43 } 44 #else 45 /* 46 * Accessing memory above the top the kernel knows about or through a file pointer 47 * that was marked O_SYNC will be done non-cached. 48 */ 49 if (file->f_flags & O_SYNC) 50 return 1; 51 return addr >= __pa(high_memory); 52 #endif 53 }
好了,分析完毕;
解决办法
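在打开/dev/mem时,使用如下方式,即open增加O_SYNC选项,这个选项即上面uncached_access函数使用的判断标记。对普通文件而言,O_SYNC表示每次写操作都要等到数据和文件属性都同步到物理存储才返回;而对/dev/mem而言,它使mmap得到的映射成为非缓存映射,写入的数据不会停留在CPU缓存中,因此不再需要调用msync(MS_SYNC)进行同步(由于mem_fops没有实现fsync,即使加了O_SYNC,msync(MS_SYNC)仍然会返回EINVAL);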
1 int fd = open("/dev/mem", O_RDWR|O_SYNC);
参考文章:
https://blog.csdn.net/wlp600/article/details/6893636
http://www.armadeus.org/wiki/index.php?title=FPGA_registers_access_from_Linux_userspace
https://blog.csdn.net/tiantao2012/article/details/52168383?locationNum=2&fps=1