A walkthrough of do_mmap

unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
            unsigned long len, unsigned long prot,
            unsigned long flags, unsigned long pgoff)
{
    struct mm_struct * mm = current->mm;
    struct inode *inode;
    vm_flags_t vm_flags;
    int error;
    unsigned long reqprot = prot;

    /*
     * Does the application expect PROT_READ to imply PROT_EXEC?
     *
     * (the exception is when the underlying filesystem is noexec
     *  mounted, in which case we dont add PROT_EXEC.)
     */
    if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
        if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
            prot |= PROT_EXEC;

    if (!len)
        return -EINVAL;

    if (!(flags & MAP_FIXED))
        addr = round_hint_to_min(addr);

1. personality

Linux has the concept of personality of an executable (since 1.1.20). The purpose is to make the Linux environment more similar to some other environment, like BSD or SCO or Solaris or older Linux, so that foreign or old binaries have better chances of working without modification.

http://blog.chinaunix.net/uid-20357359-id-1963659.html
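As a user-space illustration of that branch, here is a sketch of mine (assuming a kernel where personality(0xffffffff) queries the current persona without changing it, the idiom setarch uses): switch READ_IMPLIES_EXEC on, map PROT_READ only, and the region comes back executable.

    /* Minimal sketch: turn on READ_IMPLIES_EXEC, then map PROT_READ only.
     * The kernel upgrades the mapping to r-x, visible in /proc/self/maps. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <sys/personality.h>

    int main(void)
    {
        /* 0xffffffff queries the current persona without changing it. */
        if (personality(personality(0xffffffff) | READ_IMPLIES_EXEC) == -1) {
            perror("personality");
            return 1;
        }

        void *p = mmap(NULL, 4096, PROT_READ,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        /* Look the region up in /proc/self/maps: it shows "r-xp". */
        char cmd[64];
        snprintf(cmd, sizeof(cmd), "grep %lx /proc/self/maps", (unsigned long)p);
        system(cmd);

        munmap(p, 4096);
        return 0;
    }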

2. If the caller did not request MAP_FIXED, the kernel is free to choose the address at which the file gets mapped.

addr = round_hint_to_min(addr);

 

From the name alone, one might guess that it "randomizes the start address of the mapping" to keep malicious programs from guessing and exploiting code locations. The implementation, however, does something much simpler:

/*
 * If a hint addr is less than mmap_min_addr change hint to be as
 * low as possible but still greater than mmap_min_addr
 */
static inline unsigned long round_hint_to_min(unsigned long hint)
{
    hint &= PAGE_MASK;
    if (((void *)hint != NULL) &&
        (hint < mmap_min_addr))
        return PAGE_ALIGN(mmap_min_addr);
    return hint;
}

So there is no randomization here at all: the hint is page-aligned, and a non-NULL hint below mmap_min_addr is rounded up to that floor. The point is to keep user mappings out of the lowest pages of the address space, a defense against exploits that map page zero to abuse NULL-pointer dereferences.
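To see the effect from user space, a small sketch of mine: pass a deliberately low hint without MAP_FIXED and compare what comes back against vm.mmap_min_addr (commonly 65536).

    /* Sketch: a non-MAP_FIXED hint below vm.mmap_min_addr is quietly
     * ignored; the returned address sits at or above the floor. */
    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        unsigned long min_addr = 0;
        FILE *f = fopen("/proc/sys/vm/mmap_min_addr", "r");
        if (f) {
            if (fscanf(f, "%lu", &min_addr) != 1)
                min_addr = 0;
            fclose(f);
        }

        /* One page above NULL: far below the usual 64 KiB floor. */
        void *p = mmap((void *)0x1000, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        printf("mmap_min_addr = %#lx, hint = 0x1000, got = %p\n", min_addr, p);
        munmap(p, 4096);
        return 0;
    }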

 

The types of mapping

if (file) {
        switch (flags & MAP_TYPE) {
        case MAP_SHARED:
            if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
                return -EACCES;

            /*
             * Make sure we don't allow writing to an append-only
             * file..
             */
            if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                return -EACCES;

            /*
             * Make sure there are no mandatory locks on the file.
             */
            if (locks_verify_locked(inode))
                return -EAGAIN;

            vm_flags |= VM_SHARED | VM_MAYSHARE;
            if (!(file->f_mode & FMODE_WRITE))
                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

            /* fall through */
        case MAP_PRIVATE:
            if (!(file->f_mode & FMODE_READ))
                return -EACCES;
            if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
                if (vm_flags & VM_EXEC)
                    return -EPERM;
                vm_flags &= ~VM_MAYEXEC;
            }

            if (!file->f_op || !file->f_op->mmap)
                return -ENODEV;
            break;

        default:
            return -EINVAL;
        }
    } else {
        switch (flags & MAP_TYPE) {
        case MAP_SHARED:
            /*
             * Ignore pgoff.
             */
            pgoff = 0;
            vm_flags |= VM_SHARED | VM_MAYSHARE;
            break;
        case MAP_PRIVATE:
            /*
             * Set pgoff according to addr for anon_vma.
             */
            pgoff = addr >> PAGE_SHIFT;
            break;
        default:
            return -EINVAL;
        }
    }
                    With a backing file       Without a backing file
Shared mapping      shared file mapping       shared anonymous mapping
Private mapping     private file mapping      private anonymous mapping
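In user space, the four cells of this table correspond directly to flag combinations passed to mmap(2). A minimal sketch of mine (the file path is only a placeholder; note that MAP_PRIVATE with PROT_WRITE is allowed even on a read-only descriptor, matching the FMODE_READ-only check in the kernel code above):

    /* Sketch: the four kinds of mappings from the table above. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/etc/hostname", O_RDONLY);   /* any readable file */
        if (fd < 0) { perror("open"); return 1; }

        /* shared file mapping: writes (if writable) go back to the file */
        void *sf = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

        /* private file mapping: copy-on-write, the file is never modified */
        void *pf = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);

        /* shared anonymous mapping: no file; shared with children after fork */
        void *sa = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                        MAP_SHARED | MAP_ANONYMOUS, -1, 0);

        /* private anonymous mapping: ordinary zero-filled memory */
        void *pa = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        printf("shared file %p, private file %p, shared anon %p, private anon %p\n",
               sf, pf, sa, pa);
        close(fd);
        return 0;
    }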

 

unsigned long mmap_region(struct file *file, unsigned long addr,
              unsigned long len, unsigned long flags,
              vm_flags_t vm_flags, unsigned long pgoff)
{
    /* ****** */
    if (file) {
        error = -EINVAL;
        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
            goto free_vma;
        if (vm_flags & VM_DENYWRITE) {
            error = deny_write_access(file);
            if (error)
                goto free_vma;
            correct_wcount = 1;
        }
        vma->vm_file = file;
        get_file(file);
        error = file->f_op->mmap(file, vma);
        if (error)
            goto unmap_and_free_vma;
        if (vm_flags & VM_EXECUTABLE)
            added_exe_file_vma(mm);

        /* Can addr have changed??
         *
         * Answer: Yes, several device drivers can do it in their
         *         f_op->mmap method. -DaveM
         */
        addr = vma->vm_start;
        pgoff = vma->vm_pgoff;
        vm_flags = vma->vm_flags;
    } else if (vm_flags & VM_SHARED) {
        error = shmem_zero_setup(vma);
        if (error)
            goto free_vma;
    }

    /* ****** */
}

Calling file->f_op->mmap(file, vma)

Each filesystem type and device driver defines its own mmap function. Let's look at the typical ext2 filesystem:

/*
 * We have mostly NULL's here: the current defaults are ok for
 * the ext2 filesystem.
 */
const struct file_operations ext2_file_operations = {
    .llseek         = generic_file_llseek,
    .read           = do_sync_read,
    .write          = do_sync_write,
    .aio_read       = generic_file_aio_read,
    .aio_write      = generic_file_aio_write,
    .unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = ext2_compat_ioctl,
#endif
    .mmap           = generic_file_mmap,
    .open           = dquot_file_open,
    .release        = ext2_release_file,
    .fsync          = ext2_fsync,
    .splice_read    = generic_file_splice_read,
    .splice_write   = generic_file_splice_write,
};
/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
    struct address_space *mapping = file->f_mapping;

    if (!mapping->a_ops->readpage)
        return -ENOEXEC;
    file_accessed(file);
    vma->vm_ops = &generic_file_vm_ops;
    vma->vm_flags |= VM_CAN_NONLINEAR;
    return 0;
}

file_accessed updates the file's access timestamp.

const struct vm_operations_struct generic_file_vm_ops = {
    .fault        = filemap_fault,
};
/**
 * filemap_fault - read in file data for page fault handling
 * @vma:    vma in which the fault was taken
 * @vmf:    struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 */
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    int error;
    struct file *file = vma->vm_file;
    struct address_space *mapping = file->f_mapping;
    struct file_ra_state *ra = &file->f_ra;
    struct inode *inode = mapping->host;
    pgoff_t offset = vmf->pgoff;
    struct page *page;
    pgoff_t size;
    int ret = 0;

    size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    if (offset >= size)
        return VM_FAULT_SIGBUS;

    /*
     * Do we have something in the page cache already?
     */
    page = find_get_page(mapping, offset);
    if (likely(page)) {
        /*
         * We found the page, so try async readahead before
         * waiting for the lock.
         */
        do_async_mmap_readahead(vma, ra, file, page, offset);
    } else {
        /* No page in the page cache at all */
        do_sync_mmap_readahead(vma, ra, file, offset);
        count_vm_event(PGMAJFAULT);
        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
        ret = VM_FAULT_MAJOR;
retry_find:
        page = find_get_page(mapping, offset);
        if (!page)
            goto no_cached_page;
    }

    if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
        page_cache_release(page);
        return ret | VM_FAULT_RETRY;
    }

    /* Did it get truncated? */
    if (unlikely(page->mapping != mapping)) {
        unlock_page(page);
        put_page(page);
        goto retry_find;
    }
    VM_BUG_ON(page->index != offset);

    /*
     * We have a locked page in the page cache, now we need to check
     * that it's up-to-date. If not, it is going to be due to an error.
     */
    if (unlikely(!PageUptodate(page)))
        goto page_not_uptodate;

    /*
     * Found the page and have a reference on it.
     * We must recheck i_size under page lock.
     */
    size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    if (unlikely(offset >= size)) {
        unlock_page(page);
        page_cache_release(page);
        return VM_FAULT_SIGBUS;
    }

    vmf->page = page;
    return ret | VM_FAULT_LOCKED;

no_cached_page:
    /*
     * We're only likely to ever get here if MADV_RANDOM is in
     * effect.
     */
    error = page_cache_read(file, offset);

    /*
     * The page we want has now been added to the page cache.
     * In the unlikely event that someone removed it in the
     * meantime, we'll just come back here and read it again.
     */
    if (error >= 0)
        goto retry_find;

    /*
     * An error return from page_cache_read can result if the
     * system is low on memory, or a problem occurs while trying
     * to schedule I/O.
     */
    if (error == -ENOMEM)
        return VM_FAULT_OOM;
    return VM_FAULT_SIGBUS;

page_not_uptodate:
    /*
     * Umm, take care of errors if the page isn't up-to-date.
     * Try to re-read it _once_. We do this synchronously,
     * because there really aren't any performance issues here
     * and we need to check for errors.
     */
    ClearPageError(page);
    error = mapping->a_ops->readpage(file, page);
    if (!error) {
        wait_on_page_locked(page);
        if (!PageUptodate(page))
            error = -EIO;
    }
    page_cache_release(page);

    if (!error || error == AOP_TRUNCATED_PAGE)
        goto retry_find;

    /* Things didn't work out. Return zero to tell the mm layer so. */
    shrink_readahead_size_eio(file, ra);
    return VM_FAULT_SIGBUS;
}
EXPORT_SYMBOL(filemap_fault);

The function filemap_fault is the essence of file mapping: when the process touches an address inside the mapped range and a page fault is taken, this function is responsible for reading the real contents in from the file.
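One way to watch this path from user space is getrusage(2)'s fault counters. A sketch of mine (the file path is a placeholder; run it on a cold cache, otherwise the fault is minor rather than major):

    /* Sketch: count major faults around the first touch of a mapped file.
     * If the page is already in the page cache, the fault is minor instead. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/resource.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        const char *path = argc > 1 ? argv[1] : "/etc/hostname";
        int fd = open(path, O_RDONLY);
        struct stat st;
        if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
            return 1;

        char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
            return 1;

        struct rusage before, after;
        getrusage(RUSAGE_SELF, &before);
        volatile char c = p[0];            /* first touch -> filemap_fault() */
        (void)c;
        getrusage(RUSAGE_SELF, &after);

        printf("major faults: %ld, minor faults: %ld\n",
               after.ru_majflt - before.ru_majflt,
               after.ru_minflt - before.ru_minflt);
        munmap(p, st.st_size);
        close(fd);
        return 0;
    }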

 

If the requested page is not in address_space->page_tree, it has to be read in from the file:

/**
 * page_cache_read - adds requested page to the page cache if not already there
 * @file:    file to read
 * @offset:    page index
 *
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static int page_cache_read(struct file *file, pgoff_t offset)
{
    struct address_space *mapping = file->f_mapping;
    struct page *page;
    int ret;

    do {
        page = page_cache_alloc_cold(mapping);
        if (!page)
            return -ENOMEM;

        ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
        if (ret == 0)
            ret = mapping->a_ops->readpage(file, page);
        else if (ret == -EEXIST)
            ret = 0; /* losing race to add is OK */

        page_cache_release(page);

    } while (ret == AOP_TRUNCATED_PAGE);

    return ret;
}

 

Because every inode has exactly one address_space member, all pages mapped from that inode are kept in its page_tree. This is what implements file sharing between processes: the processes genuinely share those pages.
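A fork-based sketch of mine makes that sharing visible: parent and child each have their own mm_struct and VMA, but both VMAs reference the same address_space, so a store by one is immediately seen by the other (the /tmp path is a placeholder).

    /* Sketch: parent and child share one page-cache page via MAP_SHARED. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/tmp/shared_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
        if (fd < 0 || ftruncate(fd, 4096) < 0)
            return 1;

        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
            return 1;

        if (fork() == 0) {                 /* child: write through the mapping */
            strcpy(p, "hello from child");
            _exit(0);
        }
        wait(NULL);
        printf("parent sees: %s\n", p);    /* same physical page */

        munmap(p, 4096);
        close(fd);
        unlink("/tmp/shared_demo");
        return 0;
    }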

 

But how is a file handled when each process maps it privately?

 

A privately mapped file uses copy-on-write. That is why MAP_PRIVATE is described as creating a private mapping detached from its data source: writes to the region do not change the contents of the source file, and each process's operations on the region stay independent of the others.

As one would expect, there is no way for two processes to hold writable mappings of the same file that both write through to it and yet remain independent of each other: with MAP_SHARED every mapper sees each writer's updates, and with MAP_PRIVATE the writes never reach the file at all. Anything in between would produce inconsistency.
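The copy-on-write behavior is easy to verify from user space. A sketch of mine (the /tmp path is a placeholder): write through a MAP_PRIVATE mapping, then read the file back and see that it is untouched.

    /* Sketch: a write through MAP_PRIVATE never reaches the file. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/tmp/cow_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
        if (fd < 0 || write(fd, "original", 8) != 8)
            return 1;

        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
            return 1;

        memcpy(p, "modified", 8);          /* faults in a private COW copy */

        char buf[9] = {0};
        pread(fd, buf, 8, 0);              /* read the file itself */
        printf("mapping: %.8s, file: %s\n", p, buf);  /* modified vs original */

        munmap(p, 4096);
        close(fd);
        unlink("/tmp/cow_demo");
        return 0;
    }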

 


For a VMA mapped as shared but with no backing file, the mapping is backed by shared memory (shmem):

/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
    struct file *file;
    loff_t size = vma->vm_end - vma->vm_start;

    file = shmem_file_setup("dev/zero", size, vma->vm_flags);
    if (IS_ERR(file))
        return PTR_ERR(file);

    if (vma->vm_file)
        fput(vma->vm_file);
    vma->vm_file = file;
    vma->vm_ops = &shmem_vm_ops;
    vma->vm_flags |= VM_CAN_NONLINEAR;
    return 0;
}

Despite the name, this is not the device file /dev/zero: shmem_file_setup creates an internal shmem (tmpfs) file, merely labeled "dev/zero", to back this region with zero-filled pages.
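From user space this corresponds to the MAP_SHARED | MAP_ANONYMOUS case. A minimal sketch of mine: the zero-filled shmem pages set up above are shared, not copied, across fork.

    /* Sketch: a shared anonymous mapping (backed by the shmem "dev/zero"
     * object above) survives fork and is shared, not copied. */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            return 1;

        if (fork() == 0) {
            strcpy(p, "written by child");
            _exit(0);
        }
        wait(NULL);
        printf("parent reads: %s\n", p);   /* sees the child's write */

        munmap(p, 4096);
        return 0;
    }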

static const struct vm_operations_struct shmem_vm_ops = {
    .fault        = shmem_fault,
#ifdef CONFIG_NUMA
    .set_policy   = shmem_set_policy,
    .get_policy   = shmem_get_policy,
#endif
};

and shmem_fault handles its page faults:

static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
    struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
    int error;
    int ret;

    if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
        return VM_FAULT_SIGBUS;

    error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
    if (error)
        return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
    if (ret & VM_FAULT_MAJOR) {
        count_vm_event(PGMAJFAULT);
        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
    }
    return ret | VM_FAULT_LOCKED;
}
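Note the i_size check at the top: like filemap_fault, shmem_fault refuses faults at or past EOF with VM_FAULT_SIGBUS. A sketch of mine that provokes the signal (the temp path is a placeholder): map more pages than the file contains and touch one beyond EOF.

    /* Sketch: touching a mapped page wholly beyond EOF raises SIGBUS,
     * matching the i_size checks in filemap_fault()/shmem_fault(). */
    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void on_sigbus(int sig)
    {
        (void)sig;
        static const char msg[] = "got SIGBUS, as expected\n";
        write(STDOUT_FILENO, msg, sizeof(msg) - 1);  /* async-signal-safe */
        _exit(0);
    }

    int main(void)
    {
        signal(SIGBUS, on_sigbus);

        int fd = open("/tmp/sigbus_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
        if (fd < 0 || ftruncate(fd, 4096) < 0)       /* file: one page */
            return 1;

        /* Map two pages; only the first is backed by the file. */
        char *p = mmap(NULL, 8192, PROT_READ, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
            return 1;
        unlink("/tmp/sigbus_demo");

        volatile char c = p[0];       /* fine: inside i_size */
        c = p[4096];                  /* page fully past EOF -> SIGBUS */
        (void)c;

        printf("no SIGBUS?\n");
        return 1;
    }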
posted @ 2014-01-20 17:51 Daniel King