devmem读写物理内存和devkmem读取内核虚拟内存【转】
转自:https://www.cnblogs.com/arnoldlu/p/10721614.html
关键词:/dev/mem、/dev/kmem、mmap、__va、__pa、remap_pfn_range等等。
在日常工作中常有直接操作寄存器或者某一物理地址的需求,busybox中提供了devmem。通过它可以读写物理内存。
它的实现借助mmap和/dev/mem,通过mmap将/dev/mem物理地址映射到用户空间,devmem就可以像操作虚拟地址一样进行读写。
hexdump同样也可以实现类似devmem的功能。
如果需要在用户空间获取内核某个变量值,可以使用devkmem通过/dev/kmem进行。
下面分别介绍这三种工具。
1. devmem操作物理地址,它是如何做到的?
用户空间是无法直接操作物理地址的;但是日常工作中常需要对某一物理地址进行读写,尤其是寄存器。
devmem可以实现这个功能。那么devmem做了什么?/dev/mem在内核中又是如何实现的呢?
1.1 devmem工具使用
devmem使用介绍如下:
BusyBox v1.27.2 (2019-04-16 17:00:28 CST) multi-call binary. Usage: devmem ADDRESS [WIDTH [VALUE]] Read/write from physical address ADDRESS Address to act upon WIDTH Width (8/16/...) VALUE Data to be written
devmem的能力有限,单次读写的数据位宽最大为64位。
下面向0xfc20700这个地址写入32位数据0x12345678:
devmem 0xfc20700 32 0x12345678
然后从0xfc20700读取进行验证。
devmem 0xfc20700 32
0x12345678
1.2 devmem工具分析
从下面的代码可知,devmem解析参数,然后将地址转换成页面对齐的地址。mmap将/dev/mem的输入地址偏移的页面映射到用户空间,然后读取数值。
int devmem_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int devmem_main(int argc UNUSED_PARAM, char **argv) { void *map_base, *virt_addr; uint64_t read_result; uint64_t writeval = writeval; /* for compiler */ off_t target; unsigned page_size, mapped_size, offset_in_page; int fd; unsigned width = 8 * sizeof(int); /* ADDRESS */ if (!argv[1]) bb_show_usage(); errno = 0; target = bb_strtoull(argv[1], NULL, 0); /* allows hex, oct etc */---------------第一个参数是地址 /* WIDTH */ if (argv[2]) {------------------------------------------------------------------第二个参数,在写的情况下,需要知道写数据的位宽。 if (isdigit(argv[2][0]) || argv[2][1]) width = xatou(argv[2]); else { static const char bhwl[] ALIGN1 = "bhwl"; static const uint8_t sizes[] ALIGN1 = { 8 * sizeof(char), 8 * sizeof(short), 8 * sizeof(int), 8 * sizeof(long), 0 /* bad */ }; width = strchrnul(bhwl, (argv[2][0] | 0x20)) - bhwl; width = sizes[width]; } /* VALUE */ if (argv[3])-----------------------------------------------------------------第三个参数,待写入数值。 writeval = bb_strtoull(argv[3], NULL, 0); } else { /* argv[2] == NULL */ /* make argv[3] to be a valid thing to fetch */ argv--; } if (errno) bb_show_usage(); /* one of bb_strtouXX failed */ fd = xopen("/dev/mem", argv[3] ? (O_RDWR | O_SYNC) : (O_RDONLY | O_SYNC));-------根据第三个参数确定是以只读形式打开,还是以读写形式打开。/dev/mem代表整个内核空间。 mapped_size = page_size = getpagesize(); offset_in_page = (unsigned)target & (page_size - 1);-----------------------------对地址进行也对齐。 if (offset_in_page + width > page_size) {----------------------------------------如果跨页,则mapped_size变成两个页面。 /* This access spans pages. * Must map two pages to make it possible: */ mapped_size *= 2; } map_base = mmap(NULL, mapped_size, argv[3] ? 
(PROT_READ | PROT_WRITE) : PROT_READ, MAP_SHARED, fd, target & ~(off_t)(page_size - 1));---------------------------------------将/dev/mem文件的从target的页对齐偏移开始,映射mapped_size块大小内存。映射结果是map_base。 if (map_base == MAP_FAILED) bb_perror_msg_and_die("mmap"); // printf("Memory mapped at address %p.\n", map_base); virt_addr = (char*)map_base + offset_in_page; if (!argv[3]) { switch (width) { case 8: read_result = *(volatile uint8_t*)virt_addr; break; case 16: read_result = *(volatile uint16_t*)virt_addr; break; case 32: read_result = *(volatile uint32_t*)virt_addr; break; case 64: read_result = *(volatile uint64_t*)virt_addr; break; default: bb_error_msg_and_die("bad width"); } // printf("Value at address 0x%"OFF_FMT"X (%p): 0x%llX\n", // target, virt_addr, // (unsigned long long)read_result); /* Zero-padded output shows the width of access just done */ printf("0x%0*llX\n", (width >> 2), (unsigned long long)read_result);------------读取数据并打印。 } else { switch (width) { case 8: *(volatile uint8_t*)virt_addr = writeval; // read_result = *(volatile uint8_t*)virt_addr; break; case 16: *(volatile uint16_t*)virt_addr = writeval; // read_result = *(volatile uint16_t*)virt_addr; break; case 32: *(volatile uint32_t*)virt_addr = writeval; // read_result = *(volatile uint32_t*)virt_addr; break; case 64: *(volatile uint64_t*)virt_addr = writeval; // read_result = *(volatile uint64_t*)virt_addr; break; default: bb_error_msg_and_die("bad width"); } // printf("Written 0x%llX; readback 0x%llX\n", // (unsigned long long)writeval, // (unsigned long long)read_result); } if (ENABLE_FEATURE_CLEAN_UP) { if (munmap(map_base, mapped_size) == -1) bb_perror_msg_and_die("munmap"); close(fd); } return EXIT_SUCCESS; }
1.3 /dev/mem是如何实现物理地址读写?
/dev/mem在chr_dev_init()中创建,需要创建的节点在devlist[]中。
static const struct memdev { const char *name; umode_t mode; const struct file_operations *fops; fmode_t fmode; } devlist[] = { #ifdef CONFIG_DEVMEM [1] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET }, #endif #ifdef CONFIG_DEVKMEM [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET }, #endif... };
其中mem_fops对应/dev/mem节点的操作函数。
static const struct file_operations __maybe_unused mem_fops = { .llseek = memory_lseek, .read = read_mem,---------------------------------将/dev/mem内存读出。 .write = write_mem,-------------------------------直接对/dev/mem进行读写。 .mmap = mmap_mem,---------------------------------对/dev/mem进行mmap映射。 .open = open_mem,---------------------------------主要检查权限是否满足CAP_SYS_RAWIO。 #ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, .mmap_capabilities = memory_mmap_capabilities, #endif }; static int open_port(struct inode *inode, struct file *filp) { return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; } #define open_mem open_port
1.3.1 mmap
mmap_mem()是mmap()内存映射的执行者,它将/dev/mem对应的物理地址映射到用户空间虚拟地址。
static int mmap_mem(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; phys_addr_t offset = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT; /* It's illegal to wrap around the end of the physical address space. */ if (offset + (phys_addr_t)size - 1 < offset)-----------------------------------检查是否存在地址回绕情况,即offset+size超出size_t范围。 return -EINVAL; if (!valid_mmap_phys_addr_range(vma->vm_pgoff, size))return -EINVAL;-----------检查vm_pgoff+size是否超出物理地址限制。 if (!private_mapping_ok(vma)) return -ENOSYS; if (!range_is_allowed(vma->vm_pgoff, size)) return -EPERM; if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size, &vma->vm_page_prot)) return -EINVAL; vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff, size, vma->vm_page_prot);---------------------------------------将vm_pgoff+size区域的内存标记为uncacheable。 vma->vm_ops = &mmap_mem_ops; /* Remap-pfn-range will mark the range VM_IO */ if (remap_pfn_range(vma,---------------------------------------------将内核中vma->vm_pgoff对应的size个页面,映射到vma区域,返回的虚拟空间起始地址是vma->vm_start。 vma->vm_start, vma->vm_pgoff, size, vma->vm_page_prot)) { return -EAGAIN; } return 0; }
通过/dev/mem映射的内存是非cache的:
static pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { #ifdef pgprot_noncached phys_addr_t offset = pfn << PAGE_SHIFT; if (uncached_access(file, offset)) { return pgprot_noncached(vma_prot);-------------------------------将/dev/mem设备映射的页面属性设置为Non-Cached。 } #endif return vma_prot; }
remap_pfn_range()将一块内核物理连续内存映射到用户空间,虚拟内存有如下属性VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP,并且页面是Non-Cached。
对所有内存建立页表项,vma的prot变成pteval。
/** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to * @addr: target user address to start at * @pfn: physical address of kernel memory * @size: size of map area * @prot: page protection flags for this mapping * * Note: this is only safe if the mm semaphore is held when called. */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; unsigned long remap_pfn = pfn; int err; /* * Physically remapped pages are special. Tell the * rest of the world about it: * VM_IO tells people not to look at these pages------------------表示此块内存映射时基于I/O的。 * (accesses can have side effects). * VM_PFNMAP tells the core MM that the base pages are just-------表示一段连续页面映射,没有struct page来管理。 * raw PFN mappings, and do not have a "struct page" associated * with them. * VM_DONTEXPAND * Disable vma merging and expanding with mremap().------------表示此段mmap内存,不能通过mremap()来调整大小、位置。 * VM_DONTDUMP * Omit vma from core dump, even when VM_IO turned off.--------在产生coredump的时候,不对此vma进行dump。 * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". * See vm_normal_page() for details. 
*/ if (is_cow_mapping(vma->vm_flags)) { if (addr != vma->vm_start || end != vma->vm_end) return -EINVAL; vma->vm_pgoff = pfn; } err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size)); if (err) return -EINVAL; vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr);---------------------------------------找到此进程的mm,根据addr找到对应的pgd。 flush_cache_range(vma, addr, end);--------------------------------刷addr到end区间的TLB。 do { next = pgd_addr_end(addr, end); err = remap_pud_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot);--------------------逐级创建页表remap_pud_range()->remap_pmd_range()->remap_pte_range(),最终prot变成页表项的pteval一部分。 if (err) break; } while (pgd++, addr = next, addr != end);------------------------遍历addr到end之间内存,对所有页面建立页表。 if (err) untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size)); return err; }
1.3.2 read和write
下面两个对应read()和write()两个系统调用。
static ssize_t read_mem(struct file *file, char __user *buf, size_t count, loff_t *ppos) { phys_addr_t p = *ppos; ssize_t read, sz; void *ptr; if (p != *ppos) return 0; if (!valid_phys_addr_range(p, count))---------------------------对输入的物理地址+大小进行验证,确保在low_memory范围内。 return -EFAULT; read = 0; ... while (count > 0) { unsigned long remaining; int allowed; sz = size_inside_page(p, count); allowed = page_is_allowed(p >> PAGE_SHIFT); if (!allowed) return -EPERM; if (allowed == 2) { /* Show zeros for restricted memory. */ remaining = clear_user(buf, sz); } else { ptr = xlate_dev_mem_ptr(p);-----------------------------将物理地址转换成虚拟地址,不成功则返回-EFAULT。注意这里的地址通过_va进行转换,只有特定区域的地址才可以使用。 if (!ptr) return -EFAULT; remaining = copy_to_user(buf, ptr, sz);-----------------将物理地址对应内容拷贝到用户空间。 unxlate_dev_mem_ptr(p, ptr); } if (remaining) return -EFAULT; buf += sz; p += sz; count -= sz; read += sz; } *ppos += read; return read; } static ssize_t write_mem(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { phys_addr_t p = *ppos; ssize_t written, sz; unsigned long copied; void *ptr; if (p != *ppos) return -EFBIG; if (!valid_phys_addr_range(p, count))---------------------------确保地址在low_memory范围内。 return -EFAULT; written = 0; #ifdef __ARCH_HAS_NO_PAGE_ZERO_MAPPED /* we don't have page 0 mapped on sparc and m68k.. */ if (p < PAGE_SIZE) { sz = size_inside_page(p, count); /* Hmm. Do something? */ buf += sz; p += sz; count -= sz; written += sz; } #endif while (count > 0) { int allowed; sz = size_inside_page(p, count); allowed = page_is_allowed(p >> PAGE_SHIFT); if (!allowed) return -EPERM; /* Skip actual writing when a page is marked as restricted. */ if (allowed == 1) { /* * On ia64 if a page has been mapped somewhere as * uncached, then it must also be accessed uncached * by the kernel or data corruption may occur. 
*/ ptr = xlate_dev_mem_ptr(p);--------------------------------__va()进行物理地址到虚拟地址的转换。 if (!ptr) { if (written) break; return -EFAULT; } copied = copy_from_user(ptr, buf, sz); unxlate_dev_mem_ptr(p, ptr); if (copied) { written += sz - copied; if (written) break; return -EFAULT; } } buf += sz; p += sz; count -= sz; written += sz; } *ppos += written; return written; }
对比mmap和read()/write()两种方式可知:
- mmap()可以读写的范围更大;read()/write()的范围局限在low_memory。
- mmap()的读写速度更快,操作更方便。
1.4 hexdump使用
devmem一次读写的内容有限,hexdump可以一次dump大量数据。
但是hexdump是通过read()/write()来获取数据,物理地址的范围受到限制。而devmem通过mmap()则没有这些限制。
hexdump -s 0x10000000 -n 256 /dev/mem 10000000 0005 1908 fc11 18ff edf7 03fe e914 020b 10000010 0d00 1fe4 f202 1601 f703 0412 e814 1cfb 10000020 09fc 000b 06eb 07f0 ec12 01e6 11e9 03f7 10000030 1a2d 11eb f700 ece9 eef3 05f7 0009 eb03 10000040 ff1a e50b 1e08 0f16 0cfa 13fb 0b06 0a1b 10000050 0401 fefd fd1e 0b05 f317 f9ea f00a 3ef5 10000060 f118 fe02 f606 0f02 f1ec f4fe 0216 eefb 10000070 0c02 eefd f8ff 06eb 08fc f603 05fb f80e 10000080 f6fb 2503 f207 0a19 12ee fb0d 0512 09f8 10000090 fbfa 1303 f9fe 0dfc f2fa 06fb fef4 04fa 100000a0 2007 170e 1a05 f3f6 0c2d 0601 0f0b 061f 100000b0 1108 0b18 f80d ebef 05f8 f3eb 0207 e8ff 100000c0 fb07 fdea 0efd fb02 0f10 f8f8 f016 f8f2 100000d0 130f 0803 0909 0100 0b03 fc06 0307 1e10 100000e0 011b 2814 f7f3 fc01 f6f9 03ec 0afb ecf1 100000f0 05fb 070a f904 fbf5 f7fa 0304 f502 0d02
2. devkmem读取内核虚拟地址空间数据
某些情况下需要读取内核某个变量的值,这时候可以通过/dev/kmem。
2.1 /dev/kmem
要使用/dev/kmem就需要在内核中打开CONFIG_DEVKMEM,menuconfig路径为:Device Drivers->Character devices->/dev/kmem virtual device support。
static const struct file_operations __maybe_unused kmem_fops = { .llseek = memory_lseek, .read = read_kmem, .write = write_kmem, .mmap = mmap_kmem, .open = open_kmem, #ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, .mmap_capabilities = memory_mmap_capabilities, #endif }; static int mmap_kmem(struct file *file, struct vm_area_struct *vma) { unsigned long pfn; /* Turn a kernel-virtual address into a physical page frame */ pfn = __pa((u64)vma->vm_pgoff << PAGE_SHIFT) >> PAGE_SHIFT;------------------将内核虚拟地址通过__pa()转换成物理地址。 if (!pfn_valid(pfn)) return -EIO; vma->vm_pgoff = pfn; return mmap_mem(file, vma); }
read_kmem()和write_kmem()需要对low_memory和high_memory进行区别对待。
对low_memory需要经过xlate_dev_kmem_ptr()后进行读写;对high_memory通过vread()/vwrite()进行读写。
static ssize_t read_kmem(struct file *file, char __user *buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; ssize_t low_count, read, sz; char *kbuf; /* k-addr because vread() takes vmlist_lock rwlock */ int err = 0; read = 0; if (p < (unsigned long) high_memory) {---------------------------------------属于low_memory内存处理。 low_count = count; if (count > (unsigned long)high_memory - p) low_count = (unsigned long)high_memory - p;--------------------------计算处于low_memory区域的内存大小。 ... while (low_count > 0) { sz = size_inside_page(p, low_count); kbuf = xlate_dev_kmem_ptr((void *)p); if (!virt_addr_valid(kbuf))------------------------------------------地址在PAGE_OFFSET和high_memory之间。 return -ENXIO; if (copy_to_user(buf, kbuf, sz)) return -EFAULT; buf += sz; p += sz; read += sz; low_count -= sz; count -= sz; } } if (count > 0) {-------------------------------------------------------------如果还有count没处理完,那么就属于high_memory。 kbuf = (char *)__get_free_page(GFP_KERNEL); if (!kbuf) return -ENOMEM; while (count > 0) { sz = size_inside_page(p, count);-------------------------------------如果p+count不跨页,那么sz=count;否则sz只取p所在页面剩余部分大小。这样确保下面的copy_to_user()不跨页。 if (!is_vmalloc_or_module_addr((void *)p)) { err = -ENXIO; break; } sz = vread(kbuf, (char *)p, sz); if (!sz) break; if (copy_to_user(buf, kbuf, sz)) { err = -EFAULT; break; } count -= sz; buf += sz; read += sz; p += sz; } free_page((unsigned long)kbuf); } *ppos = p; return read ? read : err; } static ssize_t do_write_kmem(unsigned long p, const char __user *buf, size_t count, loff_t *ppos) { ssize_t written, sz; unsigned long copied; written = 0; ... 
while (count > 0) { void *ptr; sz = size_inside_page(p, count); ptr = xlate_dev_kmem_ptr((void *)p); if (!virt_addr_valid(ptr)) return -ENXIO; copied = copy_from_user(ptr, buf, sz); if (copied) { written += sz - copied; if (written) break; return -EFAULT; } buf += sz; p += sz; count -= sz; written += sz; } *ppos += written; return written; } static ssize_t write_kmem(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { unsigned long p = *ppos; ssize_t wrote = 0; ssize_t virtr = 0; char *kbuf; /* k-addr because vwrite() takes vmlist_lock rwlock */ int err = 0; if (p < (unsigned long) high_memory) { unsigned long to_write = min_t(unsigned long, count, (unsigned long)high_memory - p); wrote = do_write_kmem(p, buf, to_write, ppos); if (wrote != to_write) return wrote; p += wrote; buf += wrote; count -= wrote; } if (count > 0) { kbuf = (char *)__get_free_page(GFP_KERNEL); if (!kbuf) return wrote ? wrote : -ENOMEM; while (count > 0) { unsigned long sz = size_inside_page(p, count); unsigned long n; if (!is_vmalloc_or_module_addr((void *)p)) { err = -ENXIO; break; } n = copy_from_user(kbuf, buf, sz); if (n) { err = -EFAULT; break; } vwrite(kbuf, (char *)p, sz); count -= sz; buf += sz; virtr += sz; p += sz; } free_page((unsigned long)kbuf); } *ppos = p; return virtr + wrote ? : err; }
2.2 devkmem
与devmem之于/dev/mem类似,devkmem将/dev/kmem映射到用户空间,然后读取内容。
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#define DEVKMEM "/dev/kmem"

/*
 * devkmem - print the 32-bit value stored at a kernel virtual address.
 *
 * Usage: devkmem HEX_ADDRESS
 *
 * HEX_ADDRESS is parsed as hexadecimal (for example a symbol address taken
 * from /proc/kallsyms). The page containing the address is mapped read-only
 * from /dev/kmem and one unsigned int is read at the in-page offset.
 *
 * Returns EXIT_SUCCESS after printing the value, EXIT_FAILURE on a usage
 * error or when the page size lookup, open() or mmap() fails.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "Usage: %s HEX_ADDRESS\n",
                (argc > 0 && argv[0]) ? argv[0] : "devkmem");
        return EXIT_FAILURE;
    }

    /* Keep the address in unsigned long so it is not truncated on LP64. */
    char *end = NULL;
    errno = 0;
    unsigned long var_addr = strtoul(argv[1], &end, 16);
    if (errno != 0 || end == argv[1] || *end != '\0') {
        fprintf(stderr, "invalid address: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* Ask the system for the page size instead of assuming 4 KiB. */
    long page = sysconf(_SC_PAGESIZE);
    if (page <= 0) {
        perror("sysconf");
        return EXIT_FAILURE;
    }
    unsigned long page_size = (unsigned long)page;
    unsigned long offset_in_page = var_addr & (page_size - 1);

    /* A value that straddles a page boundary needs two pages mapped. */
    size_t map_size = page_size;
    if (offset_in_page + sizeof(unsigned int) > page_size)
        map_size *= 2;

    int fd = open(DEVKMEM, O_RDONLY);
    if (fd == -1) {
        perror("open");
        return EXIT_FAILURE;
    }

    void *mbase = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd,
                       (off_t)(var_addr - offset_in_page));
    if (mbase == MAP_FAILED) {
        /* Stop here: dereferencing MAP_FAILED would crash. */
        printf("map failed %s\n", strerror(errno));
        close(fd);
        return EXIT_FAILURE;
    }

    printf("varAddr = 0x%lX \n", var_addr);
    printf("mapbase = 0x%lX \n", (unsigned long)(uintptr_t)mbase);

    /* memcpy avoids an unaligned dereference when the address is odd. */
    unsigned int value;
    memcpy(&value, (const char *)mbase + offset_in_page, sizeof value);
    printf("value = 0x%X \n", value);

    close(fd);
    munmap(mbase, map_size);
    return EXIT_SUCCESS;
}
2.3 devkmem使用
devkmem需要输入地址,而这个地址是内核虚拟地址,随意指定一个地址是没有意义的。
需要通过/proc/kallsyms根据符号找到对应的内核虚拟地址,然后再通过devkmem查看其值。
比如想查看sysctl_sched_rt_runtime的值,首先查看其在内核的虚拟地址:
cat /proc/kallsyms | grep sysctl_sched_rt_runtime 808eb544 D sysctl_sched_rt_runtime
然后查看虚拟地址的值:
./devkmem 808eb544 varAddr = 0x808EB544 mapbase = 0x2ABFB000 value = 0xE7EF0
换算成10进制就是950000。
那么这个值对不对呢?cat /proc/sys/kernel/sched_rt_runtime_us表明结果正确。
改进点:
1.直接输入符号,显示所有符号的值。
2.不同输出格式,16进制、10进制等等。
参考文档:
《使用/dev/kmem读取内核变量的值》
《/dev/mem可没那么简单》
remap_pfn_range