Linux内核最新的连续内存分配器(CMA)——避免预留大块内存【转】
转自:https://blog.csdn.net/21cnbao/article/details/7309757
在我们使用ARM等嵌入式Linux系统的时候,一个头疼的问题是GPU,Camera,HDMI等都需要预留大量连续内存,这部分内存平时不用,但是一般的做法又必须先预留着。目前,Marek Szyprowski和Michal Nazarewicz实现了一套全新的Contiguous Memory Allocator。通过这套机制,我们可以做到不预留内存,这些内存平时是可用的,只有当需要的时候才被分配给Camera,HDMI等设备。下面分析它的基本代码流程。
声明连续内存
内核启动过程中arch/arm/mm/init.c中的arm_memblock_init()会调用dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));
该函数位于:drivers/base/dma-contiguous.c
-
/**
-
* dma_contiguous_reserve() - reserve area for contiguous memory handling
-
* @limit: End address of the reserved memory (optional, 0 for any).
-
*
-
* This function reserves memory from early allocator. It should be
-
* called by arch specific code once the early allocator (memblock or bootmem)
-
* has been activated and all other subsystems have already allocated/reserved
-
* memory.
-
*/
-
void __init dma_contiguous_reserve(phys_addr_t limit)
-
{
-
unsigned long selected_size = 0;
-
-
pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit);
-
-
if (size_cmdline != -1) {
-
selected_size = size_cmdline;
-
} else {
-
#ifdef CONFIG_CMA_SIZE_SEL_MBYTES
-
selected_size = size_bytes;
-
-
selected_size = cma_early_percent_memory();
-
-
selected_size = min(size_bytes, cma_early_percent_memory());
-
-
selected_size = max(size_bytes, cma_early_percent_memory());
-
-
}
-
-
if (selected_size) {
-
pr_debug("%s: reserving %ld MiB for global area\n", __func__,
-
selected_size / SZ_1M);
-
-
dma_declare_contiguous(NULL, selected_size, 0, limit);
-
}
-
};
其中的size_bytes定义为:
static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M; 默认情况下,CMA_SIZE_MBYTES会被定义为16MB,来源于CONFIG_CMA_SIZE_MBYTES=16
->
-
/*
 * dma_declare_contiguous() - reserve a contiguous area from memblock.
 * @dev:   device the area is recorded for (stored in r->dev).
 * @size:  size of the area in bytes.
 * @base:  fixed physical base address to reserve, or 0 to let memblock
 *         choose one.
 * @limit: upper bound passed to __memblock_alloc_base() when @base is 0.
 *
 * Only the reservation part of the function is quoted; `r` and `alignment`
 * are set up in the elided head of the function.
 *
 * Returns 0 on success. On failure the negative errno that was stored in
 * @base (-EBUSY, -ENOMEM or -EINVAL) is returned.
 */
int __init dma_declare_contiguous(struct device *dev, unsigned long size,
				  phys_addr_t base, phys_addr_t limit)
{
	...
	/* Reserve memory */
	if (base) {
		/* Caller asked for a fixed address: it must be free and reservable. */
		if (memblock_is_region_reserved(base, size) ||
		    memblock_reserve(base, size) < 0) {
			base = -EBUSY;
			goto err;
		}
	} else {
		/*
		 * Use __memblock_alloc_base() since
		 * memblock_alloc_base() panic()s.
		 */
		phys_addr_t addr = __memblock_alloc_base(size, alignment, limit);
		if (!addr) {
			base = -ENOMEM;
			goto err;
		} else if (addr + size > ~(unsigned long)0) {
			/* End of the area does not fit in an unsigned long: give it back. */
			memblock_free(addr, size);
			base = -EINVAL;
			goto err;
		} else {
			base = addr;
		}
	}

	/*
	 * Each reserved area must be initialised later, when more kernel
	 * subsystems (like slab allocator) are available.
	 */
	r->start = base;
	r->size = size;
	r->dev = dev;
	cma_reserved_count++;
	pr_info("CMA: reserved %ld MiB at %08lx\n", size / SZ_1M,
		(unsigned long)base);

	/* Architecture specific contiguous memory fixup. */
	dma_contiguous_early_fixup(base, size);
	return 0;
err:
	pr_err("CMA: failed to reserve %ld MiB\n", size / SZ_1M);
	return base;
}
由此可见,连续内存区域也是在内核启动的早期,通过__memblock_alloc_base()拿到的。
另外:
drivers/base/dma-contiguous.c里面的core_initcall()会导致cma_init_reserved_areas()被调用:
-
/*
 * Turn every area recorded in cma_reserved[] into a CMA area and attach it
 * to the device stored with the record. Areas for which cma_create_area()
 * returns an error pointer are silently skipped. Always returns 0.
 */
static int __init cma_init_reserved_areas(void)
{
	struct cma_reserved *area = cma_reserved;
	unsigned remaining = cma_reserved_count;

	pr_debug("%s()\n", __func__);

	while (remaining) {
		/* The record holds bytes; cma_create_area() is given a PFN and a page count. */
		struct cma *cma = cma_create_area(PFN_DOWN(area->start),
						  area->size >> PAGE_SHIFT);

		if (!IS_ERR(cma))
			dev_set_cma_area(area->dev, cma);

		--remaining;
		++area;
	}

	return 0;
}
core_initcall(cma_init_reserved_areas);
cma_create_area()会调用cma_activate_area(),cma_activate_area()函数则会针对区域内的每个pageblock调用:
init_cma_reserved_pageblock(pfn_to_page(base_pfn));
这个函数则会通过set_pageblock_migratetype(page, MIGRATE_CMA)将页设置为MIGRATE_CMA类型的:
-
-
/* Free whole pageblock and set it's migration type to MIGRATE_CMA. */
-
void __init init_cma_reserved_pageblock(struct page *page)
-
{
-
unsigned i = pageblock_nr_pages;
-
struct page *p = page;
-
-
do {
-
__ClearPageReserved(p);
-
set_page_count(p, 0);
-
} while (++p, --i);
-
-
set_page_refcounted(page);
-
set_pageblock_migratetype(page, MIGRATE_CMA);
-
__free_pages(page, pageblock_order);
-
totalram_pages += pageblock_nr_pages;
-
}
-
同时其中调用的__free_pages(page, pageblock_order);最终会调用到__free_one_page(page, zone, order, migratetype);
相关的page会被加到MIGRATE_CMA的free_list上面去:
list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
申请连续内存
申请连续内存仍然使用标准的arch/arm/mm/dma-mapping.c中定义的dma_alloc_coherent()和dma_alloc_writecombine(),这二者会间接调用drivers/base/dma-contiguous.c中的
-
struct page *dma_alloc_from_contiguous(struct device *dev, int count,
-
unsigned int align)
->
-
struct page *dma_alloc_from_contiguous(struct device *dev, int count,
-
unsigned int align)
-
{
-
...
-
-
for (;;) {
-
pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count,
-
start, count, mask);
-
if (pageno >= cma->count) {
-
ret = -ENOMEM;
-
goto error;
-
}
-
-
pfn = cma->base_pfn + pageno;
-
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA);
-
if (ret == 0) {
-
bitmap_set(cma->bitmap, pageno, count);
-
break;
-
} else if (ret != -EBUSY) {
-
goto error;
-
}
-
pr_debug("%s(): memory range at %p is busy, retrying\n",
-
__func__, pfn_to_page(pfn));
-
/* try again with a bit different memory target */
-
start = pageno + mask + 1;
-
}
-
...
-
-
}
-
->
int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype)
需要隔离page,隔离page的作用通过代码的注释可以体现:
-
/*
-
* What we do here is we mark all pageblocks in range as
-
* MIGRATE_ISOLATE. Because of the way page allocator work, we
-
* align the range to MAX_ORDER pages so that page allocator
-
* won't try to merge buddies from different pageblocks and
-
* change MIGRATE_ISOLATE to some other migration type.
-
*
-
* Once the pageblocks are marked as MIGRATE_ISOLATE, we
-
* migrate the pages from an unaligned range (ie. pages that
-
* we are interested in). This will put all the pages in
-
* range back to page allocator as MIGRATE_ISOLATE.
-
*
-
* When this is done, we take the pages in range from page
-
* allocator removing them from the buddy system. This way
-
* page allocator will never consider using them.
-
*
-
* This lets us mark the pageblocks back as
-
* MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
-
* MAX_ORDER aligned range but not in the unaligned, original
-
* range are put back to page allocator so that buddy can use
-
* them.
-
*/
-
-
ret = start_isolate_page_range(pfn_align_to_maxpage_down(start),
-
pfn_align_to_maxpage_up(end),
-
migratetype);
简单地说,就是把相关的page标记为MIGRATE_ISOLATE,这样buddy系统就不会再使用他们。
-
/*
-
* start_isolate_page_range() -- make page-allocation-type of range of pages
-
* to be MIGRATE_ISOLATE.
-
* @start_pfn: The lower PFN of the range to be isolated.
-
* @end_pfn: The upper PFN of the range to be isolated.
-
* @migratetype: migrate type to set in error recovery.
-
*
-
* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
-
* the range will never be allocated. Any free pages and pages freed in the
-
* future will not be allocated again.
-
*
-
* start_pfn/end_pfn must be aligned to pageblock_order.
-
* Returns 0 on success and -EBUSY if any part of range cannot be isolated.
-
*/
-
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
-
unsigned migratetype)
-
{
-
unsigned long pfn;
-
unsigned long undo_pfn;
-
struct page *page;
-
-
BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
-
BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
-
-
for (pfn = start_pfn;
-
pfn < end_pfn;
-
pfn += pageblock_nr_pages) {
-
page = __first_valid_page(pfn, pageblock_nr_pages);
-
if (page && set_migratetype_isolate(page)) {
-
undo_pfn = pfn;
-
goto undo;
-
}
-
}
-
return 0;
-
undo:
-
for (pfn = start_pfn;
-
pfn < undo_pfn;
-
pfn += pageblock_nr_pages)
-
unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
-
-
return -EBUSY;
-
}
接下来调用__alloc_contig_migrate_range()进行页面隔离和迁移:
-
static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
-
{
-
/* This function is based on compact_zone() from compaction.c. */
-
-
unsigned long pfn = start;
-
unsigned int tries = 0;
-
int ret = 0;
-
-
struct compact_control cc = {
-
.nr_migratepages = 0,
-
.order = -1,
-
.zone = page_zone(pfn_to_page(start)),
-
.sync = true,
-
};
-
INIT_LIST_HEAD(&cc.migratepages);
-
-
migrate_prep_local();
-
-
while (pfn < end || !list_empty(&cc.migratepages)) {
-
if (fatal_signal_pending(current)) {
-
ret = -EINTR;
-
break;
-
}
-
-
if (list_empty(&cc.migratepages)) {
-
cc.nr_migratepages = 0;
-
pfn = isolate_migratepages_range(cc.zone, &cc,
-
pfn, end);
-
if (!pfn) {
-
ret = -EINTR;
-
break;
-
}
-
tries = 0;
-
} else if (++tries == 5) {
-
ret = ret < 0 ? ret : -EBUSY;
-
break;
-
}
-
-
ret = migrate_pages(&cc.migratepages,
-
__alloc_contig_migrate_alloc,
-
0, false, true);
-
}
-
-
putback_lru_pages(&cc.migratepages);
-
return ret > 0 ? 0 : ret;
-
}
其中的函数migrate_pages()会完成页面的迁移,迁移过程中通过传入的__alloc_contig_migrate_alloc()申请新的page,并将老的page的内容迁移到新的page:
-
int migrate_pages(struct list_head *from,
-
new_page_t get_new_page, unsigned long private, bool offlining,
-
bool sync)
-
{
-
int retry = 1;
-
int nr_failed = 0;
-
int pass = 0;
-
struct page *page;
-
struct page *page2;
-
int swapwrite = current->flags & PF_SWAPWRITE;
-
int rc;
-
-
if (!swapwrite)
-
current->flags |= PF_SWAPWRITE;
-
-
for(pass = 0; pass < 10 && retry; pass++) {
-
retry = 0;
-
-
list_for_each_entry_safe(page, page2, from, lru) {
-
cond_resched();
-
-
rc = unmap_and_move(get_new_page, private,
-
page, pass > 2, offlining,
-
sync);
-
-
switch(rc) {
-
case -ENOMEM:
-
goto out;
-
case -EAGAIN:
-
retry++;
-
break;
-
case 0:
-
break;
-
default:
-
/* Permanent failure */
-
nr_failed++;
-
break;
-
}
-
}
-
}
-
rc = 0;
-
...
-
}
其中的unmap_and_move()函数较为关键,它定义在mm/migrate.c中
-
/*
-
* Obtain the lock on page, remove all ptes and migrate the page
-
* to the newly allocated page in newpage.
-
*/
-
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-
struct page *page, int force, bool offlining, bool sync)
-
{
-
int rc = 0;
-
int *result = NULL;
-
struct page *newpage = get_new_page(page, private, &result);
-
int remap_swapcache = 1;
-
int charge = 0;
-
struct mem_cgroup *mem = NULL;
-
struct anon_vma *anon_vma = NULL;
-
-
...
-
-
/* charge against new page */
-
charge = mem_cgroup_prepare_migration(page, newpage, &mem);
-
...
-
-
if (PageWriteback(page)) {
-
if (!force || !sync)
-
goto uncharge;
-
wait_on_page_writeback(page);
-
}
-
/*
-
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
-
* we cannot notice that anon_vma is freed while we migrates a page.
-
* This get_anon_vma() delays freeing anon_vma pointer until the end
-
* of migration. File cache pages are no problem because of page_lock()
-
* File Caches may use write_page() or lock_page() in migration, then,
-
* just care Anon page here.
-
*/
-
if (PageAnon(page)) {
-
/*
-
* Only page_lock_anon_vma() understands the subtleties of
-
* getting a hold on an anon_vma from outside one of its mms.
-
*/
-
anon_vma = page_lock_anon_vma(page);
-
if (anon_vma) {
-
/*
-
* Take a reference count on the anon_vma if the
-
* page is mapped so that it is guaranteed to
-
* exist when the page is remapped later
-
*/
-
get_anon_vma(anon_vma);
-
page_unlock_anon_vma(anon_vma);
-
} else if (PageSwapCache(page)) {
-
/*
-
* We cannot be sure that the anon_vma of an unmapped
-
* swapcache page is safe to use because we don't
-
* know in advance if the VMA that this page belonged
-
* to still exists. If the VMA and others sharing the
-
* data have been freed, then the anon_vma could
-
* already be invalid.
-
*
-
* To avoid this possibility, swapcache pages get
-
* migrated but are not remapped when migration
-
* completes
-
*/
-
remap_swapcache = 0;
-
} else {
-
goto uncharge;
-
}
-
}
-
-
...
-
/* Establish migration ptes or remove ptes */
-
try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
-
-
skip_unmap:
-
if (!page_mapped(page))
-
rc = move_to_new_page(newpage, page, remap_swapcache);
-
-
if (rc && remap_swapcache)
-
remove_migration_ptes(page, page);
-
-
/* Drop an anon_vma reference if we took one */
-
if (anon_vma)
-
drop_anon_vma(anon_vma);
-
-
uncharge:
-
if (!charge)
-
mem_cgroup_end_migration(mem, page, newpage, rc == 0);
-
unlock:
-
unlock_page(page);
-
-
move_newpage:
-
...
-
}
通过unmap_and_move(),老的page就被迁移过去新的page。
接下来要回收page,回收page的作用是,不至于因为拿了连续的内存后,系统变得内存饥饿:
->
-
/*
-
* Reclaim enough pages to make sure that contiguous allocation
-
* will not starve the system.
-
*/
-
__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
->
-
/*
-
* Trigger memory pressure bump to reclaim some pages in order to be able to
-
* allocate 'count' pages in single page units. Does similar work as
-
*__alloc_pages_slowpath() function.
-
*/
-
static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
-
{
-
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-
struct zonelist *zonelist = node_zonelist(0, gfp_mask);
-
int did_some_progress = 0;
-
int order = 1;
-
unsigned long watermark;
-
-
/*
-
* Increase level of watermarks to force kswapd do his job
-
* to stabilise at new watermark level.
-
*/
-
__update_cma_watermarks(zone, count);
-
-
/* Obey watermarks as if the page was being allocated */
-
watermark = low_wmark_pages(zone) + count;
-
while (!zone_watermark_ok(zone, 0, watermark, 0, 0)) {
-
wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
-
-
did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
-
NULL);
-
if (!did_some_progress) {
-
/* Exhausted what can be done so it's blamo time */
-
out_of_memory(zonelist, gfp_mask, order, NULL);
-
}
-
}
-
-
/* Restore original watermark levels. */
-
__update_cma_watermarks(zone, -count);
-
-
return count;
-
}
释放连续内存
内存释放的时候也比较简单,直接就是:
arch/arm/mm/dma-mapping.c:
void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle)
->
arch/arm/mm/dma-mapping.c:
-
static void __free_from_contiguous(struct device *dev, struct page *page,
-
size_t size)
-
{
-
__dma_remap(page, size, pgprot_kernel);
-
dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
-
}
->
-
bool dma_release_from_contiguous(struct device *dev, struct page *pages,
-
int count)
-
{
-
...
-
free_contig_range(pfn, count);
-
..
-
-
}
->
-
/*
 * Free nr_pages consecutive pages, one page at a time, starting at
 * page frame number pfn.
 */
void free_contig_range(unsigned long pfn, unsigned nr_pages)
{
	while (nr_pages--) {
		__free_page(pfn_to_page(pfn));
		pfn++;
	}
}
将page交还给buddy。
内核内存分配的migratetype
内核内存分配的时候,带的标志是GFP_,但是GFP_可以转化为migratetype:
-
/*
 * Convert GFP allocation flags into a migrate type.
 *
 * Returns MIGRATE_UNMOVABLE when grouping by mobility is disabled.
 * Otherwise the result is a two-bit value: bit 1 is set when
 * __GFP_MOVABLE is present and bit 0 when __GFP_RECLAIMABLE is present.
 * Having every bit of GFP_MOVABLE_MASK set at once triggers a warning.
 */
static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
	int movable;
	int reclaimable;

	WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);

	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

	/* Group based on mobility */
	movable = (gfp_flags & __GFP_MOVABLE) != 0;
	reclaimable = (gfp_flags & __GFP_RECLAIMABLE) != 0;

	return (movable << 1) | reclaimable;
}
之后申请内存的时候,会对比迁移类型匹配的free_list:
-
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
-
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
-
preferred_zone, migratetype);
另外,笔者也编写了一个测试程序,透过它随时测试CMA的功能:
-
/*
-
* kernel module helper for testing CMA
-
*
-
* Licensed under GPLv2 or later.
-
*/
-
-
-
-
-
-
-
-
-
static struct device *cma_dev;
-
static dma_addr_t dma_phys[CMA_NUM];
-
static void *dma_virt[CMA_NUM];
-
-
/* any read request will free coherent memory, eg.
-
* cat /dev/cma_test
-
*/
-
static ssize_t
-
cma_test_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
-
{
-
int i;
-
-
for (i = 0; i < CMA_NUM; i++) {
-
if (dma_virt[i]) {
-
dma_free_coherent(cma_dev, (i + 1) * SZ_1M, dma_virt[i], dma_phys[i]);
-
_dev_info(cma_dev, "free virt: %p phys: %p\n", dma_virt[i], (void *)dma_phys[i]);
-
dma_virt[i] = NULL;
-
break;
-
}
-
}
-
return 0;
-
}
-
-
/*
-
* any write request will alloc coherent memory, eg.
-
* echo 0 > /dev/cma_test
-
*/
-
static ssize_t
-
cma_test_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
-
{
-
int i;
-
int ret;
-
-
for (i = 0; i < CMA_NUM; i++) {
-
if (!dma_virt[i]) {
-
dma_virt[i] = dma_alloc_coherent(cma_dev, (i + 1) * SZ_1M, &dma_phys[i], GFP_KERNEL);
-
-
if (dma_virt[i]) {
-
void *p;
-
/* touch every page in the allocated memory */
-
for (p = dma_virt[i]; p < dma_virt[i] + (i + 1) * SZ_1M; p += PAGE_SIZE)
-
*(u32 *)p = 0;
-
-
_dev_info(cma_dev, "alloc virt: %p phys: %p\n", dma_virt[i], (void *)dma_phys[i]);
-
} else {
-
dev_err(cma_dev, "no mem in CMA area\n");
-
ret = -ENOMEM;
-
}
-
break;
-
}
-
}
-
-
return count;
-
}
-
-
static const struct file_operations cma_test_fops = {
-
.owner = THIS_MODULE,
-
.read = cma_test_read,
-
.write = cma_test_write,
-
};
-
-
static struct miscdevice cma_test_misc = {
-
.name = "cma_test",
-
.fops = &cma_test_fops,
-
};
-
-
/*
 * Module entry point: register the misc device and use its struct device
 * for all later DMA calls.
 *
 * Returns 0 on success or the error returned by misc_register().
 */
static int __init cma_test_init(void)
{
	int err = misc_register(&cma_test_misc);

	if (unlikely(err)) {
		pr_err("failed to register cma test misc device!\n");
		return err;
	}

	cma_dev = cma_test_misc.this_device;
	/* Set every bit of the coherent DMA mask. */
	cma_dev->coherent_dma_mask = ~0;
	_dev_info(cma_dev, "registered.\n");

	return 0;
}
module_init(cma_test_init);
-
-
/*
 * Module exit point: release every buffer that is still allocated, then
 * remove the misc device.
 *
 * The buffers are freed first because dma_free_coherent() is called with
 * cma_dev, which is the misc device's own struct device.
 */
static void __exit cma_test_exit(void)
{
	int i;

	for (i = 0; i < CMA_NUM; i++) {
		if (!dma_virt[i])
			continue;
		/* Slot i was allocated with (i + 1) MiB. */
		dma_free_coherent(cma_dev, (i + 1) * SZ_1M, dma_virt[i], dma_phys[i]);
		dma_virt[i] = NULL;
	}

	misc_deregister(&cma_test_misc);
}
module_exit(cma_test_exit);
-
-
MODULE_LICENSE("GPL");
-
MODULE_AUTHOR("Barry Song <21cnbao@gmail.com>");
-
MODULE_DESCRIPTION("kernel module to help the test of CMA");
-
MODULE_ALIAS("CMA test");
申请内存:
# echo 0 > /dev/cma_test
释放内存:
# cat /dev/cma_test
参考链接:
[1] http://www.spinics.net/lists/arm-kernel/msg160854.html
[2] http://www.spinics.net/lists/arm-kernel/msg162063.html
[3] http://lwn.net/Articles/447405/