auto NUMA balance
auto numa balance(代码基于kernel 5.3)
auto NUMA 改进了 NUMA 硬件系统中运行应用的性能。
自动化 NUMA 平衡启用时需满足以下两个条件:
# numactl --hardware
显示多个节点,以及
# cat /sys/kernel/debug/sched_features
的输出标识中包含 NUMA。
对应用程序进行手动 NUMA 调优会覆盖自动化 NUMA 平衡,并对这些应用程序禁用周期性的内存取消映射(unmap)、NUMA hinting fault、页迁移以及自动化 NUMA 放置。
禁用自动化 NUMA 平衡,请使用以下命令:
# echo 0 > /proc/sys/kernel/numa_balancing
启用(打开)自动化 NUMA 平衡,请使用以下命令:
# echo 1 > /proc/sys/kernel/numa_balancing
通常,当程序的线程访问的内存所在的 NUMA 节点与线程运行所在的节点相同时,性能最佳。 auto NUMA balance 会把任务(任务可能是线程或进程)移到离它们需要访问的内存更近的地方,同时也会移动应用程序的内存数据,使其更靠近访问这些数据的任务。 以上均在 auto NUMA balance 启用时由内核自动完成。
auto numa balance总结
内存跟着cpu走
- 周期性的标记task上部分内存中页表项flags上PROT_NONE bit,并刷出TLB
- 访存时,TLB miss,查页表,pte上看到PROT_NONE发生page fault
- 这时候,能知道了page所在的node,不符合migrate policy则迁移。一般情况是发生page fault所在的内存和task所在的cpu不在同一个NUMA node则迁移。
cpu跟着内存走
- 每一次numa fault时候,按照node和numa_group,统计page的位置关系。把这些信息看作一个采样。
- 算法计算如果迁移有收益,比如简单的讲,task发生过多的numa fault在某个node上(与task在的cpu不是同一个node),则迁移task到该node。
migrate policy
auto numa调用栈
调度器的实现基于两个函数:周期性调度器函数和主调度器函数。这些函数根据现有
进程的优先级分配CPU时间。
周期性调度器在scheduler_tick中实现。如果系统正在活动中,内核会按照频率HZ自动调用该函数 。该函数有下面两个主要任务。
管理内核中与整个系统和各个进程的调度相关的统计量。其执行的主要操作是对各种计数器加1。
激活负责当前进程的调度类的周期性调度方法。
kernel/sched/core.c -scheduler_tick
/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * Per-tick work visible in this excerpt: update the runqueue clock, hand
 * the tick to the scheduling class of the task currently running on this
 * CPU's runqueue, update load/PSI accounting, and (on SMP) trigger load
 * balancing.
 *
 * The "--》" arrow below is the article's marker for the call that the
 * walkthrough follows next; it is not C syntax.
 */
void scheduler_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
struct rq_flags rf;
sched_clock_tick();
/* Everything between rq_lock() and rq_unlock() runs with the rq lock held. */
rq_lock(rq, &rf);
update_rq_clock(rq);
/* Class-specific periodic hook; the article follows the CFS one (task_tick_fair). */
--》 curr->sched_class->task_tick(rq, curr, 0);
calc_global_load_tick(rq);
psi_task_tick(rq);
rq_unlock(rq, &rf);
perf_event_task_tick();
#ifdef CONFIG_SMP
/* Record whether this CPU is idle, then let the load balancer decide what to do. */
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
#endif
}
kernel中默认的调度器是完全公平调度器(CFS),所以直接看fair调度类的实现
kernel/sched/fair.c -task_tick_fair
/*
 * scheduler tick hitting a task of our scheduling class.
 *
 * NOTE: This function can be called remotely by the tick offload that
 * goes along full dynticks. Therefore no local assumption can be made
 * and everything must be accessed through the @rq and @curr passed in
 * parameters.
 *
 * @rq:     runqueue the tick is being processed for
 * @curr:   task receiving the tick
 * @queued: passed through unchanged to entity_tick()
 *
 * The "--》" arrow below is the article's marker for the call that the
 * walkthrough follows next; it is not C syntax.
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
// Starting from the task's own entity, run entity_tick() on the cfs_rq of
// every entity that for_each_sched_entity() yields.
// NOTE(review): the original note states that every task is represented by a
// scheduling entity and that entities are kept in a red-black tree acting as
// the run queue; neither is visible in this excerpt - confirm against fair.c.
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
// This branch is only taken when NUMA balancing is switched on
// (sched_numa_balancing static key enabled).
if (static_branch_unlikely(&sched_numa_balancing))
--》 task_tick_numa(rq, curr);
update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
}
kernel/sched/fair.c -task_tick_numa
/*
 * Drive the periodic memory faults..
 *
 * Called from the CFS tick. Once the task has accumulated one more
 * numa_scan_period of runtime, and the mm-wide next-scan time has been
 * reached, queue task_numa_work() as a task_work on @curr.
 *
 * The "--》" arrow below is the article's marker for the call that the
 * walkthrough follows next; it is not C syntax.
 */
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
struct callback_head *work = &curr->numa_work;
u64 period, now;
/*
 * We don't care about NUMA placement if we don't have memory.
 *
 * Also skip exiting tasks and the case work->next != work.
 * task_numa_work() sets work->next = work when it runs ("protect
 * against double add"), so a different value presumably means the work
 * is still queued.
 */
if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
return;
/*
 * Using runtime rather than walltime has the dual advantage that
 * we (mostly) drive the selection from busy threads and that the
 * task needs to have done some actual work before we bother with
 * NUMA placement.
 */
now = curr->se.sum_exec_runtime;
/* numa_scan_period is in milliseconds; convert to ns to compare with runtime. */
period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
if (now > curr->node_stamp + period) {
/* node_stamp == 0: first time through, pick the initial scan period. */
if (!curr->node_stamp)
curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
// Periodic scan: only queue the work once jiffies has reached
// mm->numa_next_scan.
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
// Queue a task_work whose callback is task_numa_work(). Its main job is to
// call change_prot_numa() so that PTEs mapped by the task's VMAs are changed
// to PAGE_NONE. The next access to such a page takes a page fault, and that
// fault path is where pages can be migrated and where the statistics used to
// move the task are collected.
--》 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
task_work_add(curr, work, true);
}
}
}
kernel/sched/fair.c -task_numa_work
/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 *
 * Marks a bounded amount of the current task's address space with NUMA
 * hinting protections (via change_prot_numa()), resuming from
 * mm->numa_scan_offset and saving the new offset when done.
 *
 * @work: the callback_head embedded in current->numa_work.
 *
 * The "--》" arrow below is the article's marker for the call that the
 * walkthrough follows next; it is not C syntax.
 */
void task_numa_work(struct callback_head *work)
{
unsigned long migrate, next_scan, now = jiffies;
struct task_struct *p = current;
struct mm_struct *mm = p->mm;
u64 runtime = p->se.sum_exec_runtime;
struct vm_area_struct *vma;
unsigned long start, end;
unsigned long nr_pte_updates = 0;
long pages, virtpages;
SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
work->next = work; /* protect against double add */
/*
 * Who cares about NUMA placement when they're dying.
 *
 * NOTE: make sure not to dereference p->mm before this check,
 * exit_task_work() happens _after_ exit_mm() so we could be called
 * without p->mm even though we still had it when we enqueued this
 * work.
 */
if (p->flags & PF_EXITING)
return;
/* First scan for this mm: delay it by sysctl_numa_balancing_scan_delay ms. */
if (!mm->numa_next_scan) {
mm->numa_next_scan = now +
msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
}
/*
 * Enforce maximal scan/migration frequency..
 */
migrate = mm->numa_next_scan;
if (time_before(now, migrate))
return;
if (p->numa_scan_period == 0) {
p->numa_scan_period_max = task_scan_max(p);
p->numa_scan_period = task_scan_start(p);
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
/*
 * Only the task whose cmpxchg installs the new next-scan time goes on
 * to scan; any task that sees a changed value returns.
 */
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
return;
/*
 * Delay this task enough that another task of this mm will likely win
 * the next time around.
 */
p->node_stamp += 2 * TICK_NSEC;
// Scan budget: resume at the saved offset. `pages` is the sysctl scan size
// converted from MB to pages; `virtpages` (8x pages) caps how much virtual
// address space may be walked in total.
start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
virtpages = pages * 8; /* Scan up to this much virtual space */
if (!pages)
return;
/* Best effort: skip this round rather than block on mmap_sem. */
if (!down_read_trylock(&mm->mmap_sem))
return;
vma = find_vma(mm, start);
if (!vma) {
/* Offset is past the last VMA: reset the scanner and restart from the first VMA. */
reset_ptenuma_scan(p);
start = 0;
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
continue;
}
/*
 * Shared library pages mapped by multiple processes are not
 * migrated as it is expected they are cache replicated. Avoid
 * hinting faults in read-only file-backed mappings or the vdso
 * as migrating the pages will be of marginal benefit.
 */
if (!vma->vm_mm ||
(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
continue;
/*
 * Skip inaccessible VMAs to avoid any confusion between
 * PROT_NONE and NUMA hinting ptes
 */
if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
continue;
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
end = min(end, vma->vm_end);
// The do-while walks this VMA chunk by chunk ([start, end)) until the end of
// the VMA is reached or the scan budget runs out.
// change_prot_numa() is called on each chunk to apply the PROT_NONE hinting
// protection to its PTEs.
--》 nr_pte_updates = change_prot_numa(vma, start, end);
/*
 * Try to scan sysctl_numa_balancing_size worth of
 * hpages that have at least one present PTE that
 * is not already pte-numa. If the VMA contains
 * areas that are unused or already full of prot_numa
 * PTEs, scan up to virtpages, to skip through those
 * areas faster.
 */
if (nr_pte_updates)
pages -= (end - start) >> PAGE_SHIFT;
virtpages -= (end - start) >> PAGE_SHIFT;
start = end;
if (pages <= 0 || virtpages <= 0)
goto out;
cond_resched();
} while (end != vma->vm_end);
}
out:
/*
 * It is possible to reach the end of the VMA list but the last few
 * VMAs are not guaranteed to the vma_migratable. If they are not, we
 * would find the !migratable VMA on the next scan but not reset the
 * scanner to the start so check it now.
 */
if (vma)
mm->numa_scan_offset = start;
else
reset_ptenuma_scan(p);
up_read(&mm->mmap_sem);
/*
 * Make sure tasks use at least 32x as much time to run other code
 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
 * Usually update_task_scan_period slows down scanning enough; on an
 * overloaded system we need to limit overhead on a per task basis.
 */
if (unlikely(p->se.sum_exec_runtime != runtime)) {
u64 diff = p->se.sum_exec_runtime - runtime;
p->node_stamp += 32 * diff;
}
}
mm/mempolicy.c -change_prot_numa
#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses to be inaccessible.
 * These are later cleared by a NUMA hinting fault. Depending on these
 * faults, pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * @vma:  VMA containing the range
 * @addr: start of the range
 * @end:  end of the range
 *
 * Returns the value reported by change_protection() for the range, and
 * adds it to the NUMA_PTE_UPDATES vm event counter when non-zero.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
/* NOTE(review): held in an int although the function returns unsigned long. */
int nr_updated;
/*
 * NOTE(review): the trailing "0, 1" arguments are presumably
 * dirty_accountable = 0 and prot_numa = 1 - confirm against the
 * change_protection() prototype, which is not shown here.
 */
nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
return nr_updated;
}
除了在pte flag上设置了prot none bit,还有重要的一步操作:flush掉tlb。
下一次访问,必定产生tlb miss,这时候page walk看页表,发现prot none bit,然后走page fault
mm/memory.c -handle_pte_fault
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures). The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_sem may have been released depending on flags and our return value.
 * See filemap_fault() and __lock_page_or_retry().
 *
 * Dispatch order visible below: no PTE -> anonymous / file fault;
 * PTE not present -> swap; PROT_NONE PTE in an accessible VMA -> NUMA
 * hinting fault (do_numa_page); otherwise write-protect / access-flag
 * handling under the page table lock.
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
pte_t entry;
if (unlikely(pmd_none(*vmf->pmd))) {
/*
 * Leave __pte_alloc() until later: because vm_ops->fault may
 * want to allocate huge page, and if we expose page table
 * for an instant, it will be difficult to retract from
 * concurrent faults and from rmap lookups.
 */
vmf->pte = NULL;
} else {
/* See comment in pte_alloc_one_map() */
if (pmd_devmap_trans_unstable(vmf->pmd))
return 0;
/*
 * A regular pmd is established and it can't morph into a huge
 * pmd from under us anymore at this point because we hold the
 * mmap_sem read mode and khugepaged takes it in write mode.
 * So now it's safe to run pte_offset_map().
 */
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
vmf->orig_pte = *vmf->pte;
/*
 * some architectures can have larger ptes than wordsize,
 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
 * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
 * accesses. The code below just needs a consistent view
 * for the ifs and we later double check anyway with the
 * ptl lock held. So here a barrier will do.
 */
barrier();
if (pte_none(vmf->orig_pte)) {
pte_unmap(vmf->pte);
vmf->pte = NULL;
}
}
/* No PTE at all: first touch of the address. */
if (!vmf->pte) {
if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);
else
return do_fault(vmf);
}
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
// The PROT_NONE bit on the PTE is the marker set by the NUMA scanner. Checking
// that the VMA itself is accessible filters out pages that really are mapped
// without read/write/exec permission, so what remains is a NUMA hinting fault.
if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
return do_numa_page(vmf);
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
/* The PTE changed since it was sampled above: nothing to do for this fault. */
if (unlikely(!pte_same(*vmf->pte, entry)))
goto unlock;
if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(vmf);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
vmf->flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
/*
 * This is needed only for protection faults but the arch code
 * is not yet telling us if this is a protection fault or not.
 * This still avoids useless tlb flushes for .text page faults
 * with threads.
 */
if (vmf->flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
}
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
mm/memory.c -do_numa_page
/*
 * Handle a NUMA hinting fault on a single PTE.
 *
 * Steps visible below: re-validate the PTE under the page table lock,
 * restore the VMA's normal protection on it, look up the backing page,
 * ask numa_migrate_prep() for a target node, attempt the page migration
 * if one was returned, and finally report the fault to the scheduler via
 * task_numa_fault(). Always returns 0.
 *
 * The "--》" arrows below are the article's markers for the calls that
 * the walkthrough follows next; they are not C syntax.
 */
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
int page_nid = NUMA_NO_NODE;
int last_cpupid;
int target_nid;
bool migrated = false;
pte_t pte, old_pte;
bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
/*
 * The "pte" at this point cannot be used safely without
 * validation through pte_unmap_same(). It's of NUMA type but
 * the pfn may be screwed if the read is non atomic.
 */
vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
goto out;
}
/*
 * Make it present again, Depending on how arch implementes non
 * accessible ptes, some can allow access by kernel mode.
 */
old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
/* Restore the write bit that was saved when the hinting protection was applied. */
if (was_writable)
pte = pte_mkwrite(pte);
ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
page = vm_normal_page(vma, vmf->address, pte);
if (!page) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/* TODO: handle PTE-mapped THP */
if (PageCompound(page)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
/*
 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
 * much anyway since they can be in shared cache state. This misses
 * the case where a mapping is writable but the process never writes
 * to it but pte_write gets cleared during protection updates and
 * pte_dirty has unpredictable behaviour between PTE scan updates,
 * background writeback, dirty balancing and application behaviour.
 */
if (!pte_write(pte))
flags |= TNF_NO_GROUP;
/*
 * Flag if the page is shared between multiple address spaces. This
 * is later used when determining whether to group tasks together
 */
if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
last_cpupid = page_cpupid_last(page);
// page_to_nid() gives the node the page currently lives on.
// NOTE(review): the original note says the zone and node are stored in the
// upper 8 bits of page->flags; the bit layout is not shown here - confirm.
page_nid = page_to_nid(page);
// numa_migrate_prep() decides whether the page should move. Per the original
// note (the helper is not shown here): if the page's node differs from the
// node of the CPU the task is running on and the placement does not satisfy
// the migrate policy, it returns the destination node, i.e. the task's node.
// A result of NUMA_NO_NODE (-1) means no migration: the page is already on
// the right node or already satisfies the policy.
--》 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
pte_unmap_unlock(vmf->pte, vmf->ptl);
if (target_nid == NUMA_NO_NODE) {
put_page(page);
goto out;
}
/* Migrate to the requested node */
// Try to move the page to target_nid; on success page_nid is updated so the
// fault is accounted against the page's new node.
--》 migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated) {
page_nid = target_nid;
flags |= TNF_MIGRATED;
} else
flags |= TNF_MIGRATE_FAIL;
out:
// Reached after a migration attempt, when no target node was chosen, and when
// the PTE changed under us. In the last case page_nid is still NUMA_NO_NODE
// and nothing is recorded.
if (page_nid != NUMA_NO_NODE)
// Report the hinting fault to the scheduler. task_numa_fault() collects the
// per-node statistics that can later lead to the task itself being moved.
--》 task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
}
接下来进行 NUMA fault 统计:以 NUMA node 为单位,累计发生 NUMA hinting page fault 的页所在的 node;如果页已经被迁移,则按迁移后的新 node 记录。
kernel/sched/fair.c -task_numa_fault
/*
 * Allocate task-specific structure for placement policy here
 * Got a PROT_NONE fault for a page on @node.
 *
 * @last_cpupid: cpupid last recorded on the faulting page
 * @mem_node:    node the page is on (the new node if it was just migrated)
 * @pages:       number of pages this fault accounts for
 * @flags:       TNF_* flags describing the fault
 *
 * The "--》" arrow below is the article's marker for the call that the
 * walkthrough follows next; it is not C syntax.
 */
void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
{
struct task_struct *p = current;
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
int local = !!(flags & TNF_FAULT_LOCAL);
struct numa_group *ng;
int priv;
if (!static_branch_likely(&sched_numa_balancing))
return;
/* for example, ksmd faulting in a user's mm */
if (!p->mm)
return;
/* Allocate buffer to track faults on a per-node basis */
// Allocated lazily on the first fault: one zeroed array of
// NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids counters. If the allocation fails
// the fault is simply not recorded.
if (unlikely(!p->numa_faults)) {
int size = sizeof(*p->numa_faults) *
NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
if (!p->numa_faults)
return;
p->total_numa_faults = 0;
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
/*
 * First accesses are treated as private, otherwise consider accesses
 * to be private if the accessing pid has not changed
 */
// (-1 & LAST_CPUPID_MASK) is the value treated as "first access". A non-private
// fault that is allowed to group (no TNF_NO_GROUP) goes through
// task_numa_group(), which may update priv.
if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
priv = 1;
} else {
priv = cpupid_match_pid(p, last_cpupid);
if (!priv && !(flags & TNF_NO_GROUP))
task_numa_group(p, last_cpupid, flags, &priv);
}
/*
 * If a workload spans multiple NUMA nodes, a shared fault that
 * occurs wholly within the set of nodes that the workload is
 * actively using should be counted as local. This allows the
 * scan rate to slow down when a workload has settled down.
 */
// Concretely: a shared, non-local fault is re-classified as local when the
// task's group has more than one active node and both cpu_node and mem_node
// are among them.
ng = deref_curr_numa_group(p);
if (!priv && !local && ng && ng->active_nodes > 1 &&
numa_is_active_node(cpu_node, ng) &&
numa_is_active_node(mem_node, ng))
local = 1;
/*
 * Retry to migrate task to preferred node periodically, in case it
 * previously failed, or the scheduler moved us.
 */
// Once jiffies passes p->numa_migrate_retry: recompute the preferred node
// (task_numa_placement) and then try to move the task towards it
// (numa_migrate_preferred).
if (time_after(jiffies, p->numa_migrate_retry)) {
--》 task_numa_placement(p);
numa_migrate_preferred(p);
}
if (migrated)
p->numa_pages_migrated += pages;
/* Slot 2 of numa_faults_locality counts faults whose page migration failed. */
if (flags & TNF_MIGRATE_FAIL)
p->numa_faults_locality[2] += pages;
/*
 * Account the fault in the buffered (MEMBUF/CPUBUF) counters, by the
 * node the memory is on and by the node the task is running on, then
 * in the remote (0) / local (1) locality slot.
 */
p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
p->numa_faults_locality[local] += pages;
}
kernel/sched/fair.c -task_numa_placement
/*
 * Scheduling placement policy hints go here
 *
 * Runs at most once per mm scan sequence: folds the faults buffered since
 * the last scan into the decayed per-node statistics (task and, if any,
 * numa_group), finds the node with the most faults, records it as the
 * task's preferred node if it changed, and updates the scan period.
 *
 * The "--》" arrow below is the article's marker for the call that the
 * walkthrough follows next; it is not C syntax.
 */
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = NUMA_NO_NODE;
unsigned long max_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
unsigned long total_faults;
u64 runtime, period;
spinlock_t *group_lock = NULL;
struct numa_group *ng;
/*
 * The p->mm->numa_scan_seq field gets updated without
 * exclusive access. Use READ_ONCE() here to ensure
 * that the field is read in a single access:
 */
seq = READ_ONCE(p->mm->numa_scan_seq);
/* Nothing new since the last time this task processed its statistics. */
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
p->numa_scan_period_max = task_scan_max(p);
total_faults = p->numa_faults_locality[0] +
p->numa_faults_locality[1];
runtime = numa_get_avg_runtime(p, &period);
/* If the task is part of a group prevent parallel updates to group stats */
ng = deref_curr_numa_group(p);
if (ng) {
group_lock = &ng->lock;
spin_lock_irq(group_lock);
}
/* Find the node with the highest number of faults */
// For a task in a numa_group the group's per-node fault counts decide;
// otherwise the task's own counts are used (see the comparison at the end of
// the loop body).
for_each_online_node(nid) {
/* Keep track of the offsets in numa_faults array */
int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
unsigned long faults = 0, group_faults = 0;
int priv;
for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
long diff, f_diff, f_weight;
mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
/* Decay existing window, copy faults since last scan */
diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
fault_types[priv] += p->numa_faults[membuf_idx];
p->numa_faults[membuf_idx] = 0;
/*
 * Normalize the faults_from, so all tasks in a group
 * count according to CPU use, instead of by the raw
 * number of faults. Tasks with little runtime have
 * little over-all impact on throughput, and thus their
 * faults are less important.
 */
f_weight = div64_u64(runtime << 16, period + 1);
f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
(total_faults + 1);
f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
p->numa_faults[cpubuf_idx] = 0;
/* Adding diff amounts to: new = old / 2 + faults buffered since last scan. */
p->numa_faults[mem_idx] += diff;
p->numa_faults[cpu_idx] += f_diff;
faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
if (ng) {
/*
 * safe because we can only change our own group
 *
 * mem_idx represents the offset for a given
 * nid and priv in a specific region because it
 * is at the beginning of the numa_faults array.
 */
ng->faults[mem_idx] += diff;
ng->faults_cpu[mem_idx] += f_diff;
ng->total_faults += diff;
group_faults += ng->faults[mem_idx];
}
}
if (!ng) {
if (faults > max_faults) {
max_faults = faults;
max_nid = nid;
}
} else if (group_faults > max_faults) {
max_faults = group_faults;
max_nid = nid;
}
}
if (ng) {
numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_nid);
}
if (max_faults) {
/* Set the new preferred node */
// No threshold is involved here: if any faults were seen and the winning node
// differs from the current preferred node, it becomes the new preferred node.
// sched_setnuma() only records p->numa_preferred_nid; it does not by itself
// move the task to another CPU.
if (max_nid != p->numa_preferred_nid)
--》 sched_setnuma(p, max_nid);
}
/* fault_types[0] / [1] are the buffered fault totals for priv == 0 / priv == 1. */
update_task_scan_period(p, fault_types[0], fault_types[1]);
}
kernel/sched/fair.c -sched_setnuma
/*
 * Requeue a task on a given node and accurately track the number of NUMA
 * tasks on the runqueues
 *
 * @p:   task whose preferred node is being changed
 * @nid: new value for p->numa_preferred_nid
 *
 * The only field written is p->numa_preferred_nid. The task is taken off
 * the runqueue (and put as the previous task if it is running) before
 * the write and put back afterwards, all under the task's rq lock. The
 * task stays on the same runqueue; this function does not migrate it.
 */
void sched_setnuma(struct task_struct *p, int nid)
{
bool queued, running;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
/* Take the task out of the scheduler's bookkeeping while the field changes. */
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
p->numa_preferred_nid = nid;
/* Restore the task exactly where it was (same rq, same running state). */
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_curr_task(rq, p);
task_rq_unlock(rq, p, &rf);
}