代码改变世界

Nova 实现的 Fit Instance NUMA to Host NUMA 算法

2019-07-04 19:00  云物互联  阅读(895)  评论(0编辑  收藏  举报

目录

前文列表

OpenStack Nova 高性能虚拟机之 NUMA 架构亲和
OpenStack Nova 高性能虚拟机之 CPU 绑定
OpenStack 高性能虚拟机之大页内存
OpenStack 虚拟机启动流程 UML 分析

numa_fit_instance_to_host

def numa_fit_instance_to_host(
        host_topology, instance_topology, limits=None,
        pci_requests=None, pci_stats=None):
    """Fit the instance topology onto the host topology.

    Given a host, instance topology, and (optional) limits, attempt to
    fit instance cells onto all permutations of host cells by calling
    the _fit_instance_cell method, and return a new InstanceNUMATopology
    with its cell ids set to host cell ids of the first successful
    permutation, or None.

    :param host_topology: objects.NUMATopology object to fit an
                          instance on
    :param instance_topology: objects.InstanceNUMATopology to be fitted
    :param limits: objects.NUMATopologyLimits that defines limits
    :param pci_requests: instance pci_requests
    :param pci_stats: pci_stats for the host

    :returns: objects.InstanceNUMATopology with its cell IDs set to host
              cell ids of the first successful permutation, or None
    """
    if not (host_topology and instance_topology):
        LOG.debug("Require both a host and instance NUMA topology to "
                  "fit instance on host.")
        return
    elif len(host_topology) < len(instance_topology):
        # The instance requests more NUMA cells than the host exposes;
        # no placement can possibly succeed, so fail fast.
        LOG.debug("There are not enough NUMA nodes on the system to schedule "
                  "the instance correctly. Required: %(required)s, actual: "
                  "%(actual)s",
                  {'required': len(instance_topology),
                   'actual': len(host_topology)})
        return

    emulator_threads_policy = None
    if 'emulator_threads_policy' in instance_topology:
        emulator_threads_policy = instance_topology.emulator_threads_policy

    host_cells = host_topology.cells

    # If PCI device(s) are not required, prefer host cells that don't have
    # devices attached. Presence of a given numa_node in a PCI pool is
    # indicative of a PCI device being associated with that node
    if not pci_requests and pci_stats:
        # NOTE(fanguiju): When the host carries SR-IOV NICs but this request
        # asks for no PCI device, sort the PCI-attached NUMA nodes last
        # (the key is a bool and False sorts before True) so those nodes
        # stay free for instances that DO need a device local to them.
        host_cells = sorted(host_cells, key=lambda cell: cell.id in [
            pool['numa_node'] for pool in pci_stats.pools])

    # TODO(ndipanov): We may want to sort permutations differently
    # depending on whether we want packing/spreading over NUMA nodes
    # NOTE(fanguiju): Walk every ordered selection (permutation) of
    # len(instance_topology) host cells and try to pair each one with the
    # instance's cells in order; the first permutation whose cells all fit
    # is the one returned.
    for host_cell_perm in itertools.permutations(
            host_cells, len(instance_topology)):
        cells = []
        # NOTE(fanguiju): Pair each candidate host cell with the instance
        # cell it would back.
        for host_cell, instance_cell in zip(
                host_cell_perm, instance_topology.cells):
            try:
                cpuset_reserved = 0
                if (instance_topology.emulator_threads_isolated
                    and len(cells) == 0):
                    # For the case of isolate emulator threads, to
                    # make predictable where that CPU overhead is
                    # located we always configure it to be on host
                    # NUMA node associated to the guest NUMA node
                    # 0.
                    # NOTE(fanguiju): With the isolated emulator-threads
                    # policy one extra pCPU is set aside for the emulator;
                    # it is not counted in the instance's requested CPUs.
                    cpuset_reserved = 1
                got_cell = _numa_fit_instance_cell(
                    host_cell, instance_cell, limits, cpuset_reserved)
            except exception.MemoryPageSizeNotSupported:
                # This exception will been raised if instance cell's
                # custom pagesize is not supported with host cell in
                # _numa_cell_supports_pagesize_request function.
                break
            if got_cell is None:
                # This host cell cannot back the instance cell; abandon
                # the whole permutation.
                break
            cells.append(got_cell)

        if len(cells) != len(host_cell_perm):
            # The inner loop broke early -- try the next permutation.
            continue

        if not pci_requests or ((pci_stats is not None) and
                pci_stats.support_requests(pci_requests, cells)):
            return objects.InstanceNUMATopology(
                cells=cells,
                emulator_threads_policy=emulator_threads_policy)
    # Falls through and implicitly returns None when no permutation fits.

_numa_fit_instance_cell

def _numa_fit_instance_cell(host_cell, instance_cell, limit_cell=None,
                            cpuset_reserved=0):
    """Try to place a single instance NUMA cell on a host NUMA cell.

    Checks memory and CPU capacity (never letting the instance overcommit
    against itself on one cell), then applies either dedicated-CPU pinning
    or the oversubscription limits, validates any page-size request, and
    on success returns the instance cell stamped with the host cell's id.

    :param host_cell: host cell to fit the instance cell onto
    :param instance_cell: instance cell we want to fit
    :param limit_cell: an objects.NUMATopologyLimit or None
    :param cpuset_reserved: number of host CPUs reserved as overhead

    :returns: objects.InstanceNUMACell with the id set to that of the
              host, or None if the cell does not fit
    """
    LOG.debug('Attempting to fit instance cell %(cell)s on host_cell '
              '%(host_cell)s', {'cell': instance_cell, 'host_cell': host_cell})

    # NOTE (ndipanov): an instance may never overcommit memory against
    # itself on any single NUMA cell.
    if instance_cell.memory > host_cell.memory:
        LOG.debug('Not enough host cell memory to fit instance cell. '
                  'Required: %(required)d, actual: %(actual)d',
                  {'required': instance_cell.memory,
                   'actual': host_cell.memory})
        return

    # Same rule for CPUs, counting the reserved overhead CPUs as well.
    if len(instance_cell.cpuset) + cpuset_reserved > len(host_cell.cpuset):
        LOG.debug('Not enough host cell CPUs to fit instance cell. Required: '
                  '%(required)d + %(cpuset_reserved)d as overhead, '
                  'actual: %(actual)d',
                  {'required': len(instance_cell.cpuset),
                   'actual': len(host_cell.cpuset),
                   'cpuset_reserved': cpuset_reserved})
        return

    if instance_cell.cpu_pinning_requested:
        # Dedicated CPUs: delegate to the pinning algorithm, which either
        # yields a pinned copy of the cell or nothing at all.
        LOG.debug('Pinning has been requested')
        pinned_cell = _numa_fit_instance_cell_with_pinning(
            host_cell, instance_cell, cpuset_reserved)
        if not pinned_cell:
            return
        pinned_cell.pagesize = instance_cell.pagesize
        instance_cell = pinned_cell

    elif limit_cell:
        # Shared CPUs: enforce the allocation-ratio limits instead.
        LOG.debug('No pinning requested, considering limitations on usable cpu'
                  ' and memory')
        projected_memory = host_cell.memory_usage + instance_cell.memory
        projected_cpus = host_cell.cpu_usage + len(instance_cell.cpuset)
        allowed_cpus = len(host_cell.cpuset) * limit_cell.cpu_allocation_ratio
        allowed_memory = host_cell.memory * limit_cell.ram_allocation_ratio
        if projected_memory > allowed_memory:
            LOG.debug('Host cell has limitations on usable memory. There is '
                      'not enough free memory to schedule this instance. '
                      'Usage: %(usage)d, limit: %(limit)d',
                      {'usage': projected_memory, 'limit': allowed_memory})
            return
        if projected_cpus > allowed_cpus:
            LOG.debug('Host cell has limitations on usable CPUs. There are '
                      'not enough free CPUs to schedule this instance. '
                      'Usage: %(usage)d, limit: %(limit)d',
                      {'usage': projected_cpus, 'limit': allowed_cpus})
            return

    selected_pagesize = None
    if instance_cell.pagesize:
        selected_pagesize = _numa_cell_supports_pagesize_request(
            host_cell, instance_cell)
        if not selected_pagesize:
            LOG.debug('Host does not support requested memory pagesize. '
                      'Requested: %d kB', instance_cell.pagesize)
            return
        LOG.debug('Selected memory pagesize: %(selected_mem_pagesize)d kB. '
                  'Requested memory pagesize: %(requested_mem_pagesize)d '
                   '(small = -1, large = -2, any = -3)',
                  {'selected_mem_pagesize': selected_pagesize,
                   'requested_mem_pagesize': instance_cell.pagesize})

    # Stamp the placement onto the (possibly pinned) instance cell.
    instance_cell.id = host_cell.id
    instance_cell.pagesize = selected_pagesize
    return instance_cell

_numa_fit_instance_cell_with_pinning

def _numa_fit_instance_cell_with_pinning(host_cell, instance_cell,
                                         num_cpu_reserved=0):
    """Determine if cells can be pinned to a host cell.

    :param host_cell: objects.NUMACell instance - the host cell that
                      the instance should be pinned to
    :param instance_cell: objects.InstanceNUMACell instance without any
                          pinning information
    :param num_cpu_reserved: int - number of pCPUs reserved for hypervisor

    :returns: objects.InstanceNUMACell instance with pinning information,
              or None if instance cannot be pinned to the given host
    """
    # Pinned instances can never oversubscribe: every requested vCPU plus
    # the hypervisor-reserved overhead needs a real, free pCPU.
    requested_cpus = len(instance_cell.cpuset) + num_cpu_reserved
    if host_cell.avail_cpus < requested_cpus:
        LOG.debug('Not enough available CPUs to schedule instance. '
                  'Oversubscription is not possible with pinned instances. '
                  'Required: %(required)d (%(vcpus)d + %(num_cpu_reserved)d), '
                  'actual: %(actual)d',
                  {'required': requested_cpus,
                   'vcpus': len(instance_cell.cpuset),
                   'actual': host_cell.avail_cpus,
                   'num_cpu_reserved': num_cpu_reserved})
        return

    # Same no-oversubscription rule for memory.
    if host_cell.avail_memory < instance_cell.memory:
        LOG.debug('Not enough available memory to schedule instance. '
                  'Oversubscription is not possible with pinned instances. '
                  'Required: %(required)s, available: %(available)s, '
                  'total: %(total)s. ',
                  {'required': instance_cell.memory,
                   'available': host_cell.avail_memory,
                   'total': host_cell.memory})
        return

    if not host_cell.siblings:
        # No thread-sibling info on this cell, so the 'require' thread
        # policy (which demands hyperthreading) cannot be honoured.
        if (instance_cell.cpu_thread_policy ==
                fields.CPUThreadAllocationPolicy.REQUIRE):
            LOG.info("Host does not support hyperthreading or "
                     "hyperthreading is disabled, but 'require' "
                     "threads policy was requested.")
            return

        # Without hyperthreading every free pCPU stands alone, so model
        # each one as a single-member sibling set and pin directly.
        singleton_sets = [{cpu} for cpu in host_cell.free_cpus]
        pinned_cell = _pack_instance_onto_cores(
            singleton_sets, instance_cell, host_cell.id,
            num_cpu_reserved=num_cpu_reserved)
    else:
        LOG.debug('Using thread siblings for packing')
        # Hyperthreaded host: pack onto cores via their free sibling sets,
        # passing the widest sibling group as threads_per_core.
        pinned_cell = _pack_instance_onto_cores(
            host_cell.free_siblings, instance_cell, host_cell.id,
            max(map(len, host_cell.siblings)),
            num_cpu_reserved=num_cpu_reserved)

    if not pinned_cell:
        LOG.debug('Failed to map instance cell CPUs to host cell CPUs')

    return pinned_cell

_pack_instance_onto_cores

def _pack_instance_onto_cores(available_siblings,
                              instance_cell,
                              host_cell_id,
                              threads_per_core=1,
                              num_cpu_reserved=0):
    """Pack an instance onto a set of siblings.

    Calculate the pinning for the given instance and its topology,
    making sure that hyperthreads of the instance match up with those
    of the host when the pinning takes effect. Also ensure that the
    physical cores reserved for hypervisor on this host NUMA node do
    not break any thread policies.

    Currently the strategy for packing is to prefer siblings and try use
    cores evenly by using emptier cores first. This is achieved by the
    way we order cores in the sibling_sets structure, and the order in
    which we iterate through it.

    The main packing loop that iterates over the sibling_sets dictionary
    will not currently try to look for a fit that maximizes number of
    siblings, but will simply rely on the iteration ordering and picking
    the first viable placement.

    :param available_siblings: list of sets of CPU IDs corresponding to
                               available siblings per core
    :param instance_cell: An instance of objects.InstanceNUMACell
                          describing the pinning requirements of the
                          instance
    :param host_cell_id: id of the host NUMA cell this packing is for;
                         copied onto the returned cell
    :param threads_per_core: number of threads per core in host's cell
    :param num_cpu_reserved: number of pCPUs reserved for hypervisor

    :returns: An instance of objects.InstanceNUMACell containing the
              pinning information, the physical cores reserved and
              potentially a new topology to be exposed to the
              instance. None if there is no valid way to satisfy the
              sibling requirements for the instance.

    """
    LOG.debug('Packing an instance onto a set of siblings: '
             '    available_siblings: %(siblings)s'
             '    instance_cell: %(cells)s'
             '    host_cell_id: %(host_cell_id)s'
             '    threads_per_core: %(threads_per_core)s',
                {'siblings': available_siblings,
                 'cells': instance_cell,
                 'host_cell_id': host_cell_id,
                 'threads_per_core': threads_per_core})

    # We build up a data structure that answers the question: 'Given the
    # number of threads I want to pack, give me a list of all the available
    # sibling sets (or groups thereof) that can accommodate it'
    sibling_sets = collections.defaultdict(list)
    for sib in available_siblings:
        # NOTE(fanguiju): `threads_no` mean that Number of host threads per
        # cores which can be used to pin vCPUs according to the policies
        for threads_no in range(1, len(sib) + 1):
            sibling_sets[threads_no].append(sib)
        # sibling_sets groups cores by usable-thread count: key N lists
        # every core that still has at least N free threads. With fully
        # free two-thread cores, keys 1 and 2 both list all cores, e.g.:
        # defaultdict(<type 'list'>,
        # {1: [CoercedSet([25, 5]), CoercedSet([8, 28]), ...],
        #  2: [CoercedSet([25, 5]), CoercedSet([8, 28]), ...]})
    LOG.debug('Built sibling_sets: %(siblings)s', {'siblings': sibling_sets})

    pinning = None
    threads_no = 1
    def _orphans(instance_cell, threads_per_core):
        """Number of instance CPUs which will not fill up a host core.

        Best explained by an example: consider set of free host cores as such:
            [(0, 1), (3, 5), (6, 7, 8)]
        This would be a case of 2 threads_per_core AKA an entry for 2 in the
        sibling_sets structure.

        If we attempt to pack a 5 core instance on it - due to the fact that we
        iterate the list in order, we will end up with a single core of the
        instance pinned to a thread "alone" (with id 6), and we would have one
        'orphan' vcpu.
        """
        # NOTE(fanguiju): Counts the instance vCPUs left over after filling
        # whole cores, e.g. pinning 3 vCPUs on 2-thread cores:
        # [(0, 1), (2, 3)]
        #   x  x    x
        # leaves 3 % 2 == 1 'orphan' vCPU (the lone one on the second core).
        # len(instance_cell) == len(instance_cell.cpuset)
        return len(instance_cell) % threads_per_core

    def _threads(instance_cell, threads_per_core):
        """Threads to expose to the instance via the VirtCPUTopology.

        This is calculated by taking the GCD of the number of threads we are
        considering at the moment, and the number of orphans. An example for
            instance_cell = 6
            threads_per_core = 4

        So we can fit the instance as such:
            [(0, 1, 2, 3), (4, 5, 6, 7), (8, 9, 10, 11)]
              x  x  x  x    x  x

        We can't expose 4 threads, as that will not be a valid topology (all
        cores exposed to the guest have to have an equal number of threads),
        and 1 would be too restrictive, but we want all threads that guest sees
        to be on the same physical core, so we take GCD of 4 (max number of
        threads) and 2 (number of 'orphan' CPUs) and get 2 as the number of
        threads.
        """
        # NOTE(fanguiju): The GCD of threads_per_core and the orphan count
        # is the per-core thread count of the guest CPU topology: every
        # guest core must expose the same number of threads, so e.g.
        # gcd(6, 4) == 2 means guest cores with two threads each rather
        # than a mix of 6-thread and 4-thread cores.
        # NOTE(review): fractions.gcd was removed in Python 3.9; math.gcd
        # is the modern equivalent -- confirm target interpreter version.
        return fractions.gcd(threads_per_core, _orphans(instance_cell,
                                                        threads_per_core))

    def _get_pinning(threads_no, sibling_set, instance_cores,
                     num_cpu_reserved=0):
        """Determines pCPUs/vCPUs mapping

        Determines the pCPUs/vCPUs mapping regarding the number of
        threads which can be used per cores and pCPUs reserved.

        :param threads_no: Number of host threads per cores which can
                           be used to pin vCPUs according to the
                           policies.
        :param sibling_set: List of available threads per host cores
                            on a specific host NUMA node.
        :param instance_cores: Set of vCPUs requested.
        :param num_cpu_reserved: Number of additional host CPUs which
                                 needs to be reserved.

        NOTE: Depending on how host is configured (HT/non-HT) a thread can
              be considered as an entire core.
        """
        # threads_no * len(sibling_set) is the total number of pinnable
        # units (cores or threads, depending on policy) in this host cell.
        if threads_no * len(sibling_set) < (
                len(instance_cores) + num_cpu_reserved):
            return None, None

        # Determines usable cores according the "threads number"
        # constraint.
        #
        # For a sibling_set=[(0, 1, 2, 3), (4, 5, 6, 7)] and thread_no 1:
        # usable_cores=[(0), (4),]
        #
        # For a sibling_set=[(0, 1, 2, 3), (4, 5, 6, 7)] and thread_no 2:
        # usable_cores=[(0, 1), (4, 5)]
        # NOTE(fanguiju): Take the first threads_no threads of each core,
        # e.g. with threads_no == 1:
        # (Pdb) sibling_set
        # [CoercedSet([25, 5]), CoercedSet([8, 28]), CoercedSet([9, 29]), ...]
        # (Pdb) usable_cores
        # [[25], [8], [9], [24], [27], [2], [3], [26]]
        usable_cores = list(map(lambda s: list(s)[:threads_no], sibling_set))

        # Determines the mapping vCPUs/pCPUs based on the sets of
        # usable cores.
        #
        # For an instance_cores=[2, 3], usable_cores=[(0), (4)]
        # vcpus_pinning=[(2, 0), (3, 4)]
        #             (vCPU, pCPU)
        # NOTE(fanguiju): Zip the sorted vCPU ids against the flattened
        # usable threads, e.g.:
        # (Pdb) instance_cores
        # CoercedSet([0, 1])
        # (Pdb) vcpus_pinning
        # [(0, 25), (1, 8)]
        vcpus_pinning = list(zip(sorted(instance_cores),
                                 itertools.chain(*usable_cores)))
        msg = ("Computed NUMA topology CPU pinning: usable pCPUs: "
               "%(usable_cores)s, vCPUs mapping: %(vcpus_pinning)s")
        msg_args = {
            'usable_cores': usable_cores,
            'vcpus_pinning': vcpus_pinning,
        }
        LOG.info(msg, msg_args)

        cpuset_reserved = None
        if num_cpu_reserved:
            # Updates the pCPUs used based on vCPUs pinned to
            #
            # For vcpus_pinning=[(0, 2), (1, 3)], usable_cores=[(2, 3), (4, 5)]
            # usable_cores=[(), (4, 5)]
            # NOTE(fanguiju): Drop the already-pinned pCPUs from
            # usable_cores before choosing the reserved ones.
            for vcpu, pcpu in vcpus_pinning:
                for sib in usable_cores:
                    if pcpu in sib:
                        sib.remove(pcpu)

            # Determines the pCPUs reserved for hypervisor
            #
            # For usable_cores=[(), (4, 5)], num_cpu_reserved=1
            # cpuset_reserved=[4]
            # NOTE(fanguiju): Reserve the hypervisor pCPUs from what's left.
            cpuset_reserved = set(list(
                itertools.chain(*usable_cores))[:num_cpu_reserved])
            msg = ("Computed NUMA topology reserved pCPUs: usable pCPUs: "
                   "%(usable_cores)s, reserved pCPUs: %(cpuset_reserved)s")
            msg_args = {
                'usable_cores': usable_cores,
                'cpuset_reserved': cpuset_reserved,
            }
            LOG.info(msg, msg_args)

        return vcpus_pinning, cpuset_reserved

    if (instance_cell.cpu_thread_policy ==
            fields.CPUThreadAllocationPolicy.REQUIRE):
        LOG.debug("Requested 'require' thread policy for %d cores",
                  len(instance_cell))
    elif (instance_cell.cpu_thread_policy ==
            fields.CPUThreadAllocationPolicy.PREFER):
        LOG.debug("Requested 'prefer' thread policy for %d cores",
                  len(instance_cell))
    elif (instance_cell.cpu_thread_policy ==
            fields.CPUThreadAllocationPolicy.ISOLATE):
        LOG.debug("Requested 'isolate' thread policy for %d cores",
                  len(instance_cell))
    else:
        LOG.debug("User did not specify a thread policy. Using default "
                  "for %d cores", len(instance_cell))

    # NOTE(fanguiju): The isolate policy pins each vCPU to a whole host
    # core, leaving that core's sibling threads unused.
    if (instance_cell.cpu_thread_policy ==
            fields.CPUThreadAllocationPolicy.ISOLATE):
        # make sure we have at least one fully free core
        if threads_per_core not in sibling_sets:
            LOG.debug('Host does not have any fully free thread sibling sets.'
                      'It is not possible to emulate a non-SMT behavior '
                      'for the isolate policy without this.')
            return
        # NOTE(fanguiju): _get_pinning is the nested helper defined above.
        pinning, cpuset_reserved = _get_pinning(
            # Each vCPU claims a whole core, so only a single thread per
            # core may be handed out.
            1,  # we only want to "use" one thread per core
            sibling_sets[threads_per_core],
            instance_cell.cpuset,
            num_cpu_reserved=num_cpu_reserved)
    else:  # REQUIRE, PREFER (explicit, implicit)
        # NOTE(ndipanov): We iterate over the sibling sets in descending order
        # of cores that can be packed. This is an attempt to evenly distribute
        # instances among physical cores
        # NOTE(fanguiju): Iterate threads_no from large to small so the
        # densest sibling packing is attempted first.
        for threads_no, sibling_set in sorted(
                (t for t in sibling_sets.items()), reverse=True):

            # NOTE(sfinucan): The key difference between the require and
            # prefer policies is that require will not settle for non-siblings
            # if this is all that is available. Enforce this by ensuring we're
            # using sibling sets that contain at least one sibling
            # NOTE(fanguiju): 'require' demands hyperthreading, so the
            # threads_no == 1 bucket is skipped outright.
            if (instance_cell.cpu_thread_policy ==
                    fields.CPUThreadAllocationPolicy.REQUIRE):
                if threads_no <= 1:
                    LOG.debug('Skipping threads_no: %s, as it does not satisfy'
                              ' the require policy', threads_no)
                    continue

            pinning, cpuset_reserved = _get_pinning(
                threads_no, sibling_set,
                instance_cell.cpuset,
                num_cpu_reserved=num_cpu_reserved)
            if pinning:
                break

        # NOTE(sfinucan): If siblings weren't available and we're using PREFER
        # (implicitly or explicitly), fall back to linear assignment across
        # cores
        # NOTE(fanguiju): For 'prefer' (explicit or implicit), if no
        # sibling-aware placement was found above, fall back to pinning
        # vCPUs linearly across whatever threads remain.
        # NOTE(review): sibling_set and threads_no here are the leftover
        # values from the final loop iteration -- the fallback and the
        # _threads() call below intentionally depend on that.
        if (instance_cell.cpu_thread_policy !=
                fields.CPUThreadAllocationPolicy.REQUIRE and
                not pinning):
            pinning = list(zip(sorted(instance_cell.cpuset),
                               itertools.chain(*sibling_set)))

        threads_no = _threads(instance_cell, threads_no)

    if not pinning:
        return
    LOG.debug('Selected cores for pinning: %s, in cell %s', pinning,
                                                            host_cell_id)
    # NOTE(fanguiju): threads_no is now the per-core thread count of the
    # guest CPU topology; len(pinning) pinned vCPUs floor-divided by it
    # gives the guest core count.
    topology = objects.VirtCPUTopology(sockets=1,
                                       cores=len(pinning) // threads_no,
                                       threads=threads_no)
    instance_cell.pin_vcpus(*pinning)
    instance_cell.cpu_topology = topology
    instance_cell.id = host_cell_id
    instance_cell.cpuset_reserved = cpuset_reserved
    return instance_cell

使用示例

在下列使用示例中,我们主要关注不同策略组合下的 CPU pinning 行为模式。基础 Host NUMA Topology 环境如下:

Host NUMA Topology

[root@overcloud-sriovperformancecompute-0 ~]# python cpu_topo.py
======================================================================
Core and Socket Information (as reported by '/sys/devices/system/cpu')
======================================================================

cores =  [0, 1, 2, 3, 4, 8, 9, 10, 11, 12]
sockets =  [0, 1]

        Socket 0        Socket 1
        --------        --------
Core 0  [0, 20]         [10, 30]
Core 1  [1, 21]         [11, 31]
Core 2  [2, 22]         [12, 32]
Core 3  [3, 23]         [13, 33]
Core 4  [4, 24]         [14, 34]
Core 8  [5, 25]         [15, 35]
Core 9  [6, 26]         [16, 36]
Core 10 [7, 27]         [17, 37]
Core 11 [8, 28]         [18, 38]
Core 12 [9, 29]         [19, 39]

SR-IOV PCI NUMA

[root@overcloud-sriovperformancecompute-0 ~]# cat /sys/class/net/enp129s0f0/device/numa_node
1
[root@overcloud-sriovperformancecompute-0 ~]# cat /sys/class/net/enp129s0f1/device/numa_node
1

NUMA 对齐,CPU isolate

openstack flavor create numa_mirror_cpu_isolicy --disk 1 --ram 4096 --vcpus 4 \
    --property hw:numa_nodes=2 \
    --property hw:numa_cpus.0=0,1 \
    --property hw:numa_cpus.1=2,3 \
    --property hw:numa_mem.0=2048 \
    --property hw:numa_mem.1=2048 \
    --property hw:cpu_policy=dedicated \
    --property hw:cpu_thread_policy=isolate

openstack server create --flavor numa_mirror_cpu_isolicy --image cirros --port sriov-port1 VM1

DEBUG

# Instance NUMA Cell 0
(Pdb) threads_no
1
(Pdb) sibling_set
[CoercedSet([25, 5]), CoercedSet([8, 28]), CoercedSet([9, 29]), CoercedSet([24, 4]), CoercedSet([27, 7]), CoercedSet([2, 22]), CoercedSet([3, 23]), CoercedSet([26, 6])]
(Pdb) usable_cores
[[25], [8], [9], [24], [27], [2], [3], [26]]
(Pdb) instance_cores
CoercedSet([0, 1])
(Pdb) vcpus_pinning
[(0, 25), (1, 8)]

# Instance NUMA Cell 1
(Pdb) threads_no
1
(Pdb) sibling_set
[CoercedSet([35, 15]), CoercedSet([16, 36]), CoercedSet([32, 12]), CoercedSet([17, 37]), CoercedSet([18, 38]), CoercedSet([19, 39]), CoercedSet([33, 13]), CoercedSet([34, 14])]
(Pdb) usable_cores
[[35], [16], [32], [17], [18], [19], [33], [34]]
(Pdb) instance_cores
CoercedSet([2, 3])
(Pdb) vcpus_pinning
[(2, 35), (3, 16)]

Hypervisor

| vcpus                       | 32
| vcpus_used                  | 4

DB

{
	"nova_object.version": "1.2",
	"nova_object.changes": ["cells"],
	"nova_object.name": "NUMATopology",
	"nova_object.data": {
		"cells": [{
			"nova_object.version": "1.2",
			"nova_object.changes": ["cpu_usage", "memory_usage", "cpuset", "pinned_cpus", "siblings", "memory", "mempages", "id"],
			"nova_object.name": "NUMACell",
			"nova_object.data": {
				"cpu_usage": 4,
				"memory_usage": 2048,
				"cpuset": [2, 3, 4, 5, 6, 7, 8, 9, 22, 23, 24, 25, 26, 27, 28, 29],
				"pinned_cpus": [8, 25, 28, 5],
				"siblings": [
					[25, 5],
					[8, 28],
					[9, 29],
					[24, 4],
					[27, 7],
					[2, 22],
					[3, 23],
					[26, 6]
				],
				"memory": 130669,
				"mempages": [{
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 30043517,
						"reserved": 0,
						"size_kb": 4
					},
					"nova_object.namespace": "nova"
				}, {
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 13,
						"reserved": 0,
						"size_kb": 1048576
					},
					"nova_object.namespace": "nova"
				}],
				"id": 0
			},
			"nova_object.namespace": "nova"
		}, {
			"nova_object.version": "1.2",
			"nova_object.changes": ["cpu_usage", "memory_usage", "cpuset", "pinned_cpus", "siblings", "memory", "mempages", "id"],
			"nova_object.name": "NUMACell",
			"nova_object.data": {
				"cpu_usage": 4,
				"memory_usage": 2048,
				"cpuset": [12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38, 39],
				"pinned_cpus": [16, 35, 36, 15],
				"siblings": [
					[35, 15],
					[16, 36],
					[32, 12],
					[17, 37],
					[18, 38],
					[19, 39],
					[33, 13],
					[34, 14]
				],
				"memory": 131072,
				"mempages": [{
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 30408704,
						"reserved": 0,
						"size_kb": 4
					},
					"nova_object.namespace": "nova"
				}, {
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 12,
						"reserved": 0,
						"size_kb": 1048576
					},
					"nova_object.namespace": "nova"
				}],
				"id": 1
			},
			"nova_object.namespace": "nova"
		}]
	},
	"nova_object.namespace": "nova"
}

Instance vCPU info

[root@overcloud-sriovperformancecompute-0 ~]# virsh vcpupin instance-000021da
VCPU: CPU Affinity
----------------------------------
   0: 25
   1: 8
   2: 35
   3: 16

小结

  • 虽然从 Hypervisor 资源的显示上看,free_vcpus = 32-4 = 28。但实际上,CPU isolate 策略会让每颗 vCPU 将一个 Core 的 siblings threads 都 pinned 掉,从而保证一个 vCPU 是 pinning 到一个 Core 上面的。所以实际的 free_vcpus = 32-(4*2) = 24。
  • 如果 Instance NUMA Node 数量为 1 的话,那么会优先考虑将 Instance vCPU Pinning 到与 SRIOV PCI 相同的 Host NUMA Node 上,但如果 Instance NUMA Node 的数量大于 1,则会优先实现 Guest NUMA Topology。

NUMA 对齐,CPU require

openstack flavor create numa_mirror_cpu_require --disk 1 --ram 4096 --vcpus 4 \
    --property hw:numa_nodes=2 \
    --property hw:numa_cpus.0=0,1 \
    --property hw:numa_cpus.1=2,3 \
    --property hw:numa_mem.0=2048 \
    --property hw:numa_mem.1=2048 \
    --property hw:cpu_policy=dedicated \
    --property hw:cpu_thread_policy=require

openstack server create --flavor numa_mirror_cpu_require --image cirros --port sriov-port1 VM1

DEBUG

# Instance NUMA Cell 0
(Pdb) threads_no
2
(Pdb) sibling_set
[CoercedSet([25, 5]), CoercedSet([8, 28]), CoercedSet([9, 29]), CoercedSet([24, 4]), CoercedSet([27, 7]), CoercedSet([2, 22]), CoercedSet([3, 23]), CoercedSet([26, 6])]
(Pdb) usable_cores
[[25, 5], [8, 28], [9, 29], [24, 4], [27, 7], [2, 22], [3, 23], [26, 6]]
(Pdb) instance_cores
CoercedSet([0, 1])
(Pdb) vcpus_pinning
[(0, 25), (1, 5)]

# Instance NUMA Cell 1
(Pdb) threads_no
2
(Pdb) sibling_set
[CoercedSet([35, 15]), CoercedSet([16, 36]), CoercedSet([32, 12]), CoercedSet([17, 37]), CoercedSet([18, 38]), CoercedSet([19, 39]), CoercedSet([33, 13]), CoercedSet([34, 14])]
(Pdb) usable_cores
[[35, 15], [16, 36], [32, 12], [17, 37], [18, 38], [19, 39], [33, 13], [34, 14]]
(Pdb) instance_cores
CoercedSet([2, 3])
(Pdb) vcpus_pinning
[(2, 35), (3, 15)]

Hypervisor

| vcpus                       | 32
| vcpus_used                  | 4

DB

{
	"nova_object.version": "1.2",
	"nova_object.changes": ["cells"],
	"nova_object.name": "NUMATopology",
	"nova_object.data": {
		"cells": [{
			"nova_object.version": "1.2",
			"nova_object.changes": ["cpu_usage", "memory_usage", "cpuset", "pinned_cpus", "siblings", "memory", "mempages", "id"],
			"nova_object.name": "NUMACell",
			"nova_object.data": {
				"cpu_usage": 2,
				"memory_usage": 2048,
				"cpuset": [2, 3, 4, 5, 6, 7, 8, 9, 22, 23, 24, 25, 26, 27, 28, 29],
				"pinned_cpus": [25, 5],
				"siblings": [
					[25, 5],
					[8, 28],
					[9, 29],
					[24, 4],
					[27, 7],
					[2, 22],
					[3, 23],
					[26, 6]
				],
				"memory": 130669,
				"mempages": [{
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 30043517,
						"reserved": 0,
						"size_kb": 4
					},
					"nova_object.namespace": "nova"
				}, {
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 13,
						"reserved": 0,
						"size_kb": 1048576
					},
					"nova_object.namespace": "nova"
				}],
				"id": 0
			},
			"nova_object.namespace": "nova"
		}, {
			"nova_object.version": "1.2",
			"nova_object.changes": ["cpu_usage", "memory_usage", "cpuset", "pinned_cpus", "siblings", "memory", "mempages", "id"],
			"nova_object.name": "NUMACell",
			"nova_object.data": {
				"cpu_usage": 2,
				"memory_usage": 2048,
				"cpuset": [12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38, 39],
				"pinned_cpus": [35, 15],
				"siblings": [
					[35, 15],
					[16, 36],
					[32, 12],
					[17, 37],
					[18, 38],
					[19, 39],
					[33, 13],
					[34, 14]
				],
				"memory": 131072,
				"mempages": [{
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 30408704,
						"reserved": 0,
						"size_kb": 4
					},
					"nova_object.namespace": "nova"
				}, {
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 12,
						"reserved": 0,
						"size_kb": 1048576
					},
					"nova_object.namespace": "nova"
				}],
				"id": 1
			},
			"nova_object.namespace": "nova"
		}]
	},
	"nova_object.namespace": "nova"
}

Instance vCPU info

[root@overcloud-sriovperformancecompute-0 ~]# virsh vcpupin instance-000021dd
VCPU: CPU Affinity
----------------------------------
   0: 25
   1: 5
   2: 35
   3: 15

小结:从 Hypervisor 资源的显示上看,free_vcpus = 32-4 = 28。实际上,CPU require 策略会优先将每个 vCPU pinning 到一个 Core 的其中一个 thread 上。所以,实际的 free_vcpus 也是 32-4 = 28。并且该策略会优先考虑将 vCPUs pinning 到同一个 Core 上(CPU 亲和性),例如:当满足条件「CPU44 和 CPU45 属于相同物理核,CPU43 和 CPU46 属于不同物理核」时候,cpu_pinning 的值为 {0:44, 1:45} 只有 2 个 vcpu_pinning,但 Flavor 却需要 4 个 vcpu_pinning,所以创建失败;当满足条件「CPU44 和 CPU45 属于相同物理核,CPU46 和 CPU47 属于相同物理核」时,cpu_pinning 的值为 {0:44, 1:45, 2:46, 3:47} 有 4 个 vcpu_pinning 能够满足 Flavor 规格,创建成功。

NUMA 对齐,CPU prefer

openstack flavor create numa_mirror_cpu_prefer --disk 1 --ram 4096 --vcpus 4 \
    --property hw:numa_nodes=2 \
    --property hw:numa_cpus.0=0,1 \
    --property hw:numa_cpus.1=2,3 \
    --property hw:numa_mem.0=2048 \
    --property hw:numa_mem.1=2048 \
    --property hw:cpu_policy=dedicated \
    --property hw:cpu_thread_policy=prefer

openstack server create --flavor numa_mirror_cpu_prefer --image cirros --port p1 VM1

DEBUG

# Instance NUMA Cell 0
(Pdb) threads_no
2
(Pdb) sibling_set
[CoercedSet([25, 5]), CoercedSet([8, 28]), CoercedSet([9, 29]), CoercedSet([24, 4]), CoercedSet([27, 7]), CoercedSet([2, 22]), CoercedSet([3, 23]), CoercedSet([26, 6])]
(Pdb) usable_cores
[[25, 5], [8, 28], [9, 29], [24, 4], [27, 7], [2, 22], [3, 23], [26, 6]]
(Pdb) instance_cores
CoercedSet([0, 1])
(Pdb) vcpus_pinning
[(0, 25), (1, 5)]

# Instance NUMA Cell 1
(Pdb) threads_no
2
(Pdb) sibling_set
[CoercedSet([35, 15]), CoercedSet([16, 36]), CoercedSet([32, 12]), CoercedSet([17, 37]), CoercedSet([18, 38]), CoercedSet([19, 39]), CoercedSet([33, 13]), CoercedSet([34, 14])]
(Pdb) usable_cores
[[35, 15], [16, 36], [32, 12], [17, 37], [18, 38], [19, 39], [33, 13], [34, 14]]
(Pdb) instance_cores
CoercedSet([2, 3])
(Pdb) vcpus_pinning
[(2, 35), (3, 15)]

小结:PREFER 策略,在开启了超线程的时候会先执行 REQUIRE 策略,在没有开启超线程的时候会先执行 ISOLATE 策略。如果在运行完以上策略逻辑之后依然没有得到预期的 vCPU pinning,那么最坏的结果就是随便 Pinning,找到位置就填充。

NUMA 不对齐,CPU isolate

openstack flavor create numa_unmirror_cpu_isolicy --disk 1 --ram 4096 --vcpus 4 \
    --property hw:numa_nodes=2 \
    --property hw:numa_cpus.0=0 \
    --property hw:numa_cpus.1=1,2,3 \
    --property hw:numa_mem.0=1024 \
    --property hw:numa_mem.1=3072 \
    --property hw:cpu_policy=dedicated \
    --property hw:cpu_thread_policy=isolate

openstack server create --flavor numa_unmirror_cpu_isolicy --image cirros --port p1 VM1

DEBUG

# Instance NUMA Cell 0
(Pdb) threads_no
1
(Pdb) sibling_set
[CoercedSet([25, 5]), CoercedSet([8, 28]), CoercedSet([9, 29]), CoercedSet([24, 4]), CoercedSet([27, 7]), CoercedSet([2, 22]), CoercedSet([3, 23]), CoercedSet([26, 6])]
(Pdb) usable_cores
[[25], [8], [9], [24], [27], [2], [3], [26]]
(Pdb) instance_cores
CoercedSet([0])
(Pdb) vcpus_pinning
[(0, 25)]

# Instance NUMA Cell 1
(Pdb) threads_no
1
(Pdb) sibling_set
[CoercedSet([35, 15]), CoercedSet([16, 36]), CoercedSet([32, 12]), CoercedSet([17, 37]), CoercedSet([18, 38]), CoercedSet([19, 39]), CoercedSet([33, 13]), CoercedSet([34, 14])]
(Pdb) usable_cores
[[35], [16], [32], [17], [18], [19], [33], [34]]
(Pdb) instance_cores
CoercedSet([1, 2, 3])
(Pdb) vcpus_pinning
[(1, 35), (2, 16), (3, 32)]

DB

{
	"nova_object.version": "1.2",
	"nova_object.changes": ["cells"],
	"nova_object.name": "NUMATopology",
	"nova_object.data": {
		"cells": [{
			"nova_object.version": "1.2",
			"nova_object.changes": ["cpu_usage", "memory_usage", "cpuset", "pinned_cpus", "siblings", "memory", "mempages", "id"],
			"nova_object.name": "NUMACell",
			"nova_object.data": {
				"cpu_usage": 2,
				"memory_usage": 1024,
				"cpuset": [2, 3, 4, 5, 6, 7, 8, 9, 22, 23, 24, 25, 26, 27, 28, 29],
				"pinned_cpus": [25, 5],
				"siblings": [
					[25, 5],
					[8, 28],
					[9, 29],
					[24, 4],
					[27, 7],
					[2, 22],
					[3, 23],
					[26, 6]
				],
				"memory": 130669,
				"mempages": [{
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 30043517,
						"reserved": 0,
						"size_kb": 4
					},
					"nova_object.namespace": "nova"
				}, {
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 13,
						"reserved": 0,
						"size_kb": 1048576
					},
					"nova_object.namespace": "nova"
				}],
				"id": 0
			},
			"nova_object.namespace": "nova"
		}, {
			"nova_object.version": "1.2",
			"nova_object.changes": ["cpu_usage", "memory_usage", "cpuset", "pinned_cpus", "siblings", "memory", "mempages", "id"],
			"nova_object.name": "NUMACell",
			"nova_object.data": {
				"cpu_usage": 6,
				"memory_usage": 3072,
				"cpuset": [12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38, 39],
				"pinned_cpus": [32, 16, 35, 36, 12, 15],
				"siblings": [
					[35, 15],
					[16, 36],
					[32, 12],
					[17, 37],
					[18, 38],
					[19, 39],
					[33, 13],
					[34, 14]
				],
				"memory": 131072,
				"mempages": [{
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 30408704,
						"reserved": 0,
						"size_kb": 4
					},
					"nova_object.namespace": "nova"
				}, {
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 12,
						"reserved": 0,
						"size_kb": 1048576
					},
					"nova_object.namespace": "nova"
				}],
				"id": 1
			},
			"nova_object.namespace": "nova"
		}]
	},
	"nova_object.namespace": "nova"
}

Instance vCPU info

[root@overcloud-sriovperformancecompute-0 ~]# virsh vcpupin instance-000021e3
VCPU: CPU Affinity
----------------------------------
   0: 25
   1: 35
   2: 16
   3: 32

小结:可以看出 NUMA 是否对齐对于 CPU ISOLATE 绑定策略而言没有什么影响。

NUMA 不对齐 CPU require

openstack flavor create numa_unmirror_cpu_require --disk 1 --ram 4096 --vcpus 4 \
    --property hw:numa_nodes=2 \
    --property hw:numa_cpus.0=0 \
    --property hw:numa_cpus.1=1,2,3 \
    --property hw:numa_mem.0=1024 \
    --property hw:numa_mem.1=3072 \
    --property hw:cpu_policy=dedicated \
    --property hw:cpu_thread_policy=require

openstack server create --flavor numa_unmirror_cpu_require --image cirros --port p1 VM1

DEBUG

# Instance NUMA Cell 0
(Pdb) threads_no
2
(Pdb) sibling_set
[CoercedSet([25, 5]), CoercedSet([8, 28]), CoercedSet([9, 29]), CoercedSet([24, 4]), CoercedSet([27, 7]), CoercedSet([2, 22]), CoercedSet([3, 23]), CoercedSet([26, 6])]
(Pdb) usable_cores
[[25, 5], [8, 28], [9, 29], [24, 4], [27, 7], [2, 22], [3, 23], [26, 6]]
(Pdb) instance_cores
CoercedSet([0])
(Pdb) vcpus_pinning
[(0, 25)]

# Instance NUMA Cell 1
(Pdb) sibling_set
[CoercedSet([35, 15]), CoercedSet([16, 36]), CoercedSet([32, 12]), CoercedSet([17, 37]), CoercedSet([18, 38]), CoercedSet([19, 39]), CoercedSet([33, 13]), CoercedSet([34, 14])]
(Pdb) usable_cores
[[35, 15], [16, 36], [32, 12], [17, 37], [18, 38], [19, 39], [33, 13], [34, 14]]
(Pdb) instance_cores
CoercedSet([1, 2, 3])
(Pdb) vcpus_pinning
[(1, 35), (2, 15), (3, 16)]

Instance vCPU info

[root@overcloud-sriovperformancecompute-0 ~]# virsh vcpupin instance-000021e9
VCPU: CPU Affinity
----------------------------------
   0: 25
   1: 35
   2: 15
   3: 16

小结:在 require 策略下,且存在单数个 vCPU 绑定的时候,可能最终会出现 pCPU 孤儿的情况,此时该计算节点就很可能对下列这种情况创建失败。例如:4 个逻辑核属于同一 NUMA,其中 CPU1 和 CPU2 属于相同物理核,CPU3 和 CPU4 属于不同的物理核,若此时创建一个 Flavor vCPU 为 4 的云主机会创建失败,因为 siblings 只有 [set([1, 2])]。在 Nova Scheduler NUMA filter 上会被过滤掉,但对于单个 vCPU 绑定的需求依旧是可以成功的。

Filter NUMATopologyFilter returned 0 hosts

NUMA 不对齐,CPU prefer

openstack flavor create numa_unmirror_cpu_prefer --disk 1 --ram 4096 --vcpus 4 \
    --property hw:numa_nodes=2 \
    --property hw:numa_cpus.0=0 \
    --property hw:numa_cpus.1=1,2,3 \
    --property hw:numa_mem.0=1024 \
    --property hw:numa_mem.1=3072 \
    --property hw:cpu_policy=dedicated \
    --property hw:cpu_thread_policy=prefer

openstack server create --flavor numa_unmirror_cpu_prefer --image cirros --port p1 VM1

DEBUG

# Instance NUMA Cell 0
(Pdb) threads_no
2
(Pdb) sibling_set
[CoercedSet([25, 5]), CoercedSet([8, 28]), CoercedSet([9, 29]), CoercedSet([24, 4]), CoercedSet([27, 7]), CoercedSet([2, 22]), CoercedSet([3, 23]), CoercedSet([26, 6])]
(Pdb) usable_cores
[[25, 5], [8, 28], [9, 29], [24, 4], [27, 7], [2, 22], [3, 23], [26, 6]]
(Pdb) instance_cores
CoercedSet([0])
(Pdb) vcpus_pinning
[(0, 25)]

# Instance NUMA Cell 1
(Pdb) threads_no
2
(Pdb) sibling_set
[CoercedSet([35, 15]), CoercedSet([16, 36]), CoercedSet([32, 12]), CoercedSet([17, 37]), CoercedSet([18, 38]), CoercedSet([19, 39]), CoercedSet([33, 13]), CoercedSet([34, 14])]
(Pdb) usable_cores
[[35, 15], [16, 36], [32, 12], [17, 37], [18, 38], [19, 39], [33, 13], [34, 14]]
(Pdb) instance_cores
CoercedSet([1, 2, 3])
(Pdb) vcpus_pinning
[(1, 35), (2, 15), (3, 16)]

DB

{
	"nova_object.version": "1.2",
	"nova_object.changes": ["cells"],
	"nova_object.name": "NUMATopology",
	"nova_object.data": {
		"cells": [{
			"nova_object.version": "1.2",
			"nova_object.changes": ["cpu_usage", "memory_usage", "cpuset", "pinned_cpus", "siblings", "memory", "mempages", "id"],
			"nova_object.name": "NUMACell",
			"nova_object.data": {
				"cpu_usage": 1,
				"memory_usage": 1024,
				"cpuset": [2, 3, 4, 5, 6, 7, 8, 9, 22, 23, 24, 25, 26, 27, 28, 29],
				"pinned_cpus": [25],
				"siblings": [
					[25, 5],
					[8, 28],
					[9, 29],
					[24, 4],
					[27, 7],
					[2, 22],
					[3, 23],
					[26, 6]
				],
				"memory": 130669,
				"mempages": [{
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 30043517,
						"reserved": 0,
						"size_kb": 4
					},
					"nova_object.namespace": "nova"
				}, {
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 13,
						"reserved": 0,
						"size_kb": 1048576
					},
					"nova_object.namespace": "nova"
				}],
				"id": 0
			},
			"nova_object.namespace": "nova"
		}, {
			"nova_object.version": "1.2",
			"nova_object.changes": ["cpu_usage", "memory_usage", "cpuset", "pinned_cpus", "siblings", "memory", "mempages", "id"],
			"nova_object.name": "NUMACell",
			"nova_object.data": {
				"cpu_usage": 3,
				"memory_usage": 3072,
				"cpuset": [12, 13, 14, 15, 16, 17, 18, 19, 32, 33, 34, 35, 36, 37, 38, 39],
				"pinned_cpus": [16, 35, 15],
				"siblings": [
					[35, 15],
					[16, 36],
					[32, 12],
					[17, 37],
					[18, 38],
					[19, 39],
					[33, 13],
					[34, 14]
				],
				"memory": 131072,
				"mempages": [{
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 30408704,
						"reserved": 0,
						"size_kb": 4
					},
					"nova_object.namespace": "nova"
				}, {
					"nova_object.version": "1.1",
					"nova_object.changes": ["used", "total", "reserved", "size_kb"],
					"nova_object.name": "NUMAPagesTopology",
					"nova_object.data": {
						"used": 0,
						"total": 12,
						"reserved": 0,
						"size_kb": 1048576
					},
					"nova_object.namespace": "nova"
				}],
				"id": 1
			},
			"nova_object.namespace": "nova"
		}]
	},
	"nova_object.namespace": "nova"
}

Instance vCPU info

[root@overcloud-sriovperformancecompute-0 ~]# virsh vcpupin instance-000021e6
VCPU: CPU Affinity
----------------------------------
   0: 25
   1: 35
   2: 15
   3: 16

小结:在开启了超线程的情况下,prefer 策略和 require 策略基本一样。但最主要的区别在于,最坏的情况下,prefer 是可以牺牲掉 CPU 亲和性的。即:4 个逻辑核属于同一 NUMA,其中 CPU1 和 CPU2 属于相同物理核,CPU3 和 CPU4 属于不同的物理核,若此时创建一个 Flavor vCPU 为 4 的云主机会创建成功,因为剩余的 Thread 数量能够满足 Flavor 的需求,只是不再保证 CPU 亲和性而已。e.g.

available_siblings: [CoercedSet([]), CoercedSet([]), CoercedSet([]), CoercedSet([]), CoercedSet([]), CoercedSet([22]), CoercedSet([23]), CoercedSet([26, 6])]
Requested 'prefer' thread policy for 4 cores
Selected cores for pinning: [(0, 22), (1, 23), (2, 26), (3, 6)], in cell 0

主要是因为这几行代码:

# /opt/stack/queens/nova/nova/virt/hardware.py

        # NOTE(sfinucan): If siblings weren't available and we're using PREFER
        # (implicitly or explicitly), fall back to linear assignment across
        # cores
        if (instance_cell.cpu_thread_policy !=
                fields.CPUThreadAllocationPolicy.REQUIRE and
                not pinning):
            pinning = list(zip(sorted(instance_cell.cpuset),
                               itertools.chain(*sibling_set)))