kubernetes集群numa拓扑感知调度方案调研

node-feature-discovery
noderesourcetopology-scheduler
参考

node-feature-discovery

kubernetes-sigs/node-feature-discovery: Node feature discovery for Kubernetes (github.com)

目标：将每个节点的详细信息暴露出来。默认安装chart不包含numa信息，需要修改chart中的部分配置。

并且集群版本低于1.23时，需要修改kubelet配置，开启featureGate：KubeletPodResourcesGetAllocatable

helm repo add nfd https://kubernetes-sigs.github.io/node-feature-discovery/charts
helm repo update

# 开启topology updater
topologyUpdater:
  createCRDs: true
  enable: true
  rbac:
    create: true
  serviceAccount:
    create: true

# Install NFD with the topology updater switched on. The --set keys mirror the
# topologyUpdater values listed above; without them the chart defaults apply.
# NFD_NS must already hold the target namespace.
helm install nfd/node-feature-discovery --namespace "$NFD_NS" --create-namespace --generate-name \
  --set topologyUpdater.enable=true \
  --set topologyUpdater.createCRDs=true \
  --set topologyUpdater.rbac.create=true \
  --set topologyUpdater.serviceAccount.create=true

确定nfd命名空间内的pod都正常启动后，验证各个节点的NodeResourceTopology被创建

apiVersion: topology.node.k8s.io/v1alpha1
kind: NodeResourceTopology
metadata:
  name: node10
topologyPolicies:
  - None
zones:
  - costs:
      - name: node-0
        value: 10
    name: node-0
    resources:
      - allocatable: '0'
        available: '0'
        capacity: '3'
        name: cpu
    type: Node

noderesourcetopology-scheduler

out of tree scheduler plugin：scheduler-plugins/README.md at master · kubernetes-sigs/scheduler-plugins (github.com)

分支：release-1.21

问题

仍处于早期开发阶段，master分支和release-1.21分支差别巨大，后期可能会有大量变动
依赖nfd提供的NodeResourceTopology作为node拓扑信息来源，但是新的NodeResourceTopology改成了non namespaced资源，需要注意两个组件的版本搭配和使用方式
release-1.21中只实现了SingleNUMANode，kubelet topology manager中的另外三种策略并未实现

filter扩展点

QOS：有request是必须的，否则不处理

 if v1qos.GetPodQOS(pod) == v1.PodQOSBestEffort {
     return nil
 }

获取node拓扑（拿到的是上面CR里的zones）

nodeTopology := findNodeTopology(nodeName, &tm.nodeResTopologyPlugin)

// 根据配置文件中的命名空间，从中找nfd的CR，读取numa拓扑信息
func findNodeTopology(nodeName string, nodeResTopoPlugin *nodeResTopologyPlugin) *topologyv1alpha1.NodeResourceTopology {
 klog.V(5).InfoS("Namespaces for nodeResTopoPlugin", "namespaces", nodeResTopoPlugin.namespaces)
 for _, namespace := range nodeResTopoPlugin.namespaces {
     klog.V(5).InfoS("Lister for nodeResTopoPlugin", "lister", nodeResTopoPlugin.lister)
     // NodeTopology couldn't be placed in several namespaces simultaneously
     lister := nodeResTopoPlugin.lister
     nodeTopology, err := (*lister).NodeResourceTopologies(namespace).Get(nodeName)
     if err != nil {
         klog.V(5).ErrorS(err, "Cannot get NodeTopologies from NodeResourceTopologyNamespaceLister")
         continue
     }
     if nodeTopology != nil {
         return nodeTopology
     }
 }
 return nil

遍历过滤策略，只要有任何一个策略的处理函数返回Unschedulable（即资源无法在单个numa node上对齐），node就被过滤掉

SingleNUMANodeContainerLevel：遍历所有initcontainer，container，依次判断是否有合适的numa node可以调度（要求每一个容器都必须满足下面的需求匹配函数）

// singleNUMAContainerLevelHandler checks, container by container, whether the
// requests of every init container and regular container of pod can be
// aligned on the NUMA nodes described by zones.
//
// It returns nil when every container passes, and an Unschedulable status
// naming the first container for which resMatchNUMANodes reports a failure.
func singleNUMAContainerLevelHandler(pod *v1.Pod, zones topologyv1alpha1.ZoneList, nodeInfo *framework.NodeInfo) *framework.Status {
	klog.V(5).InfoS("Single NUMA node handler")

	// prepare NUMANodes list from zoneMap
	nodes := createNUMANodeList(zones)
	qos := v1qos.GetPodQOS(pod)

	// We count here in the way TopologyManager is doing it, IOW we put InitContainers
	// and normal containers in the one scope.
	//
	// The two slices are walked one after the other instead of being joined with
	// append(pod.Spec.InitContainers, ...): append may write into the spare capacity
	// of InitContainers' backing array, which mutates memory owned by the pod spec
	// and is a data race if this handler ever runs concurrently for the same pod.
	for _, containers := range [][]v1.Container{pod.Spec.InitContainers, pod.Spec.Containers} {
		for i := range containers {
			container := &containers[i]
			if resMatchNUMANodes(nodes, container.Resources.Requests, qos, nodeInfo) {
				// definitely we can't align container, so we can't align a pod
				return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("cannot align container: %s", container.Name))
			}
		}
	}
	return nil
}

SingleNUMANodePodLevel：把pod里的所有资源加总，以总资源判断一次是否有合适的numa node可以调度

// singleNUMAPodLevelHandler sums the requests of all init containers and
// regular containers of pod per resource name and checks once whether that
// total can be aligned on the NUMA nodes described by zones.
//
// It returns nil when the total passes, and an Unschedulable status naming
// the pod when resMatchNUMANodes reports a failure.
func singleNUMAPodLevelHandler(pod *v1.Pod, zones topologyv1alpha1.ZoneList, nodeInfo *framework.NodeInfo) *framework.Status {
	klog.V(5).InfoS("Pod Level Resource handler")
	resources := make(v1.ResourceList)

	// We count here in the way TopologyManager is doing it, IOW we put InitContainers
	// and normal containers in the one scope.
	//
	// The two slices are walked one after the other instead of being joined with
	// append(pod.Spec.InitContainers, ...): append may write into the spare capacity
	// of InitContainers' backing array, which mutates memory owned by the pod spec
	// and is a data race if this handler ever runs concurrently for the same pod.
	for _, containers := range [][]v1.Container{pod.Spec.InitContainers, pod.Spec.Containers} {
		for i := range containers {
			for resource, quantity := range containers[i].Resources.Requests {
				// quantity is a per-iteration copy; add the running total to it
				// and store the sum back.
				if q, ok := resources[resource]; ok {
					quantity.Add(q)
				}
				resources[resource] = quantity
			}
		}
	}

	if resMatchNUMANodes(createNUMANodeList(zones), resources, v1qos.GetPodQOS(pod), nodeInfo) {
		// definitely we can't align container, so we can't align a pod
		return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("cannot align pod: %s", pod.Name))
	}
	return nil
}

核心逻辑，如何判断numa nodes和资源需求的匹配度：

将第2步（获取node拓扑）拿到的zones转成numa nodes
两层循环遍历numa node和resource，判断该resource在该numa node上是否满足需求
如果有一种资源所有的numa node都不满足（累计bitmask为空），则快速结束并返回true，代表node被过滤掉，不再继续
最后所有的资源和numa node遍历结束后，返回false，代表node未被过滤掉

// resMatchNUMANodes reports whether the requested resources can NOT be
// aligned on any single NUMA node: it returns true when, after intersecting
// the per-resource candidate sets, no NUMA node is left.
// It requires a NUMANodeList with properly populated NUMANode entries;
// NUMAID should be in range 0-63.
func resMatchNUMANodes(numaNodes NUMANodeList, resources v1.ResourceList, qos v1.PodQOSClass, nodeInfo *framework.NodeInfo) bool {
	// Start with every bit set; each bit stands for one NUMA node and is
	// cleared as soon as some resource cannot be aligned on that node.
	aligned := bm.NewEmptyBitMask()
	aligned.Fill()

	zero := resource.MustParse("0")
	for name, requested := range resources {
		// Collect the NUMA nodes that fit this one resource, then AND the
		// set into the aggregate. Bits outside the NUMA id range are never
		// added here, so they drop out of the aggregate as well.
		candidates := bm.NewEmptyBitMask()
		for _, numa := range numaNodes {
			available, onNUMA := numa.Resources[name]
			// A resource missing from the NUMA node may still exist on the
			// node itself (not NUMA aligned, or not reported by the topology
			// exporter), and a zero-quantity request is harmless. Only when
			// none of these hold is the NUMA node skipped for this resource.
			if onNUMA || resourceFoundOnNode(name, requested, nodeInfo) || requested.Cmp(zero) == 0 {
				// The NUMA node is a candidate when any of these holds:
				// 1. the resource is memory or hugepages
				// 2. the resource is cpu and the pod is not Guaranteed QoS
				// 3. a zero quantity was requested
				// 4. the NUMA node reports at least the requested amount
				switch {
				case name == v1.ResourceMemory,
					strings.HasPrefix(string(name), v1.ResourceHugePagesPrefix),
					name == v1.ResourceCPU && qos != v1.PodQOSGuaranteed,
					requested.Cmp(zero) == 0,
					available.Cmp(requested) >= 0:
					// possible to align resources on NUMA node
					candidates.Add(numa.NUMAID)
				}
			}
		}

		aligned.And(candidates)
		if aligned.IsEmpty() {
			// No NUMA node fits everything seen so far; stop early.
			return true
		}
	}
	return aligned.IsEmpty()
}

完成了filter扩展点后，剩余的都是能够满足负载的资源需求跑在单numa node上的节点。

score扩展点

获取node拓扑（和上面filter扩展点一样）

根据pod/container范围，选择相应的打分器

pod范围：汇总所有容器的资源request，将zones转换成numa nodes，调用核心打分函数

// podScopeScore scores a node at pod scope: the requests of all init
// containers and regular containers of pod are summed per resource name, and
// the total is passed to scoreForEachNUMANode together with the NUMA node list
// built from zones. The returned status is always nil.
func podScopeScore(pod *v1.Pod, zones topologyv1alpha1.ZoneList, scorerFn scoreStrategy, resourceToWeightMap resourceToWeightMap) (int64, *framework.Status) {
	// This code is in Admit implementation of pod scope
	// https://github.com/kubernetes/kubernetes/blob/9ff3b7e744b34c099c1405d9add192adbef0b6b1/pkg/kubelet/cm/topologymanager/scope_pod.go#L52
	// but it works with HintProviders, takes into account all possible allocations.
	resources := make(v1.ResourceList)

	// Walk the two container slices one after the other instead of joining them
	// with append(pod.Spec.InitContainers, ...): append may write into the spare
	// capacity of InitContainers' backing array, which mutates memory owned by the
	// pod spec and is a data race if scoring ever runs concurrently for the same pod.
	for _, containers := range [][]v1.Container{pod.Spec.InitContainers, pod.Spec.Containers} {
		for i := range containers {
			for resource, quantity := range containers[i].Resources.Requests {
				// quantity is a per-iteration copy; add the running total to it
				// and store the sum back.
				if quan, ok := resources[resource]; ok {
					quantity.Add(quan)
				}
				resources[resource] = quantity
			}
		}
	}

	allocatablePerNUMA := createNUMANodeList(zones)
	return scoreForEachNUMANode(resources, allocatablePerNUMA, scorerFn, resourceToWeightMap), nil
}

container范围：对每个容器的request调用核心打分函数，然后求平均

// containerScopeScore scores a node at container scope: every init container
// and regular container of pod is scored on its own with scoreForEachNUMANode,
// and the arithmetic mean of those scores, truncated to int64, is returned.
// A pod without any container scores 0. The returned status is always nil.
func containerScopeScore(pod *v1.Pod, zones topologyv1alpha1.ZoneList, scorerFn scoreStrategy, resourceToWeightMap resourceToWeightMap) (int64, *framework.Status) {
	// This code is in Admit implementation of container scope
	// https://github.com/kubernetes/kubernetes/blob/9ff3b7e744b34c099c1405d9add192adbef0b6b1/pkg/kubelet/cm/topologymanager/scope_container.go#L52
	initContainers, appContainers := pod.Spec.InitContainers, pod.Spec.Containers
	contScore := make([]float64, 0, len(initContainers)+len(appContainers))
	allocatablePerNUMA := createNUMANodeList(zones)

	// Walk the two container slices one after the other instead of joining them
	// with append(pod.Spec.InitContainers, ...): append may write into the spare
	// capacity of InitContainers' backing array, which mutates memory owned by the
	// pod spec and is a data race if scoring ever runs concurrently for the same pod.
	for _, containers := range [][]v1.Container{initContainers, appContainers} {
		for i := range containers {
			score := scoreForEachNUMANode(containers[i].Resources.Requests, allocatablePerNUMA, scorerFn, resourceToWeightMap)
			contScore = append(contScore, float64(score))
		}
	}

	// The mean of zero samples is NaN, which has no meaningful int64 value.
	if len(contScore) == 0 {
		return 0, nil
	}
	return int64(stat.Mean(contScore, nil)), nil
}

核心逻辑：打分函数（per numa node）

第一层循环：遍历节点上的每个numa，计算所有request在该numa上的分数，取非零分数中的最小值（分数为0表示该numa完全放不下，kubelet不会考虑它，所以跳过）。（为什么取最小值？为了规避某些特殊情况。相当于选节点的逻辑是从最坏的情况中选最好的。）

// scoreForEachNUMANode applies the score strategy to every NUMA node in
// numaList and returns the smallest non-zero score. It returns 0 when
// numaList is empty or every NUMA node scores 0.
//
// The per-NUMA scores are collected only for the verbosity-5 log line.
func scoreForEachNUMANode(requested v1.ResourceList, numaList NUMANodeList, score scoreStrategy, resourceToWeightMap resourceToWeightMap) int64 {
	// Keyed by NUMA id rather than stored in a slice of len(numaList): the ids
	// are not guaranteed to be the dense range 0..len-1, and indexing a slice
	// by NUMAID would panic for a list such as {1} or {0, 2}.
	numaScores := make(map[int]int64, len(numaList))
	minScore := int64(0)

	for _, numa := range numaList {
		numaScore := score(requested, numa.Resources, resourceToWeightMap)
		// if NUMA's score is 0, i.e. not fit at all, it won't be take under consideration by Kubelet.
		if (minScore == 0) || (numaScore != 0 && numaScore < minScore) {
			minScore = numaScore
		}
		numaScores[numa.NUMAID] = numaScore
	}

	klog.V(5).InfoS("Score for NUMA nodes", "numaScores", numaScores, "nodeScore", minScore)
	return minScore
}

接下来就是更加核心的，对于[]request和一个numa，如何评分。插件提供了三种方案，可在配置文件中的scoringStrategy字段配置。对于多种资源，会迭代资源类型，单独计算分数后加权平均。

MostAllocated：占可分配资源比例越高，分数越高

// mostAllocatedScoreStrategy returns the weighted average, over every
// requested resource, of mostAllocatedScore for that resource on one NUMA
// zone. Weights come from resourceToWeightMap.
//
// It returns 0 when nothing is requested or all weights sum to 0.
func mostAllocatedScoreStrategy(requested, allocatable v1.ResourceList, resourceToWeightMap resourceToWeightMap) int64 {
	var numaNodeScore int64 = 0
	var weightSum int64 = 0

	for resourceName := range requested {
		// We don't care what kind of resources are being requested, we just iterate all of them.
		// If NUMA zone doesn't have the requested resource, the score for that resource will be 0.
		resourceScore := mostAllocatedScore(requested[resourceName], allocatable[resourceName])
		weight := resourceToWeightMap.weight(resourceName)
		numaNodeScore += resourceScore * weight
		weightSum += weight
	}

	// An empty request list (e.g. a container without requests) leaves
	// weightSum at 0; integer division by zero would panic.
	if weightSum == 0 {
		return 0
	}
	return numaNodeScore / weightSum
}

// mostAllocatedScore scores a single resource: the requested share of the
// NUMA capacity scaled to framework.MaxNodeScore, so a larger share scores
// higher. A zero capacity or a request larger than the capacity scores 0.
func mostAllocatedScore(requested, numaCapacity resource.Quantity) int64 {
	noCapacity := numaCapacity.CmpInt64(0) == 0
	if noCapacity || requested.Cmp(numaCapacity) > 0 {
		return 0
	}

	scaled := requested.Value() * framework.MaxNodeScore
	return scaled / numaCapacity.Value()
}

LeastAllocated：即1-MostAllocated，和MostAllocated正好相反，占可分配资源比例越低，分数越高

// leastAllocatedScoreStrategy returns the weighted average, over every
// requested resource, of leastAllocatedScore for that resource on one NUMA
// zone. Weights come from resourceToWeightMap.
//
// It returns 0 when nothing is requested or all weights sum to 0.
func leastAllocatedScoreStrategy(requested, allocatable v1.ResourceList, resourceToWeightMap resourceToWeightMap) int64 {
	var numaNodeScore int64 = 0
	var weightSum int64 = 0

	for resourceName := range requested {
		// We don't care what kind of resources are being requested, we just iterate all of them.
		// If NUMA zone doesn't have the requested resource, the score for that resource will be 0.
		resourceScore := leastAllocatedScore(requested[resourceName], allocatable[resourceName])
		weight := resourceToWeightMap.weight(resourceName)
		numaNodeScore += resourceScore * weight
		weightSum += weight
	}

	// An empty request list (e.g. a container without requests) leaves
	// weightSum at 0; integer division by zero would panic.
	if weightSum == 0 {
		return 0
	}
	return numaNodeScore / weightSum
}

// leastAllocatedScore scores a single resource: the share of the NUMA
// capacity that would remain after the request, scaled to
// framework.MaxNodeScore, so a smaller request scores higher. A zero
// capacity or a request larger than the capacity scores 0.
func leastAllocatedScore(requested, numaCapacity resource.Quantity) int64 {
	noCapacity := numaCapacity.CmpInt64(0) == 0
	if noCapacity || requested.Cmp(numaCapacity) > 0 {
		return 0
	}

	capacity := numaCapacity.Value()
	remaining := capacity - requested.Value()
	return remaining * framework.MaxNodeScore / capacity
}

BalancedAllocation：和前两种不太一样。迭代每种resource，计算request在该numa上的占比，最后计算方差。最后返回的分值是1-方差，即方差越小分值越大，即本次调度的request在numa内可用资源上的占比越均匀，评分越高。（避免碎片？）

// balancedAllocationScoreStrategy scores one NUMA zone by how evenly the
// request uses its resources: it computes, per requested resource, the
// fraction of the zone's capacity being requested and returns
// (1 - variance of those fractions) scaled to framework.MaxNodeScore.
//
// It returns 0 as soon as any fraction exceeds 1. With fewer than two
// requested resources the variance is taken as 0, which yields the maximum
// score. resourceToWeightMap is not used by this strategy.
func balancedAllocationScoreStrategy(requested, allocatable v1.ResourceList, resourceToWeightMap resourceToWeightMap) int64 {
	resourceFractions := make([]float64, 0, len(requested))

	// We don't care what kind of resources are being requested, we just iterate all of them.
	for resourceName := range requested {
		resourceFraction := fractionOfCapacity(requested[resourceName], allocatable[resourceName])
		// if requested > capacity the corresponding NUMA zone should never be preferred
		if resourceFraction > 1 {
			return 0
		}
		resourceFractions = append(resourceFractions, resourceFraction)
	}

	// The unbiased sample variance divides by (n - 1); for zero or one sample
	// that is NaN, and converting NaN to int64 gives no meaningful score.
	// Fewer than two fractions cannot be unbalanced, so use a variance of 0.
	variance := 0.0
	if len(resourceFractions) >= 2 {
		variance = stat.Variance(resourceFractions, nil)
	}

	// Since the variance is between positive fractions, it will be positive fraction. 1-variance lets the
	// score to be higher for node which has least variance and multiplying it with `MaxNodeScore` provides the scaling
	// factor needed.
	return int64((1 - variance) * float64(framework.MaxNodeScore))
}

// fractionOfCapacity returns requested divided by capacity, both taken as
// integer values. A zero capacity yields 1.
func fractionOfCapacity(requested, capacity resource.Quantity) float64 {
	total := capacity.Value()
	if total == 0 {
		return 1
	}
	wanted := requested.Value()
	return float64(wanted) / float64(total)
}

参考

k8s资源拓扑感知——资源分配 | Edwardesire

Kubernetes搭配NUMA帶你飛. 對k8s提供的TopologyManager… | by 身為DevOps工程師 | Gemini Open Cloud 雙子星雲端 | Medium

文章目录

kubernetes集群numa拓扑感知调度方案调研

node-feature-discovery

noderesourcetopology-scheduler

问题

filter扩展点

score扩展点

参考

添加新评论

传送门

热门文章

标签

最近回复

访客

其它