使用Kueue调度Jobs #
仓库路径: https://github.com/kubernetes-sigs/kueue
当前最新版本: v0.6.0
安装Kueue #
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/v0.6.0/manifests.yaml
查看Pod运行情况
[root@master-node:~]# kubectl -n kueue-system get pods
NAME READY STATUS RESTARTS AGE
kueue-controller-manager-85c4bbc999-lk4vv 2/2 Running 0 12m
[root@master-node:~]#
创建ResourceFlavor #
---
# ResourceFlavor: a named set of node resources that ClusterQueue flavors refer to.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: default-flavor  # This ResourceFlavor will be used for all the resources
创建ClusterQueue #
---
# ClusterQueue: cluster-scoped quota pool that LocalQueues point at.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: cluster-queue
spec:
  namespaceSelector: {}  # Available to all namespaces
  queueingStrategy: BestEffortFIFO  # Default queueing strategy
  resourceGroups:
    - coveredResources: ["cpu", "memory", "nvidia.com/gpu", "ephemeral-storage"]
      flavors:
        - name: "default-flavor"
          resources:
            - name: "cpu"
              nominalQuota: 10
            - name: "memory"
              nominalQuota: 10Gi
            - name: "nvidia.com/gpu"
              nominalQuota: 10
            - name: "ephemeral-storage"
              nominalQuota: 10Gi
创建LocalQueue #
---
apiVersion: v1
kind: Namespace
metadata:
  name: "team-a"
---
apiVersion: v1
kind: Namespace
metadata:
  name: "team-b"
---
# LocalQueue: namespaced entry point that forwards workloads to a ClusterQueue.
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: team-a  # LocalQueue under team-a namespace
  name: lq-team-a
spec:
  clusterQueue: cluster-queue  # Point to the ClusterQueue
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: team-b  # LocalQueue under team-b namespace
  name: lq-team-b
spec:
  clusterQueue: cluster-queue  # Point to the ClusterQueue
创建Jobs #
---
# Sample Job submitted to the team-a LocalQueue.
# Note: it must be created with `kubectl create -f` (generateName is used),
# and `suspend: true` hands start-up control to Kueue.
apiVersion: batch/v1
kind: Job
metadata:
  namespace: team-a  # Job under team-a namespace
  generateName: sample-job-team-a-
  labels:
    # Kueue selects the queue via this label (the annotation form is legacy).
    kueue.x-k8s.io/queue-name: lq-team-a  # Point to the LocalQueue
spec:
  ttlSecondsAfterFinished: 60  # Job will be deleted after 60 seconds
  parallelism: 3  # This Job will have 3 replicas running at the same time
  completions: 3  # This Job requires 3 completions
  suspend: true  # Set to true to allow Kueue to control the Job when it starts
  template:
    spec:
      nodeSelector:
        cloud.google.com/gke-accelerator: "nvidia-tesla-t4"  # Specify the GPU hardware
      containers:
        - name: dummy-job
          image: ir0cn/mtool:latest
          args: ["sleep", "10"]  # Sleep for 10 seconds
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
              ephemeral-storage: "512Mi"
              nvidia.com/gpu: "1"
            limits:
              cpu: "500m"
              memory: "512Mi"
              ephemeral-storage: "512Mi"
              nvidia.com/gpu: "1"
      restartPolicy: Never