- Environment check: view the system environment
# lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description:    Ubuntu 22.04.4 LTS
Release:        22.04
Codename:       jammy
# cat /etc/redhat-release
Rocky Linux release 9.3 (Blue Onyx)
# kubectl version
Client Version: v1.30.2
Kustomize Version: v5.0.4-0.20230601165947-6ce0bf390ce3
Server Version: v1.25.16
WARNING: version difference between client (1.30) and server (1.25) exceeds the supported minor version skew of +/-1
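Before installing the container toolkit, it is worth confirming that the GPU host already has a working NVIDIA driver. A minimal check (assuming nvidia-smi ships with the installed driver; the values shown match the nvidia-smi dump later in this section):

# nvidia-smi --query-gpu=name,driver_version --format=csv
name, driver_version
NVIDIA GeForce RTX 4090, 555.42.06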
- Install the NVIDIA Docker plugin (NVIDIA Container Toolkit). Install it on the host that has GPU resources; that host serves as a Node of the K8s cluster. Set up the package source:
# curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
  && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
    sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
Optionally enable the experimental channel by uncommenting it in the repo list, then install the toolkit:
# sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list

# sudo apt-get update
# sudo apt-get install -y nvidia-container-toolkit
# sudo nvidia-ctk runtime configure --runtime=docker
INFO[0000] Loading config from /etc/docker/daemon.json
INFO[0000] Wrote updated config to /etc/docker/daemon.json
INFO[0000] It is recommended that docker daemon be restarted.
# cat /etc/docker/daemon.json
{
    "insecure-registries": ["192.168.3.61"],
    "registry-mirrors": [
        "https://7sl94zzz.mirror.aliyuncs.com",
        "https://hub.atomgit.com",
        "https://docker.awsl9527.cn"
    ],
    "runtimes": {
        "nvidia": {
            "args": [],
            "path": "nvidia-container-runtime"
        }
    }
}
# systemctl daemon-reload
# systemctl restart docker
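After the restart, confirm that Docker registered the new runtime (the exact output varies slightly across Docker versions):

# docker info | grep -i runtime
 Runtimes: io.containerd.runc.v2 nvidia runc
 Default Runtime: runc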
- Verify the configuration by using Docker to call the GPU: start a container image and view the GPU info
~# docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi
Sat Oct 12 01:33:33 2024
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 4090        Off |   00000000:01:00.0 Off |                  Off |
|  0%   53C    P2             59W /  450W |    4795MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+
- Deploy the NVIDIA device plugin for Kubernetes
$ kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.16.1/deployments/static/nvidia-device-plugin.yml
# cat nvidia-device-plugin.yml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
      - image: nvcr.io/nvidia/k8s-device-plugin:v0.16.1
        name: nvidia-device-plugin-ctr
        env:
        - name: FAIL_ON_INIT_ERROR
          value: "false"
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
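Once the DaemonSet is created, check that a plugin Pod is running on each node (the Pod name suffixes will differ in your cluster; the label comes from the manifest above):

# kubectl get pods -n kube-system -l name=nvidia-device-plugin-ds -o wide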
# kubectl logs -f nvidia-device-plugin-daemonset-8bltf -n kube-system
I1012 02:15:37.171056       1 main.go:199] Starting FS watcher.
I1012 02:15:37.171239       1 main.go:206] Starting OS watcher.
I1012 02:15:37.172177       1 main.go:221] Starting Plugins.
I1012 02:15:37.172236       1 main.go:278] Loading configuration.
I1012 02:15:37.173224       1 main.go:303] Updating config with default resource matching patterns.
I1012 02:15:37.173717       1 main.go:314] Running with config:
{
  "version": "v1",
  "flags": {
    "migStrategy": "none",
    "failOnInitError": false,
    "mpsRoot": "",
    "nvidiaDriverRoot": "/",
    "nvidiaDevRoot": "/",
    "gdsEnabled": false,
    "mofedEnabled": false,
    "useNodeFeatureAPI": null,
    "deviceDiscoveryStrategy": "auto",
    "plugin": {
      "passDeviceSpecs": false,
      "deviceListStrategy": ["envvar"],
      "deviceIDStrategy": "uuid",
      "cdiAnnotationPrefix": "cdi.k8s.io/",
      "nvidiaCTKPath": "/usr/bin/nvidia-ctk",
      "containerDriverRoot": "/driver-root"
    }
  },
  "resources": {
    "gpus": [
      {
        "pattern": "*",
        "name": "nvidia.com/gpu"
      }
    ]
  },
  "sharing": {
    "timeSlicing": {}
  }
}
I1012 02:15:37.173760       1 main.go:317] Retrieving plugins.
E1012 02:15:37.174052       1 factory.go:87] Incompatible strategy detected auto
E1012 02:15:37.174086       1 factory.go:88] If this is a GPU node, did you configure the NVIDIA Container Toolkit?
E1012 02:15:37.174096       1 factory.go:89] You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites
E1012 02:15:37.174104       1 factory.go:90] You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start
E1012 02:15:37.174113       1 factory.go:91] If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes
I1012 02:15:37.174123       1 main.go:346] No devices found. Waiting indefinitely.
There are two possible causes for this error:
- The Node this plugin Pod was deployed to is not a GPU node, i.e. the Node has no GPU resources.
- The Node has GPU resources, but Docker was not configured with the NVIDIA runtime.

A Node without GPU resources naturally cannot be used, but a Node that does have GPU resources will also report this error. The fix for GPU nodes is to modify the Docker configuration file and add the default-runtime setting:
# cat /etc/docker/daemon.json
{
    "insecure-registries": ["192.168.3.61"],
    "registry-mirrors": [
        "https://7sl94zzz.mirror.aliyuncs.com",
        "https://hub.atomgit.com",
        "https://docker.awsl9527.cn"
    ],
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "args": [],
            "path": "/usr/bin/nvidia-container-runtime"
        }
    }
}
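The new default runtime only takes effect after Docker restarts, and the plugin Pod must be recreated so it can re-detect the GPU (which is why the Pod suffix below differs from the earlier one). A minimal sequence, with the DaemonSet name taken from the manifest above:

# systemctl daemon-reload && systemctl restart docker
# kubectl rollout restart daemonset nvidia-device-plugin-daemonset -n kube-system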

# kubectl logs -f nvidia-device-plugin-daemonset-mp5ql -n kube-system
I1012 02:22:00.990246       1 main.go:199] Starting FS watcher.
I1012 02:22:00.990278       1 main.go:206] Starting OS watcher.
I1012 02:22:00.990373       1 main.go:221] Starting Plugins.
I1012 02:22:00.990382       1 main.go:278] Loading configuration.
I1012 02:22:00.990692       1 main.go:303] Updating config with default resource matching patterns.
I1012 02:22:00.990776       1 main.go:314] Running with config:
{
  "version": "v1",
  "flags": {
    "migStrategy": "none",
    "failOnInitError": false,
    "mpsRoot": "",
    "nvidiaDriverRoot": "/",
    "nvidiaDevRoot": "/",
    "gdsEnabled": false,
    "mofedEnabled": false,
    "useNodeFeatureAPI": null,
    "deviceDiscoveryStrategy": "auto",
    "plugin": {
      "passDeviceSpecs": false,
      "deviceListStrategy": ["envvar"],
      "deviceIDStrategy": "uuid",
      "cdiAnnotationPrefix": "cdi.k8s.io/",
      "nvidiaCTKPath": "/usr/bin/nvidia-ctk",
      "containerDriverRoot": "/driver-root"
    }
  },
  "resources": {
    "gpus": [
      {
        "pattern": "*",
        "name": "nvidia.com/gpu"
      }
    ]
  },
  "sharing": {
    "timeSlicing": {}
  }
}
I1012 02:22:00.990780       1 main.go:317] Retrieving plugins.
I1012 02:22:01.010950       1 server.go:216] Starting GRPC server for 'nvidia.com/gpu'
I1012 02:22:01.011281       1 server.go:147] Starting to serve 'nvidia.com/gpu' on /var/lib/kubelet/device-plugins/nvidia-gpu.sock
I1012 02:22:01.012376       1 server.go:154] Registered device plugin for 'nvidia.com/gpu' with Kubelet
# kubectl describe node aiserver003087
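The describe output should now list nvidia.com/gpu under Capacity and Allocatable. A more targeted query (assuming the node name above; it prints the allocatable GPU count, 1 on this host):

# kubectl get node aiserver003087 -o jsonpath='{.status.allocatable.nvidia\.com/gpu}'
1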

# cat gpu_test.yaml
apiVersion: v1
kind: Pod
metadata:
  name: ffmpeg-pod
spec:
  nodeName: aiserver003087    # pin to the node that has a GPU
  containers:
  - name: ffmpeg-container
    image: nightseas/ffmpeg:latest    # ran into issues with the Aliyun private registry in k8s, so use a public image for now
    command: ["/bin/bash", "-ce", "tail -f /dev/null"]
    resources:
      limits:
        nvidia.com/gpu: 1    # request 1 GPU
# kubectl apply -f gpu_test.yaml
pod/ffmpeg-pod configured
# kubectl cp test.mp4 ffmpeg-pod:/root
# kubectl exec -it ffmpeg-pod -- bash
# ffmpeg -hwaccel cuvid -c:v h264_cuvid -i test.mp4 -vf scale_npp=1280:720 -vcodec h264_nvenc out.mp4
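While the transcode is running, you can confirm from another shell that the Pod is really using the GPU; the ffmpeg process should appear in the nvidia-smi process list with GPU memory in use:

# kubectl exec ffmpeg-pod -- nvidia-smi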
# kubectl label nodes aiserver003087 gpu=true
The modification goes in the same place in a Deployment manifest; see the sketch below.
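For reference, a minimal sketch of what the equivalent Deployment could look like (the name, replicas, and app label are illustrative, not from the original):

apiVersion: apps/v1
kind: Deployment
metadata:
  name: ffmpeg-deploy            # hypothetical name, for illustration only
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ffmpeg
  template:
    metadata:
      labels:
        app: ffmpeg
    spec:
      containers:
      - name: ffmpeg-container
        image: nightseas/ffmpeg:latest
        command: ["/bin/bash", "-ce", "tail -f /dev/null"]
        resources:
          limits:
            nvidia.com/gpu: 1    # the GPU limit sits in the Pod template, exactly as in the Pod manifest
      nodeSelector:
        gpu: "true"              # same quoted label value as in the Pod example below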

Modify the GPU test Pod's YAML to use a label selector instead:
# cat gpu_test.yaml
apiVersion: v1
kind: Pod
metadata:
  name: ffmpeg-pod
spec:
  #nodeName: aiserver003087    # pin to the node that has a GPU
  containers:
  - name: ffmpeg-container
    image: nightseas/ffmpeg:latest    # ran into issues with the Aliyun private registry in k8s, so use a public image for now
    command: ["/bin/bash", "-ce", "tail -f /dev/null"]
    resources:
      limits:
        nvidia.com/gpu: 1
  nodeSelector:
    gpu: "true"
    #kubernetes.io/os: linux
Note: the label selector value must be quoted ("true"); otherwise kubectl apply reports an error, because a bare bool cannot be used as a label selector value. See the example below.
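To illustrate, this is the form that fails (the exact error text varies by client version, but the bool is rejected because label values must be strings):

  nodeSelector:
    gpu: true    # YAML parses this as a bool; apply fails with an unmarshal error along the lines of
                 # "cannot unmarshal bool into ... .spec.nodeSelector of type string"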
