---
# Deployment that runs a single pod able to use the GPU.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-master
  namespace: gpu
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gpu-master
  template:
    metadata:
      labels:
        # Must stay identical to spec.selector.matchLabels above.
        app: gpu-master
    spec:
      hostname: gpu-master
      containers:
        - name: gpu-master
          image: 192.168.168.10:5000/library/pytorch-gpu:v3
          env:
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            # NOTE(review): `all` combined with `privileged: true` below may
            # expose every GPU on the node, not only the one requested in
            # `resources` - confirm this is intended.
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
          securityContext:
            privileged: true
            runAsUser: 0
          resources:
            # Extended resource names must be domain-qualified; the original
            # key was a bare `/gpu` with no domain. `nvidia.com/gpu` is assumed
            # from the NVIDIA_* variables above - verify against the device
            # plugin installed in the cluster.
            limits:
              nvidia.com/gpu: "1"
            requests:
              nvidia.com/gpu: "1"
          volumeMounts:
            # Host directory with the code, visible in the container
            # at /persistent.
            - name: code-host-path
              mountPath: /persistent
      volumes:
        - name: code-host-path
          hostPath:
            path: /root/gpu/gpucode
---
# Job that runs /persistent/test.py in a container able to use the GPU.
apiVersion: batch/v1
kind: Job
metadata:
  name: gpu-job
  namespace: gpu
  labels:
    app-name: gpu-job
    job-name: gpu-job
spec:
  backoffLimit: 6
  parallelism: 1
  template:
    metadata:
      name: gpu-job
      labels:
        app-name: gpu-job
        job-name: gpu-job
    spec:
      dnsPolicy: ClusterFirst
      hostname: gpu-job
      restartPolicy: OnFailure
      schedulerName: default-scheduler
      securityContext: {}
      containers:
        - name: gpu-job
          image: 192.168.168.10:5000/library/pytorch-gpu:v3
          imagePullPolicy: IfNotPresent
          # Runs the test script with the interpreter of the edu_pytorch
          # conda environment; the script comes from the hostPath volume
          # mounted at /persistent.
          command:
            - /bin/bash
            - -c
            - '/usr/local/anaconda2/envs/edu_pytorch/bin/python3.6 /persistent/test.py '
          env:
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: compute,utility
            # NOTE(review): `all` combined with `privileged: true` below may
            # expose every GPU on the node, not only the one requested in
            # `resources` - confirm this is intended.
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
          resources:
            # Extended resource names must be domain-qualified; the original
            # key was a bare `/gpu` with no domain. `nvidia.com/gpu` is assumed
            # from the NVIDIA_* variables above - verify against the device
            # plugin installed in the cluster.
            limits:
              nvidia.com/gpu: "1"
            requests:
              nvidia.com/gpu: "1"
          securityContext:
            privileged: true
            procMount: Default
          volumeMounts:
            - name: code-host-path
              mountPath: /persistent
      volumes:
        - name: code-host-path
          hostPath:
            path: /root/gpu/gpucode