mirror of
https://github.com/beclab/Olares
synced 2026-05-24 09:18:23 +00:00
feat: schedule/allocate pod by gpu bindings and different share modes (#1363)
This commit is contained in:
parent
e58743fa87
commit
0c5a80653e
29 changed files with 285 additions and 54 deletions
|
|
@ -1,6 +1,6 @@
|
|||
apiVersion: v2
|
||||
name: hami
|
||||
version: 2.5.0
|
||||
version: 2.5.1
|
||||
kubeVersion: ">= 1.16.0"
|
||||
description: Heterogeneous AI Computing Virtualization Middleware
|
||||
keywords:
|
||||
|
|
@ -12,4 +12,4 @@ maintainers:
|
|||
email: limengxuan@4paradigm.com
|
||||
- name: zhangxiao
|
||||
email: xiaozhang0210@hotmail.com
|
||||
appVersion: "2.5.0"
|
||||
appVersion: "2.5.1"
|
||||
|
|
|
|||
|
|
@ -0,0 +1,103 @@
|
|||
---
|
||||
apiVersion: apiextensions.k8s.io/v1
|
||||
kind: CustomResourceDefinition
|
||||
metadata:
|
||||
annotations:
|
||||
controller-gen.kubebuilder.io/version: v0.17.2
|
||||
name: gpubindings.gpu.bytetrade.io
|
||||
spec:
|
||||
group: gpu.bytetrade.io
|
||||
names:
|
||||
kind: GPUBinding
|
||||
listKind: GPUBindingList
|
||||
plural: gpubindings
|
||||
singular: gpubinding
|
||||
scope: Cluster
|
||||
versions:
|
||||
- name: v1alpha1
|
||||
schema:
|
||||
openAPIV3Schema:
|
||||
properties:
|
||||
apiVersion:
|
||||
description: |-
|
||||
APIVersion defines the versioned schema of this representation of an object.
|
||||
Servers should convert recognized schemas to the latest internal value, and
|
||||
may reject unrecognized values.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
|
||||
type: string
|
||||
kind:
|
||||
description: |-
|
||||
Kind is a string value representing the REST resource this object represents.
|
||||
Servers may infer this from the endpoint the client submits requests to.
|
||||
Cannot be updated.
|
||||
In CamelCase.
|
||||
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
|
||||
type: string
|
||||
metadata:
|
||||
type: object
|
||||
spec:
|
||||
properties:
|
||||
appName:
|
||||
type: string
|
||||
memory:
|
||||
anyOf:
|
||||
- type: integer
|
||||
- type: string
|
||||
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
|
||||
x-kubernetes-int-or-string: true
|
||||
podSelector:
|
||||
description: |-
|
||||
A label selector is a label query over a set of resources. The result of matchLabels and
|
||||
matchExpressions are ANDed. An empty label selector matches all objects. A null
|
||||
label selector matches no objects.
|
||||
properties:
|
||||
matchExpressions:
|
||||
description: matchExpressions is a list of label selector requirements.
|
||||
The requirements are ANDed.
|
||||
items:
|
||||
description: |-
|
||||
A label selector requirement is a selector that contains values, a key, and an operator that
|
||||
relates the key and values.
|
||||
properties:
|
||||
key:
|
||||
description: key is the label key that the selector applies
|
||||
to.
|
||||
type: string
|
||||
operator:
|
||||
description: |-
|
||||
operator represents a key's relationship to a set of values.
|
||||
Valid operators are In, NotIn, Exists and DoesNotExist.
|
||||
type: string
|
||||
values:
|
||||
description: |-
|
||||
values is an array of string values. If the operator is In or NotIn,
|
||||
the values array must be non-empty. If the operator is Exists or DoesNotExist,
|
||||
the values array must be empty. This array is replaced during a strategic
|
||||
merge patch.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- key
|
||||
- operator
|
||||
type: object
|
||||
type: array
|
||||
matchLabels:
|
||||
additionalProperties:
|
||||
type: string
|
||||
description: |-
|
||||
matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
|
||||
map is equivalent to an element of matchExpressions, whose key field is "key", the
|
||||
operator is "In", and the values array contains only "value". The requirements are ANDed.
|
||||
type: object
|
||||
type: object
|
||||
x-kubernetes-map-type: atomic
|
||||
uuid:
|
||||
type: string
|
||||
required:
|
||||
- appName
|
||||
- uuid
|
||||
type: object
|
||||
type: object
|
||||
served: true
|
||||
storage: true
|
||||
|
|
@ -23,6 +23,17 @@ If release name contains chart name it will be used as a full name.
|
|||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
Allow the release namespace to be overridden for multi-namespace deployments in combined charts
|
||||
*/}}
|
||||
{{- define "hami-vgpu.namespace" -}}
|
||||
{{- if .Values.namespaceOverride -}}
|
||||
{{- .Values.namespaceOverride -}}
|
||||
{{- else -}}
|
||||
{{- .Release.Namespace -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
{{/*
|
||||
The app name for Scheduler
|
||||
*/}}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ apiVersion: v1
|
|||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.device-plugin" . }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: hami-device-plugin
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ apiVersion: apps/v1
|
|||
kind: DaemonSet
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.device-plugin" . }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: hami-device-plugin
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
|
|
@ -26,8 +27,11 @@ spec:
|
|||
app.kubernetes.io/component: hami-device-plugin
|
||||
hami.io/webhook: ignore
|
||||
{{- include "hami-vgpu.selectorLabels" . | nindent 8 }}
|
||||
annotations:
|
||||
checksum/hami-device-plugin-config: {{ include (print $.Template.BasePath "/device-plugin/configmap.yaml") . | sha256sum }}
|
||||
checksum/hami-scheduler-device-config: {{ include (print $.Template.BasePath "/scheduler/device-configmap.yaml") . | sha256sum }}
|
||||
{{- if .Values.devicePlugin.podAnnotations }}
|
||||
annotations: {{ toYaml .Values.devicePlugin.podAnnotations | nindent 8 }}
|
||||
{{- toYaml .Values.devicePlugin.podAnnotations | nindent 8 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
{{- if .Values.devicePlugin.runtimeClassName }}
|
||||
|
|
@ -112,12 +116,12 @@ spec:
|
|||
- name: NVIDIA_MIG_MONITOR_DEVICES
|
||||
value: all
|
||||
- name: HOOK_PATH
|
||||
value: '{{ .Values.global.gpuHookPath }}/vgpu'
|
||||
value: {{ .Values.global.gpuHookPath }}/vgpu
|
||||
resources:
|
||||
{{- toYaml .Values.devicePlugin.vgpuMonitor.resources | nindent 12 }}
|
||||
volumeMounts:
|
||||
- name: ctrs
|
||||
mountPath: '{{ .Values.devicePlugin.monitorctrPath }}'
|
||||
mountPath: {{ .Values.devicePlugin.monitorctrPath }}
|
||||
- name: dockers
|
||||
mountPath: /run/docker
|
||||
- name: containerds
|
||||
|
|
@ -131,7 +135,7 @@ spec:
|
|||
volumes:
|
||||
- name: ctrs
|
||||
hostPath:
|
||||
path: '{{ .Values.devicePlugin.monitorctrPath }}'
|
||||
path: {{ .Values.devicePlugin.monitorctrPath }}
|
||||
- name: hosttmp
|
||||
hostPath:
|
||||
path: /tmp
|
||||
|
|
@ -143,10 +147,10 @@ spec:
|
|||
path: /run/containerd
|
||||
- name: device-plugin
|
||||
hostPath:
|
||||
path: '{{ .Values.devicePlugin.pluginPath }}'
|
||||
path: {{ .Values.devicePlugin.pluginPath }}
|
||||
- name: lib
|
||||
hostPath:
|
||||
path: '{{ .Values.devicePlugin.libPath }}'
|
||||
path: {{ .Values.devicePlugin.libPath }}
|
||||
- name: usrbin
|
||||
hostPath:
|
||||
path: /usr/bin
|
||||
|
|
|
|||
|
|
@ -13,4 +13,4 @@ roleRef:
|
|||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: {{ include "hami-vgpu.device-plugin" . }}
|
||||
namespace: {{ .Release.Namespace | quote }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ apiVersion: v1
|
|||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.device-plugin" . }}-monitor
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: hami-device-plugin
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ apiVersion: v1
|
|||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.device-plugin" . }}
|
||||
namespace: {{ .Release.Namespace | quote }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: "hami-device-plugin"
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,9 @@
|
|||
{{- if and .Values.devicePlugin.createRuntimeClass .Values.devicePlugin.runtimeClassName }}
|
||||
apiVersion: node.k8s.io/v1
|
||||
kind: RuntimeClass
|
||||
metadata:
|
||||
name: {{ .Values.devicePlugin.runtimeClassName }}
|
||||
annotations:
|
||||
helm.sh/hook: pre-install,pre-upgrade
|
||||
handler: nvidia
|
||||
{{- end }}
|
||||
|
|
@ -11,12 +11,5 @@ spec:
|
|||
targetPort: 6000
|
||||
selector:
|
||||
name: gpu-scheduler
|
||||
clusterIP: None
|
||||
clusterIPs:
|
||||
- None
|
||||
type: ClusterIP
|
||||
sessionAffinity: None
|
||||
ipFamilies:
|
||||
- IPv4
|
||||
ipFamilyPolicy: SingleStack
|
||||
internalTrafficPolicy: Cluster
|
||||
internalTrafficPolicy: Local
|
||||
|
|
|
|||
|
|
@ -0,0 +1,29 @@
|
|||
{{- if .Values.scheduler.certManager.enabled }}
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.scheduler" . }}-serving-cert
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: hami-scheduler
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
spec:
|
||||
dnsNames:
|
||||
- {{ include "hami-vgpu.scheduler" . }}.{{ include "hami-vgpu.namespace" . }}.svc
|
||||
- {{ include "hami-vgpu.scheduler" . }}.{{ include "hami-vgpu.namespace" . }}.svc.cluster.local
|
||||
issuerRef:
|
||||
kind: Issuer
|
||||
name: {{ include "hami-vgpu.scheduler" . }}-selfsigned-issuer
|
||||
secretName: {{ include "hami-vgpu.scheduler.tls" . }}
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Issuer
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.scheduler" . }}-selfsigned-issuer
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: hami-scheduler
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
spec:
|
||||
selfSigned: {}
|
||||
{{- end }}
|
||||
|
|
@ -3,6 +3,7 @@ apiVersion: v1
|
|||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.scheduler" . }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: hami-scheduler
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
|
|
@ -40,6 +41,14 @@ data:
|
|||
},
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.devices.enflame.enabled }}
|
||||
{{- range .Values.devices.enflame.customresources }}
|
||||
{
|
||||
"name": "{{ . }}",
|
||||
"ignoredByScheduler": true
|
||||
},
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{
|
||||
"name": "{{ .Values.resourceName }}",
|
||||
"ignoredByScheduler": true
|
||||
|
|
@ -80,6 +89,10 @@ data:
|
|||
"name": "{{ .Values.iluvatarResourceName }}",
|
||||
"ignoredByScheduler": true
|
||||
},
|
||||
{
|
||||
"name": "metax-tech.com/gpu",
|
||||
"ignoredByScheduler": true
|
||||
},
|
||||
{
|
||||
"name": "{{ .Values.metaxResourceName }}",
|
||||
"ignoredByScheduler": true
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ apiVersion: v1
|
|||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.scheduler" . }}-newversion
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: hami-scheduler
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
|
|
@ -49,6 +50,8 @@ data:
|
|||
ignoredByScheduler: true
|
||||
- name: {{ .Values.iluvatarResourceName }}
|
||||
ignoredByScheduler: true
|
||||
- name: "metax-tech.com/gpu"
|
||||
ignoredByScheduler: true
|
||||
- name: {{ .Values.metaxResourceName }}
|
||||
ignoredByScheduler: true
|
||||
- name: {{ .Values.metaxResourceCore }}
|
||||
|
|
@ -67,4 +70,10 @@ data:
|
|||
ignoredByScheduler: true
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.devices.enflame.enabled }}
|
||||
{{- range .Values.devices.enflame.customresources }}
|
||||
- name: {{ . }}
|
||||
ignoredByScheduler: true
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ apiVersion: apps/v1
|
|||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.scheduler" . }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: hami-scheduler
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
|
|
@ -27,8 +28,15 @@ spec:
|
|||
app.kubernetes.io/component: hami-scheduler
|
||||
{{- include "hami-vgpu.selectorLabels" . | nindent 8 }}
|
||||
hami.io/webhook: ignore
|
||||
annotations:
|
||||
{{- if ge (regexReplaceAll "[^0-9]" .Capabilities.KubeVersion.Minor "" | int) 22 }}
|
||||
checksum/hami-scheduler-newversion-config: {{ include (print $.Template.BasePath "/scheduler/configmapnew.yaml") . | sha256sum }}
|
||||
{{- else }}
|
||||
checksum/hami-scheduler-config: {{ include (print $.Template.BasePath "/scheduler/configmap.yaml") . | sha256sum }}
|
||||
{{- end }}
|
||||
checksum/hami-scheduler-device-config: {{ include (print $.Template.BasePath "/scheduler/device-configmap.yaml") . | sha256sum }}
|
||||
{{- if .Values.scheduler.podAnnotations }}
|
||||
annotations: {{ toYaml .Values.scheduler.podAnnotations | nindent 8 }}
|
||||
{{- toYaml .Values.scheduler.podAnnotations | nindent 8 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
{{- include "hami-vgpu.imagePullSecrets" . | nindent 6}}
|
||||
|
|
@ -53,7 +61,7 @@ spec:
|
|||
{{- end }}
|
||||
- --leader-elect={{ .Values.scheduler.leaderElect }}
|
||||
- --leader-elect-resource-name={{ .Values.schedulerName }}
|
||||
- --leader-elect-resource-namespace={{ .Release.Namespace }}
|
||||
- --leader-elect-resource-namespace={{ include "hami-vgpu.namespace" . }}
|
||||
resources:
|
||||
{{- toYaml .Values.scheduler.kubeScheduler.resources | nindent 12 }}
|
||||
volumeMounts:
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ apiVersion: v1
|
|||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.scheduler" . }}-device
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: hami-scheduler
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
|
|
@ -17,13 +18,14 @@ data:
|
|||
resourceCoreName: {{ .Values.resourceCores }}
|
||||
resourcePriorityName: {{ .Values.resourcePriority }}
|
||||
overwriteEnv: false
|
||||
defaultMemory: 16000
|
||||
defaultMemory: 0
|
||||
defaultCores: 0
|
||||
defaultGPUNum: 1
|
||||
deviceSplitCount: {{ .Values.devicePlugin.deviceSplitCount }}
|
||||
deviceMemoryScaling: {{ .Values.devicePlugin.deviceMemoryScaling }}
|
||||
deviceCoreScaling: {{ .Values.devicePlugin.deviceCoreScaling }}
|
||||
gpuCorePolicy: {{ .Values.devices.nvidia.gpuCorePolicy }}
|
||||
runtimeClassName: "{{ .Values.devicePlugin.runtimeClassName }}"
|
||||
knownMigGeometries:
|
||||
- models: [ "A30" ]
|
||||
allowedGeometries:
|
||||
|
|
@ -91,10 +93,12 @@ data:
|
|||
resourceCoreName: {{ .Values.dcuResourceCores }}
|
||||
metax:
|
||||
resourceCountName: "metax-tech.com/gpu"
|
||||
|
||||
resourceVCountName: {{ .Values.metaxResourceName }}
|
||||
resourceVMemoryName: {{ .Values.metaxResourceMem }}
|
||||
resourceVCoreName: {{ .Values.metaxResourceCore }}
|
||||
enflame:
|
||||
resourceCountName: "enflame.com/vgcu"
|
||||
resourcePercentageName: "enflame.com/vgcu-percentage"
|
||||
mthreads:
|
||||
resourceCountName: "mthreads.com/vgpu"
|
||||
resourceMemoryName: "mthreads.com/sgpu-memory"
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
|
|
@ -24,3 +25,4 @@ rules:
|
|||
resourceNames:
|
||||
- {{ include "hami-vgpu.fullname" . }}-admission
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
|
@ -1,3 +1,4 @@
|
|||
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
|
|
@ -15,4 +16,5 @@ roleRef:
|
|||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: {{ include "hami-vgpu.fullname" . }}-admission
|
||||
namespace: {{ .Release.Namespace | quote }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.fullname" . }}-admission-create
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
annotations:
|
||||
"helm.sh/hook": pre-install,pre-upgrade
|
||||
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
|
||||
|
|
@ -41,11 +43,11 @@ spec:
|
|||
- --cert-name=tls.crt
|
||||
- --key-name=tls.key
|
||||
{{- if .Values.scheduler.admissionWebhook.customURL.enabled }}
|
||||
- --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.admissionWebhook.customURL.host}}
|
||||
- --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) (include "hami-vgpu.namespace" .) .Values.scheduler.admissionWebhook.customURL.host}}
|
||||
{{- else }}
|
||||
- --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) .Release.Namespace }}
|
||||
- --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) (include "hami-vgpu.namespace" .) }}
|
||||
{{- end }}
|
||||
- --namespace={{ .Release.Namespace }}
|
||||
- --namespace={{ include "hami-vgpu.namespace" . }}
|
||||
- --secret-name={{ include "hami-vgpu.scheduler.tls" . }}
|
||||
restartPolicy: OnFailure
|
||||
serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission
|
||||
|
|
@ -58,3 +60,4 @@ spec:
|
|||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: {{ .Values.scheduler.patch.runAsUser }}
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.fullname" . }}-admission-patch
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
annotations:
|
||||
"helm.sh/hook": post-install,post-upgrade
|
||||
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
|
||||
|
|
@ -39,7 +41,7 @@ spec:
|
|||
args:
|
||||
- patch
|
||||
- --webhook-name={{ include "hami-vgpu.scheduler.webhook" . }}
|
||||
- --namespace={{ .Release.Namespace }}
|
||||
- --namespace={{ include "hami-vgpu.namespace" . }}
|
||||
- --patch-validating=false
|
||||
- --secret-name={{ include "hami-vgpu.scheduler.tls" . }}
|
||||
restartPolicy: OnFailure
|
||||
|
|
@ -53,3 +55,4 @@ spec:
|
|||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: {{ .Values.scheduler.patch.runAsUser }}
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
|
||||
{{- if .Values.podSecurityPolicy.enabled }}
|
||||
apiVersion: policy/v1beta1
|
||||
kind: PodSecurityPolicy
|
||||
|
|
@ -34,3 +35,4 @@ spec:
|
|||
- secret
|
||||
- downwardAPI
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: Role
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.fullname" . }}-admission
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
annotations:
|
||||
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
|
||||
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
|
||||
|
|
@ -16,3 +18,4 @@ rules:
|
|||
verbs:
|
||||
- get
|
||||
- create
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: RoleBinding
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.fullname" . }}-admission
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
annotations:
|
||||
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
|
||||
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
|
||||
|
|
@ -15,4 +17,5 @@ roleRef:
|
|||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: {{ include "hami-vgpu.fullname" . }}-admission
|
||||
namespace: {{ .Release.Namespace | quote }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -1,10 +1,13 @@
|
|||
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.fullname" . }}-admission
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
annotations:
|
||||
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
|
||||
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
|
||||
labels:
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: admission-webhook
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -12,4 +12,4 @@ roleRef:
|
|||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: {{ include "hami-vgpu.scheduler" . }}
|
||||
namespace: {{ .Release.Namespace | quote }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ apiVersion: v1
|
|||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.scheduler" . }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: hami-scheduler
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
|
|
@ -23,7 +24,7 @@ spec:
|
|||
protocol: TCP
|
||||
- name: monitor
|
||||
port: {{ .Values.scheduler.service.monitorPort | default 31993 }} # Default monitoring port is 31993
|
||||
targetPort: {{ .Values.scheduler.service.monitorTargetPort | default 31993 }}
|
||||
targetPort: {{ .Values.scheduler.service.monitorTargetPort | default 9395 }}
|
||||
{{- if eq (.Values.scheduler.service.type | default "NodePort") "NodePort" }} # If type is NodePort, set nodePort
|
||||
nodePort: {{ .Values.scheduler.service.monitorPort | default 31993 }}
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ apiVersion: v1
|
|||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: {{ include "hami-vgpu.scheduler" . }}
|
||||
namespace: {{ .Release.Namespace | quote }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
labels:
|
||||
app.kubernetes.io/component: "hami-scheduler"
|
||||
{{- include "hami-vgpu.labels" . | nindent 4 }}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,10 @@
|
|||
apiVersion: admissionregistration.k8s.io/v1
|
||||
kind: MutatingWebhookConfiguration
|
||||
metadata:
|
||||
{{- if .Values.scheduler.certManager.enabled }}
|
||||
annotations:
|
||||
cert-manager.io/inject-ca-from: {{ include "hami-vgpu.namespace" . }}/{{ include "hami-vgpu.scheduler" . }}-serving-cert
|
||||
{{- end }}
|
||||
name: {{ include "hami-vgpu.scheduler.webhook" . }}
|
||||
webhooks:
|
||||
- admissionReviewVersions:
|
||||
|
|
@ -11,7 +15,7 @@ webhooks:
|
|||
{{- else }}
|
||||
service:
|
||||
name: {{ include "hami-vgpu.scheduler" . }}
|
||||
namespace: {{ .Release.Namespace }}
|
||||
namespace: {{ include "hami-vgpu.namespace" . }}
|
||||
path: /webhook
|
||||
port: {{ .Values.scheduler.service.httpPort }}
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -2,8 +2,9 @@
|
|||
|
||||
nameOverride: ""
|
||||
fullnameOverride: ""
|
||||
namespaceOverride: ""
|
||||
imagePullSecrets: []
|
||||
version: "v2.5.2"
|
||||
version: "v2.5.2-share-01"
|
||||
|
||||
# Nvidia GPU Parameters
|
||||
resourceName: "nvidia.com/gpu"
|
||||
|
|
@ -106,7 +107,7 @@ scheduler:
|
|||
enabled: false
|
||||
# must be an endpoint using https.
|
||||
# should generate host certs here
|
||||
host: 127.0.0.1 # hostname or ip, can be your node's IP if you want to use https://<nodeIP>:<schedulerPort>/<path>
|
||||
host: 127.0.0.1 # hostname or ip, can be your node's IP if you want to use https://<nodeIP>:<schedulerPort>/<path>
|
||||
port: 31998
|
||||
path: /webhook
|
||||
whitelistNamespaces:
|
||||
|
|
@ -116,7 +117,14 @@ scheduler:
|
|||
# - istio-system
|
||||
reinvocationPolicy: Never
|
||||
failurePolicy: Ignore
|
||||
## TLS Certificate Option 1: Use cert-manager to generate self-signed certificate.
|
||||
## If enabled, always takes precedence over option 2.
|
||||
certManager:
|
||||
enabled: false
|
||||
## TLS Certificate Option 2: Use kube-webhook-certgen to generate self-signed certificate.
|
||||
## If true and certManager.enabled is false, Helm will automatically create a self-signed cert and secret for you.
|
||||
patch:
|
||||
enabled: true
|
||||
image: jettech/kube-webhook-certgen:v1.5.2
|
||||
imageNew: liangjw/kube-webhook-certgen:v1.1.1
|
||||
imagePullPolicy: IfNotPresent
|
||||
|
|
@ -130,6 +138,7 @@ scheduler:
|
|||
httpPort: 443 # HTTP port
|
||||
schedulerPort: 31998 # NodePort for HTTP
|
||||
monitorPort: 31993 # Monitoring port
|
||||
monitorTargetPort: 9395
|
||||
labels: {}
|
||||
annotations: {}
|
||||
|
||||
|
|
@ -138,25 +147,31 @@ devicePlugin:
|
|||
monitorimage: "beclab/hami"
|
||||
monitorctrPath: /usr/local/vgpu/containers
|
||||
imagePullPolicy: IfNotPresent
|
||||
deviceSplitCount: 100
|
||||
deviceMemoryScaling: 100
|
||||
deviceCoreScaling: 100
|
||||
deviceSplitCount: 10
|
||||
deviceMemoryScaling: 1
|
||||
deviceCoreScaling: 1
|
||||
# The runtime class name to be used by the device plugin, and added to the pod.spec.runtimeClassName of applications utilizing NVIDIA GPUs
|
||||
runtimeClassName: ""
|
||||
# Whether to create runtime class, name comes from runtimeClassName when it is set
|
||||
createRuntimeClass: false
|
||||
migStrategy: "none"
|
||||
disablecorelimit: "false"
|
||||
passDeviceSpecsEnabled: false
|
||||
extraArgs:
|
||||
- -v=4
|
||||
|
||||
service:
|
||||
type: ClusterIP # Default type is NodePort, can be changed to ClusterIP
|
||||
httpPort: 31992
|
||||
labels: {}
|
||||
annotations: {}
|
||||
|
||||
pluginPath: /var/lib/kubelet/device-plugins
|
||||
libPath: /usr/local/vgpu
|
||||
|
||||
podAnnotations: {}
|
||||
nvidianodeSelector:
|
||||
gpu.bytetrade.io/cuda-supported: 'true'
|
||||
gpu.bytetrade.io/cuda-supported: "true"
|
||||
tolerations: []
|
||||
# The updateStrategy for DevicePlugin DaemonSet.
|
||||
# If you want to update the DaemonSet manually, set type to "OnDelete".
|
||||
|
|
@ -189,6 +204,11 @@ devicePlugin:
|
|||
# memory: 100Mi
|
||||
|
||||
devices:
|
||||
enflame:
|
||||
enabled: false
|
||||
customresources:
|
||||
- enflame.com/vgcu
|
||||
- enflame.com/vgcu-percentage
|
||||
mthreads:
|
||||
enabled: false
|
||||
customresources:
|
||||
|
|
@ -271,13 +291,13 @@ dcgmExporter:
|
|||
|
||||
# Annotations to be added to dcgm-exporter pods
|
||||
podAnnotations: {}
|
||||
# Using this annotation which is required for prometheus scraping
|
||||
# Using this annotation which is required for prometheus scraping
|
||||
# prometheus.io/scrape: "true"
|
||||
# prometheus.io/port: "9400"
|
||||
# prometheus.io/port: "9400"
|
||||
|
||||
# The SecurityContext for the dcgm-exporter pods
|
||||
podSecurityContext: {}
|
||||
# fsGroup: 2000
|
||||
# fsGroup: 2000
|
||||
|
||||
# The SecurityContext for the dcgm-exporter containers
|
||||
securityContext:
|
||||
|
|
@ -305,24 +325,24 @@ dcgmExporter:
|
|||
# memory: 128Mi
|
||||
# requests:
|
||||
# cpu: 100m
|
||||
# memory: 128Mi
|
||||
# memory: 128Mi
|
||||
serviceMonitor:
|
||||
apiVersion: "monitoring.coreos.com/v1"
|
||||
enabled: true
|
||||
interval: 15s
|
||||
honorLabels: false
|
||||
additionalLabels: {}
|
||||
# monitoring: prometheus
|
||||
# monitoring: prometheus
|
||||
relabelings: []
|
||||
# - sourceLabels: [__meta_kubernetes_pod_node_name]
|
||||
# separator: ;
|
||||
# regex: ^(.*)$
|
||||
# targetLabel: nodename
|
||||
# replacement: $1
|
||||
# action: replace
|
||||
# action: replace
|
||||
|
||||
nodeSelector: {}
|
||||
# node: gpu
|
||||
# node: gpu
|
||||
|
||||
tolerations: []
|
||||
# - operator: Exists
|
||||
|
|
@ -333,7 +353,7 @@ dcgmExporter:
|
|||
# nodeSelectorTerms:
|
||||
# - matchExpressions:
|
||||
# - key: nvidia-gpu
|
||||
# operator: Exists
|
||||
# operator: Exists
|
||||
|
||||
extraHostVolumes: []
|
||||
# - name: host-binaries
|
||||
|
|
@ -344,8 +364,8 @@ dcgmExporter:
|
|||
configMap:
|
||||
name: exporter-metrics-config-map
|
||||
items:
|
||||
- key: metrics
|
||||
path: default-counters.csv
|
||||
- key: metrics
|
||||
path: default-counters.csv
|
||||
|
||||
extraVolumeMounts:
|
||||
- name: exporter-metrics-volume
|
||||
|
|
@ -391,9 +411,9 @@ dcgmExporter:
|
|||
# Object containing <user>:<passwords> key-value pairs for each user that will have access via basic authentication
|
||||
users: {}
|
||||
|
||||
# Customized list of metrics to emit. Expected to be in the same format (CSV) as the default list.
|
||||
# Must be the complete list and is not additive. If unset, the default list will take effect.
|
||||
# customMetrics: |
|
||||
# Customized list of metrics to emit. Expected to be in the same format (CSV) as the default list.
|
||||
# Must be the complete list and is not additive. If unset, the default list will take effect.
|
||||
# customMetrics: |
|
||||
# Format
|
||||
# If line starts with a '#' it is considered a comment
|
||||
# DCGM FIELD, Prometheus metric type, help message
|
||||
|
|
@ -442,7 +462,7 @@ webui:
|
|||
# drop:
|
||||
# - ALL
|
||||
# readOnlyRootFilesystem: true
|
||||
# runAsNonRoot: true
|
||||
# runAsNonRoot: true
|
||||
# runAsUser: 1000
|
||||
|
||||
service:
|
||||
|
|
@ -453,7 +473,7 @@ webui:
|
|||
enabled: false
|
||||
className: ""
|
||||
annotations: {}
|
||||
# kubernetes.io/ingress.class: nginx
|
||||
# kubernetes.io/ingress.class: nginx
|
||||
# kubernetes.io/tls-acme: "true"
|
||||
hosts:
|
||||
- host: chart-example.local
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ target: prebuilt
|
|||
output:
|
||||
containers:
|
||||
-
|
||||
name: beclab/hami:v2.5.2
|
||||
name: beclab/hami:v2.5.2-share-01
|
||||
-
|
||||
name: projecthami/hami-webui-fe-oss:v1.0.5
|
||||
-
|
||||
|
|
|
|||
Loading…
Reference in a new issue