feat: schedule/allocate pod by gpu bindings and different share modes (#1363)

This commit is contained in:
dkeven 2025-05-29 20:24:53 +08:00 committed by GitHub
parent e58743fa87
commit 0c5a80653e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
29 changed files with 285 additions and 54 deletions

View file

@ -1,6 +1,6 @@
apiVersion: v2
name: hami
version: 2.5.0
version: 2.5.1
kubeVersion: ">= 1.16.0"
description: Heterogeneous AI Computing Virtualization Middleware
keywords:
@ -12,4 +12,4 @@ maintainers:
email: limengxuan@4paradigm.com
- name: zhangxiao
email: xiaozhang0210@hotmail.com
appVersion: "2.5.0"
appVersion: "2.5.1"

View file

@ -0,0 +1,103 @@
---
# CustomResourceDefinition for the cluster-scoped GPUBinding resource
# (API group gpu.bytetrade.io, single served/stored version v1alpha1).
# A GPUBinding spec carries:
#   - appName     (string, required)
#   - uuid        (string, required)
#   - memory      (optional; integer or quantity-style string)
#   - podSelector (optional; label selector with matchLabels / matchExpressions)
# NOTE(review): the controller-gen annotation suggests this manifest is generated;
# confirm whether schema changes should be made in the Go API types instead.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.17.2
name: gpubindings.gpu.bytetrade.io
spec:
group: gpu.bytetrade.io
names:
kind: GPUBinding
listKind: GPUBindingList
plural: gpubindings
singular: gpubinding
scope: Cluster
versions:
- name: v1alpha1
schema:
openAPIV3Schema:
properties:
apiVersion:
description: |-
APIVersion defines the versioned schema of this representation of an object.
Servers should convert recognized schemas to the latest internal value, and
may reject unrecognized values.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
type: string
kind:
description: |-
Kind is a string value representing the REST resource this object represents.
Servers may infer this from the endpoint the client submits requests to.
Cannot be updated.
In CamelCase.
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
type: string
metadata:
type: object
spec:
properties:
appName:
type: string
# memory accepts either an integer or a string matching the pattern below
# (x-kubernetes-int-or-string). The pattern presumably mirrors the Kubernetes
# resource quantity syntax, e.g. "512Mi" -- TODO confirm the expected unit.
memory:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
podSelector:
description: |-
A label selector is a label query over a set of resources. The result of matchLabels and
matchExpressions are ANDed. An empty label selector matches all objects. A null
label selector matches no objects.
properties:
matchExpressions:
description: matchExpressions is a list of label selector requirements.
The requirements are ANDed.
items:
description: |-
A label selector requirement is a selector that contains values, a key, and an operator that
relates the key and values.
properties:
key:
description: key is the label key that the selector applies
to.
type: string
operator:
description: |-
operator represents a key's relationship to a set of values.
Valid operators are In, NotIn, Exists and DoesNotExist.
type: string
values:
description: |-
values is an array of string values. If the operator is In or NotIn,
the values array must be non-empty. If the operator is Exists or DoesNotExist,
the values array must be empty. This array is replaced during a strategic
merge patch.
items:
type: string
type: array
required:
- key
- operator
type: object
type: array
matchLabels:
additionalProperties:
type: string
description: |-
matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
map is equivalent to an element of matchExpressions, whose key field is "key", the
operator is "In", and the values array contains only "value". The requirements are ANDed.
type: object
type: object
x-kubernetes-map-type: atomic
uuid:
type: string
# Only appName and uuid are mandatory; memory and podSelector may be omitted.
required:
- appName
- uuid
type: object
type: object
served: true
storage: true

View file

@ -23,6 +23,17 @@ If release name contains chart name it will be used as a full name.
{{- end -}}
{{- end -}}
{{/*
Namespace used for every namespaced resource of this chart.
Resolves to .Values.namespaceOverride when that value is non-empty and falls
back to .Release.Namespace otherwise, so the chart can be placed in its own
namespace when it is bundled into a combined (umbrella) chart.
*/}}
{{- define "hami-vgpu.namespace" -}}
{{- default .Release.Namespace .Values.namespaceOverride -}}
{{- end -}}
{{/*
The app name for Scheduler
*/}}

View file

@ -2,6 +2,7 @@ apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "hami-vgpu.device-plugin" . }}
namespace: {{ include "hami-vgpu.namespace" . }}
labels:
app.kubernetes.io/component: hami-device-plugin
{{- include "hami-vgpu.labels" . | nindent 4 }}

View file

@ -2,6 +2,7 @@ apiVersion: apps/v1
kind: DaemonSet
metadata:
name: {{ include "hami-vgpu.device-plugin" . }}
namespace: {{ include "hami-vgpu.namespace" . }}
labels:
app.kubernetes.io/component: hami-device-plugin
{{- include "hami-vgpu.labels" . | nindent 4 }}
@ -26,8 +27,11 @@ spec:
app.kubernetes.io/component: hami-device-plugin
hami.io/webhook: ignore
{{- include "hami-vgpu.selectorLabels" . | nindent 8 }}
annotations:
checksum/hami-device-plugin-config: {{ include (print $.Template.BasePath "/device-plugin/configmap.yaml") . | sha256sum }}
checksum/hami-scheduler-device-config: {{ include (print $.Template.BasePath "/scheduler/device-configmap.yaml") . | sha256sum }}
{{- if .Values.devicePlugin.podAnnotations }}
annotations: {{ toYaml .Values.devicePlugin.podAnnotations | nindent 8 }}
{{- toYaml .Values.devicePlugin.podAnnotations | nindent 8 }}
{{- end }}
spec:
{{- if .Values.devicePlugin.runtimeClassName }}
@ -112,12 +116,12 @@ spec:
- name: NVIDIA_MIG_MONITOR_DEVICES
value: all
- name: HOOK_PATH
value: '{{ .Values.global.gpuHookPath }}/vgpu'
value: {{ .Values.global.gpuHookPath }}/vgpu
resources:
{{- toYaml .Values.devicePlugin.vgpuMonitor.resources | nindent 12 }}
volumeMounts:
- name: ctrs
mountPath: '{{ .Values.devicePlugin.monitorctrPath }}'
mountPath: {{ .Values.devicePlugin.monitorctrPath }}
- name: dockers
mountPath: /run/docker
- name: containerds
@ -131,7 +135,7 @@ spec:
volumes:
- name: ctrs
hostPath:
path: '{{ .Values.devicePlugin.monitorctrPath }}'
path: {{ .Values.devicePlugin.monitorctrPath }}
- name: hosttmp
hostPath:
path: /tmp
@ -143,10 +147,10 @@ spec:
path: /run/containerd
- name: device-plugin
hostPath:
path: '{{ .Values.devicePlugin.pluginPath }}'
path: {{ .Values.devicePlugin.pluginPath }}
- name: lib
hostPath:
path: '{{ .Values.devicePlugin.libPath }}'
path: {{ .Values.devicePlugin.libPath }}
- name: usrbin
hostPath:
path: /usr/bin

View file

@ -13,4 +13,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: {{ include "hami-vgpu.device-plugin" . }}
namespace: {{ .Release.Namespace | quote }}
namespace: {{ include "hami-vgpu.namespace" . }}

View file

@ -2,6 +2,7 @@ apiVersion: v1
kind: Service
metadata:
name: {{ include "hami-vgpu.device-plugin" . }}-monitor
namespace: {{ include "hami-vgpu.namespace" . }}
labels:
app.kubernetes.io/component: hami-device-plugin
{{- include "hami-vgpu.labels" . | nindent 4 }}

View file

@ -2,7 +2,7 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "hami-vgpu.device-plugin" . }}
namespace: {{ .Release.Namespace | quote }}
namespace: {{ include "hami-vgpu.namespace" . }}
labels:
app.kubernetes.io/component: "hami-device-plugin"
{{- include "hami-vgpu.labels" . | nindent 4 }}

View file

@ -0,0 +1,9 @@
{{/*
Optional RuntimeClass for GPU workloads.
Rendered only when devicePlugin.createRuntimeClass is true AND
devicePlugin.runtimeClassName is non-empty; the class is named after
runtimeClassName and always points at the "nvidia" handler.
It is installed as a pre-install/pre-upgrade hook so that it exists before
the remaining chart resources are applied.
*/}}
{{- if and .Values.devicePlugin.createRuntimeClass .Values.devicePlugin.runtimeClassName }}
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  # Quoted so that names that look like booleans or numbers still render as YAML strings.
  name: {{ .Values.devicePlugin.runtimeClassName | quote }}
  annotations:
    helm.sh/hook: pre-install,pre-upgrade
handler: nvidia
{{- end }}

View file

@ -11,12 +11,5 @@ spec:
targetPort: 6000
selector:
name: gpu-scheduler
clusterIP: None
clusterIPs:
- None
type: ClusterIP
sessionAffinity: None
ipFamilies:
- IPv4
ipFamilyPolicy: SingleStack
internalTrafficPolicy: Cluster
internalTrafficPolicy: Local

View file

@ -0,0 +1,29 @@
{{- if .Values.scheduler.certManager.enabled }}
{{- /* Resolve the shared names once; they are reused by both documents below. */}}
{{- $scheduler := include "hami-vgpu.scheduler" . }}
{{- $namespace := include "hami-vgpu.namespace" . }}
{{- $issuer := printf "%s-selfsigned-issuer" $scheduler }}
# Serving certificate for the scheduler service, issued by the self-signed
# Issuer declared in the second document and stored in the scheduler TLS secret.
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: {{ $scheduler }}-serving-cert
  namespace: {{ $namespace }}
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
spec:
  dnsNames:
    - {{ $scheduler }}.{{ $namespace }}.svc
    - {{ $scheduler }}.{{ $namespace }}.svc.cluster.local
  issuerRef:
    kind: Issuer
    name: {{ $issuer }}
  secretName: {{ include "hami-vgpu.scheduler.tls" . }}
---
# Namespace-local self-signed Issuer referenced by the Certificate above.
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
  name: {{ $issuer }}
  namespace: {{ $namespace }}
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
spec:
  selfSigned: {}
{{- end }}

View file

@ -3,6 +3,7 @@ apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "hami-vgpu.scheduler" . }}
namespace: {{ include "hami-vgpu.namespace" . }}
labels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.labels" . | nindent 4 }}
@ -40,6 +41,14 @@ data:
},
{{- end }}
{{- end }}
{{- if .Values.devices.enflame.enabled }}
{{- range .Values.devices.enflame.customresources }}
{
"name": "{{ . }}",
"ignoredByScheduler": true
},
{{- end }}
{{- end }}
{
"name": "{{ .Values.resourceName }}",
"ignoredByScheduler": true
@ -80,6 +89,10 @@ data:
"name": "{{ .Values.iluvatarResourceName }}",
"ignoredByScheduler": true
},
{
"name": "metax-tech.com/gpu",
"ignoredByScheduler": true
},
{
"name": "{{ .Values.metaxResourceName }}",
"ignoredByScheduler": true

View file

@ -3,6 +3,7 @@ apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "hami-vgpu.scheduler" . }}-newversion
namespace: {{ include "hami-vgpu.namespace" . }}
labels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.labels" . | nindent 4 }}
@ -49,6 +50,8 @@ data:
ignoredByScheduler: true
- name: {{ .Values.iluvatarResourceName }}
ignoredByScheduler: true
- name: "metax-tech.com/gpu"
ignoredByScheduler: true
- name: {{ .Values.metaxResourceName }}
ignoredByScheduler: true
- name: {{ .Values.metaxResourceCore }}
@ -67,4 +70,10 @@ data:
ignoredByScheduler: true
{{- end }}
{{- end }}
{{- if .Values.devices.enflame.enabled }}
{{- range .Values.devices.enflame.customresources }}
- name: {{ . }}
ignoredByScheduler: true
{{- end }}
{{- end }}
{{- end }}

View file

@ -2,6 +2,7 @@ apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "hami-vgpu.scheduler" . }}
namespace: {{ include "hami-vgpu.namespace" . }}
labels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.labels" . | nindent 4 }}
@ -27,8 +28,15 @@ spec:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.selectorLabels" . | nindent 8 }}
hami.io/webhook: ignore
annotations:
{{- if ge (regexReplaceAll "[^0-9]" .Capabilities.KubeVersion.Minor "" | int) 22 }}
checksum/hami-scheduler-newversion-config: {{ include (print $.Template.BasePath "/scheduler/configmapnew.yaml") . | sha256sum }}
{{- else }}
checksum/hami-scheduler-config: {{ include (print $.Template.BasePath "/scheduler/configmap.yaml") . | sha256sum }}
{{- end }}
checksum/hami-scheduler-device-config: {{ include (print $.Template.BasePath "/scheduler/device-configmap.yaml") . | sha256sum }}
{{- if .Values.scheduler.podAnnotations }}
annotations: {{ toYaml .Values.scheduler.podAnnotations | nindent 8 }}
{{- toYaml .Values.scheduler.podAnnotations | nindent 8 }}
{{- end }}
spec:
{{- include "hami-vgpu.imagePullSecrets" . | nindent 6}}
@ -53,7 +61,7 @@ spec:
{{- end }}
- --leader-elect={{ .Values.scheduler.leaderElect }}
- --leader-elect-resource-name={{ .Values.schedulerName }}
- --leader-elect-resource-namespace={{ .Release.Namespace }}
- --leader-elect-resource-namespace={{ include "hami-vgpu.namespace" . }}
resources:
{{- toYaml .Values.scheduler.kubeScheduler.resources | nindent 12 }}
volumeMounts:

View file

@ -2,6 +2,7 @@ apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "hami-vgpu.scheduler" . }}-device
namespace: {{ include "hami-vgpu.namespace" . }}
labels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.labels" . | nindent 4 }}
@ -17,13 +18,14 @@ data:
resourceCoreName: {{ .Values.resourceCores }}
resourcePriorityName: {{ .Values.resourcePriority }}
overwriteEnv: false
defaultMemory: 16000
defaultMemory: 0
defaultCores: 0
defaultGPUNum: 1
deviceSplitCount: {{ .Values.devicePlugin.deviceSplitCount }}
deviceMemoryScaling: {{ .Values.devicePlugin.deviceMemoryScaling }}
deviceCoreScaling: {{ .Values.devicePlugin.deviceCoreScaling }}
gpuCorePolicy: {{ .Values.devices.nvidia.gpuCorePolicy }}
runtimeClassName: "{{ .Values.devicePlugin.runtimeClassName }}"
knownMigGeometries:
- models: [ "A30" ]
allowedGeometries:
@ -91,10 +93,12 @@ data:
resourceCoreName: {{ .Values.dcuResourceCores }}
metax:
resourceCountName: "metax-tech.com/gpu"
resourceVCountName: {{ .Values.metaxResourceName }}
resourceVMemoryName: {{ .Values.metaxResourceMem }}
resourceVCoreName: {{ .Values.metaxResourceCore }}
enflame:
resourceCountName: "enflame.com/vgcu"
resourcePercentageName: "enflame.com/vgcu-percentage"
mthreads:
resourceCountName: "mthreads.com/vgpu"
resourceMemoryName: "mthreads.com/sgpu-memory"

View file

@ -1,3 +1,4 @@
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
@ -24,3 +25,4 @@ rules:
resourceNames:
- {{ include "hami-vgpu.fullname" . }}-admission
{{- end }}
{{- end }}

View file

@ -1,3 +1,4 @@
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
@ -15,4 +16,5 @@ roleRef:
subjects:
- kind: ServiceAccount
name: {{ include "hami-vgpu.fullname" . }}-admission
namespace: {{ .Release.Namespace | quote }}
namespace: {{ include "hami-vgpu.namespace" . }}
{{- end }}

View file

@ -1,7 +1,9 @@
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission-create
namespace: {{ include "hami-vgpu.namespace" . }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
@ -41,11 +43,11 @@ spec:
- --cert-name=tls.crt
- --key-name=tls.key
{{- if .Values.scheduler.admissionWebhook.customURL.enabled }}
- --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.admissionWebhook.customURL.host}}
- --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) (include "hami-vgpu.namespace" .) .Values.scheduler.admissionWebhook.customURL.host}}
{{- else }}
- --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) .Release.Namespace }}
- --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) (include "hami-vgpu.namespace" .) }}
{{- end }}
- --namespace={{ .Release.Namespace }}
- --namespace={{ include "hami-vgpu.namespace" . }}
- --secret-name={{ include "hami-vgpu.scheduler.tls" . }}
restartPolicy: OnFailure
serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission
@ -58,3 +60,4 @@ spec:
securityContext:
runAsNonRoot: true
runAsUser: {{ .Values.scheduler.patch.runAsUser }}
{{- end }}

View file

@ -1,7 +1,9 @@
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission-patch
namespace: {{ include "hami-vgpu.namespace" . }}
annotations:
"helm.sh/hook": post-install,post-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
@ -39,7 +41,7 @@ spec:
args:
- patch
- --webhook-name={{ include "hami-vgpu.scheduler.webhook" . }}
- --namespace={{ .Release.Namespace }}
- --namespace={{ include "hami-vgpu.namespace" . }}
- --patch-validating=false
- --secret-name={{ include "hami-vgpu.scheduler.tls" . }}
restartPolicy: OnFailure
@ -53,3 +55,4 @@ spec:
securityContext:
runAsNonRoot: true
runAsUser: {{ .Values.scheduler.patch.runAsUser }}
{{- end }}

View file

@ -1,3 +1,4 @@
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
{{- if .Values.podSecurityPolicy.enabled }}
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
@ -34,3 +35,4 @@ spec:
- secret
- downwardAPI
{{- end }}
{{- end }}

View file

@ -1,7 +1,9 @@
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission
namespace: {{ include "hami-vgpu.namespace" . }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
@ -16,3 +18,4 @@ rules:
verbs:
- get
- create
{{- end }}

View file

@ -1,7 +1,9 @@
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission
namespace: {{ include "hami-vgpu.namespace" . }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
@ -15,4 +17,5 @@ roleRef:
subjects:
- kind: ServiceAccount
name: {{ include "hami-vgpu.fullname" . }}-admission
namespace: {{ .Release.Namespace | quote }}
namespace: {{ include "hami-vgpu.namespace" . }}
{{- end }}

View file

@ -1,10 +1,13 @@
{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "hami-vgpu.fullname" . }}-admission
namespace: {{ include "hami-vgpu.namespace" . }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
labels:
{{- include "hami-vgpu.labels" . | nindent 4 }}
app.kubernetes.io/component: admission-webhook
{{- end }}

View file

@ -12,4 +12,4 @@ roleRef:
subjects:
- kind: ServiceAccount
name: {{ include "hami-vgpu.scheduler" . }}
namespace: {{ .Release.Namespace | quote }}
namespace: {{ include "hami-vgpu.namespace" . }}

View file

@ -2,6 +2,7 @@ apiVersion: v1
kind: Service
metadata:
name: {{ include "hami-vgpu.scheduler" . }}
namespace: {{ include "hami-vgpu.namespace" . }}
labels:
app.kubernetes.io/component: hami-scheduler
{{- include "hami-vgpu.labels" . | nindent 4 }}
@ -23,7 +24,7 @@ spec:
protocol: TCP
- name: monitor
port: {{ .Values.scheduler.service.monitorPort | default 31993 }} # Default monitoring port is 31993
targetPort: {{ .Values.scheduler.service.monitorTargetPort | default 31993 }}
targetPort: {{ .Values.scheduler.service.monitorTargetPort | default 9395 }}
{{- if eq (.Values.scheduler.service.type | default "NodePort") "NodePort" }} # If type is NodePort, set nodePort
nodePort: {{ .Values.scheduler.service.monitorPort | default 31993 }}
{{- end }}

View file

@ -2,7 +2,7 @@ apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "hami-vgpu.scheduler" . }}
namespace: {{ .Release.Namespace | quote }}
namespace: {{ include "hami-vgpu.namespace" . }}
labels:
app.kubernetes.io/component: "hami-scheduler"
{{- include "hami-vgpu.labels" . | nindent 4 }}

View file

@ -1,6 +1,10 @@
apiVersion: admissionregistration.k8s.io/v1
kind: MutatingWebhookConfiguration
metadata:
{{- if .Values.scheduler.certManager.enabled }}
annotations:
cert-manager.io/inject-ca-from: {{ include "hami-vgpu.namespace" . }}/{{ include "hami-vgpu.scheduler" . }}-serving-cert
{{- end }}
name: {{ include "hami-vgpu.scheduler.webhook" . }}
webhooks:
- admissionReviewVersions:
@ -11,7 +15,7 @@ webhooks:
{{- else }}
service:
name: {{ include "hami-vgpu.scheduler" . }}
namespace: {{ .Release.Namespace }}
namespace: {{ include "hami-vgpu.namespace" . }}
path: /webhook
port: {{ .Values.scheduler.service.httpPort }}
{{- end }}

View file

@ -2,8 +2,9 @@
nameOverride: ""
fullnameOverride: ""
namespaceOverride: ""
imagePullSecrets: []
version: "v2.5.2"
version: "v2.5.2-share-01"
# Nvidia GPU Parameters
resourceName: "nvidia.com/gpu"
@ -106,7 +107,7 @@ scheduler:
enabled: false
# must be an endpoint using https.
# should generate host certs here
host: 127.0.0.1 # hostname or IP; can be your node's IP if you want to use https://<nodeIP>:<schedulerPort>/<path>
host: 127.0.0.1 # hostname or IP; can be your node's IP if you want to use https://<nodeIP>:<schedulerPort>/<path>
port: 31998
path: /webhook
whitelistNamespaces:
@ -116,7 +117,14 @@ scheduler:
# - istio-system
reinvocationPolicy: Never
failurePolicy: Ignore
## TLS Certificate Option 1: Use cert-manager to generate self-signed certificate.
## If enabled, this always takes precedence over option 2.
certManager:
enabled: false
## TLS Certificate Option 2: Use kube-webhook-certgen to generate self-signed certificate.
## If true and certManager.enabled is false, Helm will automatically create a self-signed cert and secret for you.
patch:
enabled: true
image: jettech/kube-webhook-certgen:v1.5.2
imageNew: liangjw/kube-webhook-certgen:v1.1.1
imagePullPolicy: IfNotPresent
@ -130,6 +138,7 @@ scheduler:
httpPort: 443 # HTTP port
schedulerPort: 31998 # NodePort for HTTP
monitorPort: 31993 # Monitoring port
monitorTargetPort: 9395
labels: {}
annotations: {}
@ -138,25 +147,31 @@ devicePlugin:
monitorimage: "beclab/hami"
monitorctrPath: /usr/local/vgpu/containers
imagePullPolicy: IfNotPresent
deviceSplitCount: 100
deviceMemoryScaling: 100
deviceCoreScaling: 100
deviceSplitCount: 10
deviceMemoryScaling: 1
deviceCoreScaling: 1
# The runtime class name to be used by the device plugin, and added to the pod.spec.runtimeClassName of applications utilizing NVIDIA GPUs
runtimeClassName: ""
# Whether to create runtime class, name comes from runtimeClassName when it is set
createRuntimeClass: false
migStrategy: "none"
disablecorelimit: "false"
passDeviceSpecsEnabled: false
extraArgs:
- -v=4
service:
type: ClusterIP # Default type is NodePort, can be changed to ClusterIP
httpPort: 31992
labels: {}
annotations: {}
pluginPath: /var/lib/kubelet/device-plugins
libPath: /usr/local/vgpu
podAnnotations: {}
nvidianodeSelector:
gpu.bytetrade.io/cuda-supported: 'true'
gpu.bytetrade.io/cuda-supported: "true"
tolerations: []
# The updateStrategy for DevicePlugin DaemonSet.
# If you want to update the DaemonSet manually, set type to "OnDelete".
@ -189,6 +204,11 @@ devicePlugin:
# memory: 100Mi
devices:
enflame:
enabled: false
customresources:
- enflame.com/vgcu
- enflame.com/vgcu-percentage
mthreads:
enabled: false
customresources:
@ -271,13 +291,13 @@ dcgmExporter:
# Annotations to be added to dcgm-exporter pods
podAnnotations: {}
# Using this annotation which is required for prometheus scraping
# Using this annotation which is required for prometheus scraping
# prometheus.io/scrape: "true"
# prometheus.io/port: "9400"
# prometheus.io/port: "9400"
# The SecurityContext for the dcgm-exporter pods
podSecurityContext: {}
# fsGroup: 2000
# fsGroup: 2000
# The SecurityContext for the dcgm-exporter containers
securityContext:
@ -305,24 +325,24 @@ dcgmExporter:
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
# memory: 128Mi
serviceMonitor:
apiVersion: "monitoring.coreos.com/v1"
enabled: true
interval: 15s
honorLabels: false
additionalLabels: {}
# monitoring: prometheus
# monitoring: prometheus
relabelings: []
# - sourceLabels: [__meta_kubernetes_pod_node_name]
# separator: ;
# regex: ^(.*)$
# targetLabel: nodename
# replacement: $1
# action: replace
# action: replace
nodeSelector: {}
# node: gpu
# node: gpu
tolerations: []
# - operator: Exists
@ -333,7 +353,7 @@ dcgmExporter:
# nodeSelectorTerms:
# - matchExpressions:
# - key: nvidia-gpu
# operator: Exists
# operator: Exists
extraHostVolumes: []
# - name: host-binaries
@ -344,8 +364,8 @@ dcgmExporter:
configMap:
name: exporter-metrics-config-map
items:
- key: metrics
path: default-counters.csv
- key: metrics
path: default-counters.csv
extraVolumeMounts:
- name: exporter-metrics-volume
@ -391,9 +411,9 @@ dcgmExporter:
# Object containing <user>:<passwords> key-value pairs for each user that will have access via basic authentication
users: {}
# Customized list of metrics to emit. Expected to be in the same format (CSV) as the default list.
# Must be the complete list and is not additive. If unset, the default list will take effect.
# customMetrics: |
# Customized list of metrics to emit. Expected to be in the same format (CSV) as the default list.
# Must be the complete list and is not additive. If unset, the default list will take effect.
# customMetrics: |
# Format
# If line starts with a '#' it is considered a comment
# DCGM FIELD, Prometheus metric type, help message
@ -442,7 +462,7 @@ webui:
# drop:
# - ALL
# readOnlyRootFilesystem: true
# runAsNonRoot: true
# runAsNonRoot: true
# runAsUser: 1000
service:
@ -453,7 +473,7 @@ webui:
enabled: false
className: ""
annotations: {}
# kubernetes.io/ingress.class: nginx
# kubernetes.io/ingress.class: nginx
# kubernetes.io/tls-acme: "true"
hosts:
- host: chart-example.local

View file

@ -3,7 +3,7 @@ target: prebuilt
output:
containers:
-
name: beclab/hami:v2.5.2
name: beclab/hami:v2.5.2-share-01
-
name: projecthami/hami-webui-fe-oss:v1.0.5
-