feat: schedule/allocate pod by gpu bindings and different share modes (#1363)

2026-05-24 09:18:23 +00:00 · 2025-05-29 20:24:53 +08:00 · 2025-05-29 20:24:53 +08:00 · 0c5a80653e
commit 0c5a80653e
parent e58743fa87
29 changed files with 285 additions and 54 deletions
--- a/framework/gpu/.olares/config/gpu/hami/Chart.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/Chart.yaml
@ -1,6 +1,6 @@
 apiVersion: v2
 name: hami
-version: 2.5.0
+version: 2.5.1
 kubeVersion: ">= 1.16.0"
 description: Heterogeneous AI Computing Virtualization Middleware
 keywords:
@ -12,4 +12,4 @@ maintainers:
    email: limengxuan@4paradigm.com
  - name: zhangxiao
    email: xiaozhang0210@hotmail.com
-appVersion: "2.5.0"
+appVersion: "2.5.1"
--- a/framework/gpu/.olares/config/gpu/hami/crds/gpu.bytetrade.io_gpubindings.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/crds/gpu.bytetrade.io_gpubindings.yaml
@ -0,0 +1,103 @@
+---
+# CustomResourceDefinition for GPUBinding (gpubindings.gpu.bytetrade.io).
+# The controller-gen annotation below indicates this manifest is generated
+# (controller-gen v0.17.2); hand edits are presumably overwritten when it is
+# regenerated — confirm against the Go API types before changing it here.
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  annotations:
+    controller-gen.kubebuilder.io/version: v0.17.2
+  name: gpubindings.gpu.bytetrade.io
+spec:
+  group: gpu.bytetrade.io
+  names:
+    kind: GPUBinding
+    listKind: GPUBindingList
+    plural: gpubindings
+    singular: gpubinding
+  # GPUBinding objects are cluster-scoped (not namespaced).
+  scope: Cluster
+  versions:
+  # Only one version is declared; it is both served and the storage version.
+  - name: v1alpha1
+    schema:
+      openAPIV3Schema:
+        properties:
+          apiVersion:
+            description: |-
+              APIVersion defines the versioned schema of this representation of an object.
+              Servers should convert recognized schemas to the latest internal value, and
+              may reject unrecognized values.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources
+            type: string
+          kind:
+            description: |-
+              Kind is a string value representing the REST resource this object represents.
+              Servers may infer this from the endpoint the client submits requests to.
+              Cannot be updated.
+              In CamelCase.
+              More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds
+            type: string
+          metadata:
+            type: object
+          # spec holds the binding itself; appName and uuid are the only required fields.
+          spec:
+            properties:
+              # NOTE(review): presumably the application this binding belongs to — confirm.
+              appName:
+                type: string
+              # Optional int-or-string value. The string form must match the
+              # quantity-style pattern below: optional sign, decimal number, and an
+              # optional suffix (Ki/Mi/Gi/..., n/u/m/k/M/G/..., or an exponent).
+              # NOTE(review): presumably the GPU memory granted by the binding — confirm.
+              memory:
+                anyOf:
+                - type: integer
+                - type: string
+                pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
+                x-kubernetes-int-or-string: true
+              # Optional label selector (matchLabels / matchExpressions), handled as an
+              # atomic map (see x-kubernetes-map-type at the end of this property).
+              podSelector:
+                description: |-
+                  A label selector is a label query over a set of resources. The result of matchLabels and
+                  matchExpressions are ANDed. An empty label selector matches all objects. A null
+                  label selector matches no objects.
+                properties:
+                  matchExpressions:
+                    description: matchExpressions is a list of label selector requirements.
+                      The requirements are ANDed.
+                    items:
+                      description: |-
+                        A label selector requirement is a selector that contains values, a key, and an operator that
+                        relates the key and values.
+                      properties:
+                        key:
+                          description: key is the label key that the selector applies
+                            to.
+                          type: string
+                        operator:
+                          description: |-
+                            operator represents a key's relationship to a set of values.
+                            Valid operators are In, NotIn, Exists and DoesNotExist.
+                          type: string
+                        values:
+                          description: |-
+                            values is an array of string values. If the operator is In or NotIn,
+                            the values array must be non-empty. If the operator is Exists or DoesNotExist,
+                            the values array must be empty. This array is replaced during a strategic
+                            merge patch.
+                          items:
+                            type: string
+                          type: array
+                      required:
+                      - key
+                      - operator
+                      type: object
+                    type: array
+                  matchLabels:
+                    additionalProperties:
+                      type: string
+                    description: |-
+                      matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
+                      map is equivalent to an element of matchExpressions, whose key field is "key", the
+                      operator is "In", and the values array contains only "value". The requirements are ANDed.
+                    type: object
+                type: object
+                x-kubernetes-map-type: atomic
+              # NOTE(review): presumably the UUID of the GPU device being bound — confirm.
+              uuid:
+                type: string
+            required:
+            - appName
+            - uuid
+            type: object
+        type: object
+    served: true
+    storage: true
--- a/framework/gpu/.olares/config/gpu/hami/templates/_helpers.tpl
+++ b/framework/gpu/.olares/config/gpu/hami/templates/_helpers.tpl
@ -23,6 +23,17 @@ If release name contains chart name it will be used as a full name.
 {{- end -}}
 {{- end -}}

+{{/*
+Namespace that the chart's namespaced resources are rendered into.
+Resolves to .Values.namespaceOverride when it is set to a non-empty value and
+falls back to .Release.Namespace otherwise, so the chart can target its own
+namespace when it is bundled into a combined (umbrella) chart.
+*/}}
+{{- define "hami-vgpu.namespace" -}}
+{{- $releaseNS := .Release.Namespace -}}
+{{- default $releaseNS .Values.namespaceOverride -}}
+{{- end -}}
+
 {{/*
 The app name for Scheduler
 */}}
--- a/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/configmap.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/configmap.yaml
@ -2,6 +2,7 @@ apiVersion: v1
 kind: ConfigMap
 metadata:
  name: {{ include "hami-vgpu.device-plugin" . }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
  labels:
    app.kubernetes.io/component: hami-device-plugin
    {{- include "hami-vgpu.labels" . | nindent 4 }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/daemonsetnvidia.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/daemonsetnvidia.yaml
@ -2,6 +2,7 @@ apiVersion: apps/v1
 kind: DaemonSet
 metadata:
  name: {{ include "hami-vgpu.device-plugin" . }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
  labels:
    app.kubernetes.io/component: hami-device-plugin
    {{- include "hami-vgpu.labels" . | nindent 4 }}
@ -26,8 +27,11 @@ spec:
        app.kubernetes.io/component: hami-device-plugin
        hami.io/webhook: ignore
        {{- include "hami-vgpu.selectorLabels" . | nindent 8 }}
+      annotations:
+        # Hashes of the rendered ConfigMap templates: the pod template changes
+        # whenever the rendered content of either ConfigMap changes.
+        checksum/hami-device-plugin-config: {{ include (print $.Template.BasePath "/device-plugin/configmap.yaml") . | sha256sum }}
+        checksum/hami-scheduler-device-config: {{ include (print $.Template.BasePath "/scheduler/device-configmap.yaml") . | sha256sum }}
      {{- if .Values.devicePlugin.podAnnotations }}
-      annotations: {{ toYaml .Values.devicePlugin.podAnnotations | nindent 8 }}
+        {{- toYaml .Values.devicePlugin.podAnnotations | nindent 8 }}
      {{- end }}
    spec:
      {{- if .Values.devicePlugin.runtimeClassName }}
@ -112,12 +116,12 @@ spec:
            - name: NVIDIA_MIG_MONITOR_DEVICES
              value: all
            - name: HOOK_PATH
-              value: '{{ .Values.global.gpuHookPath }}/vgpu'
+              value: {{ .Values.global.gpuHookPath }}/vgpu
          resources:
          {{- toYaml .Values.devicePlugin.vgpuMonitor.resources | nindent 12 }}
          volumeMounts:
            - name: ctrs
-              mountPath: '{{ .Values.devicePlugin.monitorctrPath }}'
+              mountPath: {{ .Values.devicePlugin.monitorctrPath }}
            - name: dockers
              mountPath: /run/docker
            - name: containerds
@ -131,7 +135,7 @@ spec:
      volumes:
        - name: ctrs
          hostPath:
-            path: '{{ .Values.devicePlugin.monitorctrPath }}'
+            path: {{ .Values.devicePlugin.monitorctrPath }}
        - name: hosttmp
          hostPath:
            path: /tmp
@ -143,10 +147,10 @@ spec:
            path: /run/containerd
        - name: device-plugin
          hostPath:
-            path: '{{ .Values.devicePlugin.pluginPath }}'
+            path: {{ .Values.devicePlugin.pluginPath }}
        - name: lib
          hostPath:
-            path: '{{ .Values.devicePlugin.libPath }}'
+            path: {{ .Values.devicePlugin.libPath }}
        - name: usrbin
          hostPath:
            path: /usr/bin
--- a/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/monitorrolebinding.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/monitorrolebinding.yaml
@ -13,4 +13,4 @@ roleRef:
 subjects:
  - kind: ServiceAccount
    name: {{ include "hami-vgpu.device-plugin" . }}
-    namespace: {{ .Release.Namespace | quote }}
+    namespace: {{ include "hami-vgpu.namespace" . }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/monitorservice.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/monitorservice.yaml
@ -2,6 +2,7 @@ apiVersion: v1
 kind: Service
 metadata:
  name: {{ include "hami-vgpu.device-plugin" . }}-monitor
+  namespace: {{ include "hami-vgpu.namespace" . }}
  labels:
    app.kubernetes.io/component: hami-device-plugin
    {{- include "hami-vgpu.labels" . | nindent 4 }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/monitorserviceaccount.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/monitorserviceaccount.yaml
@ -2,7 +2,7 @@ apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: {{ include "hami-vgpu.device-plugin" . }}
-  namespace: {{ .Release.Namespace | quote }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
  labels:
    app.kubernetes.io/component: "hami-device-plugin"
    {{- include "hami-vgpu.labels" . | nindent 4 }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/runtime-class.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/device-plugin/runtime-class.yaml
@ -0,0 +1,9 @@
+{{/*
+Optional RuntimeClass, rendered only when both devicePlugin.createRuntimeClass
+and devicePlugin.runtimeClassName are set. It is installed as a Helm
+pre-install/pre-upgrade hook and maps the configured name to the "nvidia" handler.
+The name is quoted so that a numeric or boolean-looking value stays a string.
+*/}}
+{{- if and .Values.devicePlugin.createRuntimeClass .Values.devicePlugin.runtimeClassName }}
+apiVersion: node.k8s.io/v1
+kind: RuntimeClass
+metadata:
+  name: {{ .Values.devicePlugin.runtimeClassName | quote }}
+  annotations:
+    helm.sh/hook: pre-install,pre-upgrade
+handler: nvidia
+{{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/gpu-scheduler/service.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/gpu-scheduler/service.yaml
@ -11,12 +11,5 @@ spec:
      targetPort: 6000
  selector:
    name: gpu-scheduler
-  clusterIP: None
-  clusterIPs:
-    - None
  type: ClusterIP
-  sessionAffinity: None
-  ipFamilies:
-    - IPv4
-  ipFamilyPolicy: SingleStack
-  internalTrafficPolicy: Cluster
+  internalTrafficPolicy: Local
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/certmanager.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/certmanager.yaml
@ -0,0 +1,29 @@
+{{/*
+cert-manager resources for the scheduler's serving certificate, rendered only
+when scheduler.certManager.enabled is set: a Certificate covering the
+scheduler's in-cluster service DNS names, and the self-signed Issuer it references.
+*/}}
+{{- if .Values.scheduler.certManager.enabled }}
+{{- $schedulerName := include "hami-vgpu.scheduler" . }}
+{{- $targetNamespace := include "hami-vgpu.namespace" . }}
+{{- $issuerName := printf "%s-selfsigned-issuer" $schedulerName }}
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: {{ $schedulerName }}-serving-cert
+  namespace: {{ $targetNamespace }}
+  labels:
+    app.kubernetes.io/component: hami-scheduler
+    {{- include "hami-vgpu.labels" . | nindent 4 }}
+spec:
+  secretName: {{ include "hami-vgpu.scheduler.tls" . }}
+  issuerRef:
+    kind: Issuer
+    name: {{ $issuerName }}
+  dnsNames:
+    {{- range $suffix := list "svc" "svc.cluster.local" }}
+    - {{ printf "%s.%s.%s" $schedulerName $targetNamespace $suffix }}
+    {{- end }}
+---
+apiVersion: cert-manager.io/v1
+kind: Issuer
+metadata:
+  name: {{ $issuerName }}
+  namespace: {{ $targetNamespace }}
+  labels:
+    app.kubernetes.io/component: hami-scheduler
+    {{- include "hami-vgpu.labels" . | nindent 4 }}
+spec:
+  selfSigned: {}
+{{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/configmap.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/configmap.yaml
@ -3,6 +3,7 @@ apiVersion: v1
 kind: ConfigMap
 metadata:
  name: {{ include "hami-vgpu.scheduler" . }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
@ -40,6 +41,14 @@ data:
                    },
                    {{- end }}
                    {{- end }}
+                    {{- if .Values.devices.enflame.enabled }}
+                    {{- range .Values.devices.enflame.customresources }}
+                    {
+                      "name": "{{ . }}",
+                      "ignoredByScheduler": true
+                    },
+                    {{- end }}
+                    {{- end }}
                    {
                        "name": "{{ .Values.resourceName }}",
                        "ignoredByScheduler": true
@ -80,6 +89,10 @@ data:
                        "name": "{{ .Values.iluvatarResourceName }}",
                        "ignoredByScheduler": true
                    },
+                    {
+                        "name": "metax-tech.com/gpu",
+                        "ignoredByScheduler": true
+                    },
                    {
                        "name": "{{ .Values.metaxResourceName }}",
                        "ignoredByScheduler": true
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/configmapnew.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/configmapnew.yaml
@ -3,6 +3,7 @@ apiVersion: v1
 kind: ConfigMap
 metadata:
  name: {{ include "hami-vgpu.scheduler" . }}-newversion
+  namespace: {{ include "hami-vgpu.namespace" . }}
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
@ -49,6 +50,8 @@ data:
        ignoredByScheduler: true
      - name: {{ .Values.iluvatarResourceName }}
        ignoredByScheduler: true
+      - name: "metax-tech.com/gpu"
+        ignoredByScheduler: true
      - name: {{ .Values.metaxResourceName }}
        ignoredByScheduler: true
      - name: {{ .Values.metaxResourceCore }}
@ -67,4 +70,10 @@ data:
        ignoredByScheduler: true
      {{- end }}
      {{- end }}
+      {{- if .Values.devices.enflame.enabled }}
+      {{- range .Values.devices.enflame.customresources }}
+      - name: {{ . }}
+        ignoredByScheduler: true
+      {{- end }}
+      {{- end }}
 {{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/deployment.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/deployment.yaml
@ -2,6 +2,7 @@ apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: {{ include "hami-vgpu.scheduler" . }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
@ -27,8 +28,15 @@ spec:
        app.kubernetes.io/component: hami-scheduler
        {{- include "hami-vgpu.selectorLabels" . | nindent 8 }}
        hami.io/webhook: ignore
+      annotations:
+        {{- if ge (regexReplaceAll "[^0-9]" .Capabilities.KubeVersion.Minor "" | int) 22 }}
+        checksum/hami-scheduler-newversion-config: {{ include (print $.Template.BasePath "/scheduler/configmapnew.yaml") . | sha256sum }}
+        {{- else }}
+        checksum/hami-scheduler-config: {{ include (print $.Template.BasePath "/scheduler/configmap.yaml") . | sha256sum }}
+        {{- end }}
+        checksum/hami-scheduler-device-config: {{ include (print $.Template.BasePath "/scheduler/device-configmap.yaml") . | sha256sum }}
      {{- if .Values.scheduler.podAnnotations }}
-      annotations: {{ toYaml .Values.scheduler.podAnnotations | nindent 8 }}
+        {{- toYaml .Values.scheduler.podAnnotations | nindent 8 }}
      {{- end }}
    spec:
      {{- include "hami-vgpu.imagePullSecrets" . | nindent 6}}
@ -53,7 +61,7 @@ spec:
            {{- end }}
            - --leader-elect={{ .Values.scheduler.leaderElect }}
            - --leader-elect-resource-name={{ .Values.schedulerName }}
-            - --leader-elect-resource-namespace={{ .Release.Namespace }}
+            - --leader-elect-resource-namespace={{ include "hami-vgpu.namespace" . }}
          resources:
          {{- toYaml .Values.scheduler.kubeScheduler.resources | nindent 12 }}
          volumeMounts:
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/device-configmap.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/device-configmap.yaml
@ -2,6 +2,7 @@ apiVersion: v1
 kind: ConfigMap
 metadata:
  name: {{ include "hami-vgpu.scheduler" . }}-device
+  namespace: {{ include "hami-vgpu.namespace" . }}
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
@ -17,13 +18,14 @@ data:
      resourceCoreName: {{ .Values.resourceCores }}
      resourcePriorityName: {{ .Values.resourcePriority }}
      overwriteEnv: false
-      defaultMemory: 16000
+      defaultMemory: 0
      defaultCores: 0
      defaultGPUNum: 1
      deviceSplitCount: {{ .Values.devicePlugin.deviceSplitCount }}
      deviceMemoryScaling: {{ .Values.devicePlugin.deviceMemoryScaling }}
      deviceCoreScaling: {{ .Values.devicePlugin.deviceCoreScaling }}
      gpuCorePolicy: {{ .Values.devices.nvidia.gpuCorePolicy }}
+      runtimeClassName: "{{ .Values.devicePlugin.runtimeClassName }}"
      knownMigGeometries:
      - models: [ "A30" ]
        allowedGeometries:
@ -91,10 +93,12 @@ data:
      resourceCoreName: {{ .Values.dcuResourceCores }}
    metax:
      resourceCountName: "metax-tech.com/gpu"
-
      resourceVCountName: {{ .Values.metaxResourceName }}
      resourceVMemoryName: {{ .Values.metaxResourceMem }}
      resourceVCoreName: {{ .Values.metaxResourceCore }}
+    enflame:
+      resourceCountName: "enflame.com/vgcu"
+      resourcePercentageName: "enflame.com/vgcu-percentage"
    mthreads:
      resourceCountName: "mthreads.com/vgpu"
      resourceMemoryName: "mthreads.com/sgpu-memory"
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/clusterrole.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/clusterrole.yaml
@ -1,3 +1,4 @@
+{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
@ -24,3 +25,4 @@ rules:
    resourceNames:
    - {{ include "hami-vgpu.fullname" . }}-admission
 {{- end }}
+{{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/clusterrolebinding.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/clusterrolebinding.yaml
@ -1,3 +1,4 @@
+{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
@ -15,4 +16,5 @@ roleRef:
 subjects:
  - kind: ServiceAccount
    name: {{ include "hami-vgpu.fullname" . }}-admission
-    namespace: {{ .Release.Namespace | quote }}
+    namespace: {{ include "hami-vgpu.namespace" . }}
+{{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/job-createSecret.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/job-createSecret.yaml
@ -1,7 +1,9 @@
+{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission-create
+  namespace: {{ include "hami-vgpu.namespace" . }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
@ -41,11 +43,11 @@ spec:
            - --cert-name=tls.crt
            - --key-name=tls.key
            {{- if .Values.scheduler.admissionWebhook.customURL.enabled }}
-            - --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) .Release.Namespace .Values.scheduler.admissionWebhook.customURL.host}}
+            - --host={{ printf "%s.%s.svc,127.0.0.1,%s" (include "hami-vgpu.scheduler" .) (include "hami-vgpu.namespace" .) .Values.scheduler.admissionWebhook.customURL.host}}
            {{- else }}
-            - --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) .Release.Namespace }}
+            - --host={{ printf "%s.%s.svc,127.0.0.1" (include "hami-vgpu.scheduler" .) (include "hami-vgpu.namespace" .) }}
            {{- end }}
-            - --namespace={{ .Release.Namespace }}
+            - --namespace={{ include "hami-vgpu.namespace" . }}
            - --secret-name={{ include "hami-vgpu.scheduler.tls" . }}
      restartPolicy: OnFailure
      serviceAccountName: {{ include "hami-vgpu.fullname" . }}-admission
@ -58,3 +60,4 @@ spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: {{ .Values.scheduler.patch.runAsUser }}
+{{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/job-patchWebhook.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/job-patchWebhook.yaml
@ -1,7 +1,9 @@
+{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission-patch
+  namespace: {{ include "hami-vgpu.namespace" . }}
  annotations:
    "helm.sh/hook": post-install,post-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
@ -39,7 +41,7 @@ spec:
          args:
            - patch
            - --webhook-name={{ include "hami-vgpu.scheduler.webhook" . }}
-            - --namespace={{ .Release.Namespace }}
+            - --namespace={{ include "hami-vgpu.namespace" . }}
            - --patch-validating=false
            - --secret-name={{ include "hami-vgpu.scheduler.tls" . }}
      restartPolicy: OnFailure
@ -53,3 +55,4 @@ spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: {{ .Values.scheduler.patch.runAsUser }}
+{{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/psp.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/psp.yaml
@ -1,3 +1,4 @@
+{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
 {{- if .Values.podSecurityPolicy.enabled }}
 apiVersion: policy/v1beta1
 kind: PodSecurityPolicy
@ -34,3 +35,4 @@ spec:
  - secret
  - downwardAPI
 {{- end }}
+{{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/role.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/role.yaml
@ -1,7 +1,9 @@
+{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
 apiVersion: rbac.authorization.k8s.io/v1
 kind: Role
 metadata:
  name:  {{ include "hami-vgpu.fullname" . }}-admission
+  namespace: {{ include "hami-vgpu.namespace" . }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
@ -16,3 +18,4 @@ rules:
    verbs:
      - get
      - create
+{{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/rolebinding.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/rolebinding.yaml
@ -1,7 +1,9 @@
+{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
 metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission
+  namespace: {{ include "hami-vgpu.namespace" . }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
@ -15,4 +17,5 @@ roleRef:
 subjects:
  - kind: ServiceAccount
    name: {{ include "hami-vgpu.fullname" . }}-admission
-    namespace: {{ .Release.Namespace | quote }}
+    namespace: {{ include "hami-vgpu.namespace" . }}
+{{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/serviceaccount.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/job-patch/serviceaccount.yaml
@ -1,10 +1,13 @@
+{{- if and (.Values.scheduler.patch.enabled) (not .Values.scheduler.certManager.enabled) }}
 apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: {{ include "hami-vgpu.fullname" . }}-admission
+  namespace: {{ include "hami-vgpu.namespace" . }}
  annotations:
    "helm.sh/hook": pre-install,pre-upgrade,post-install,post-upgrade
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
  labels:
    {{- include "hami-vgpu.labels" . | nindent 4 }}
    app.kubernetes.io/component: admission-webhook
+{{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/rolebinding.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/rolebinding.yaml
@ -12,4 +12,4 @@ roleRef:
 subjects:
  - kind: ServiceAccount
    name: {{ include "hami-vgpu.scheduler" . }}
-    namespace: {{ .Release.Namespace | quote }}
+    namespace: {{ include "hami-vgpu.namespace" . }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/service.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/service.yaml
@ -2,6 +2,7 @@ apiVersion: v1
 kind: Service
 metadata:
  name: {{ include "hami-vgpu.scheduler" . }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
  labels:
    app.kubernetes.io/component: hami-scheduler
    {{- include "hami-vgpu.labels" . | nindent 4 }}
@ -23,7 +24,7 @@ spec:
      protocol: TCP
    - name: monitor
      port: {{ .Values.scheduler.service.monitorPort | default 31993 }}  # Default monitoring port is 31993
-      targetPort: {{ .Values.scheduler.service.monitorTargetPort | default 31993 }}
+      targetPort: {{ .Values.scheduler.service.monitorTargetPort | default 9395 }}
      {{- if eq (.Values.scheduler.service.type | default "NodePort") "NodePort" }}  # If type is NodePort, set nodePort
      nodePort: {{ .Values.scheduler.service.monitorPort | default 31993 }}
      {{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/serviceaccount.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/serviceaccount.yaml
@ -2,7 +2,7 @@ apiVersion: v1
 kind: ServiceAccount
 metadata:
  name: {{ include "hami-vgpu.scheduler" . }}
-  namespace: {{ .Release.Namespace | quote }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
  labels:
    app.kubernetes.io/component: "hami-scheduler"
    {{- include "hami-vgpu.labels" . | nindent 4 }}
--- a/framework/gpu/.olares/config/gpu/hami/templates/scheduler/webhook.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/templates/scheduler/webhook.yaml
@ -1,6 +1,10 @@
 apiVersion: admissionregistration.k8s.io/v1
 kind: MutatingWebhookConfiguration
 metadata:
+  {{- if .Values.scheduler.certManager.enabled }}
+  annotations:
+    cert-manager.io/inject-ca-from: {{ include "hami-vgpu.namespace" . }}/{{ include "hami-vgpu.scheduler" . }}-serving-cert
+  {{- end }}
  name: {{ include "hami-vgpu.scheduler.webhook" . }}
 webhooks:
  - admissionReviewVersions:
@ -11,7 +15,7 @@ webhooks:
      {{- else }}
      service:
        name: {{ include "hami-vgpu.scheduler" . }}
-        namespace: {{ .Release.Namespace }}
+        namespace: {{ include "hami-vgpu.namespace" . }}
        path: /webhook
        port: {{ .Values.scheduler.service.httpPort }}
      {{- end }}
--- a/framework/gpu/.olares/config/gpu/hami/values.yaml
+++ b/framework/gpu/.olares/config/gpu/hami/values.yaml
@ -2,8 +2,9 @@

 nameOverride: ""
 fullnameOverride: ""
+namespaceOverride: ""
 imagePullSecrets: []
-version: "v2.5.2"
+version: "v2.5.2-share-01"

 # Nvidia GPU Parameters
 resourceName: "nvidia.com/gpu"
@ -106,7 +107,7 @@ scheduler:
      enabled: false
      # must be an endpoint using https.
      # should generate host certs here
-      host: 127.0.0.1          # hostname or ip, can be your node'IP if you want to use https://<nodeIP>:<schedulerPort>/<path>
+      host: 127.0.0.1  # hostname or IP; can be your node's IP if you want to use https://<nodeIP>:<schedulerPort>/<path>
      port: 31998
      path: /webhook
    whitelistNamespaces:
@ -116,7 +117,14 @@ scheduler:
      # - istio-system
    reinvocationPolicy: Never
    failurePolicy: Ignore
+  ## TLS Certificate Option 1: Use cert-manager to generate self-signed certificate.
+  ## If enabled, it always takes precedence over option 2.
+  certManager:
+    enabled: false
+  ## TLS Certificate Option 2: Use kube-webhook-certgen to generate self-signed certificate.
+  ## If patch.enabled is true and certManager.enabled is false, Helm will automatically create a self-signed cert and secret for you.
  patch:
+    enabled: true
    image: jettech/kube-webhook-certgen:v1.5.2
    imageNew: liangjw/kube-webhook-certgen:v1.1.1
    imagePullPolicy: IfNotPresent
@ -130,6 +138,7 @@ scheduler:
    httpPort: 443   # HTTP port
    schedulerPort: 31998  # NodePort for HTTP
    monitorPort: 31993    # Monitoring port
+    monitorTargetPort: 9395
    labels: {}
    annotations: {}

@ -138,25 +147,31 @@ devicePlugin:
  monitorimage: "beclab/hami"
  monitorctrPath: /usr/local/vgpu/containers
  imagePullPolicy: IfNotPresent
-  deviceSplitCount: 100
-  deviceMemoryScaling: 100
-  deviceCoreScaling: 100
+  deviceSplitCount: 10
+  deviceMemoryScaling: 1
+  deviceCoreScaling: 1
+  # The runtime class name to be used by the device plugin, and added to the pod.spec.runtimeClassName of applications utilizing NVIDIA GPUs
  runtimeClassName: ""
+  # Whether to create the RuntimeClass. It is only created when runtimeClassName is also set, and it takes its name from runtimeClassName.
+  createRuntimeClass: false
  migStrategy: "none"
  disablecorelimit: "false"
  passDeviceSpecsEnabled: false
  extraArgs:
    - -v=4
+
  service:
    type: ClusterIP  # Default type is NodePort, can be changed to ClusterIP
    httpPort: 31992
    labels: {}
    annotations: {}
+
  pluginPath: /var/lib/kubelet/device-plugins
  libPath: /usr/local/vgpu
+
  podAnnotations: {}
  nvidianodeSelector:
-    gpu.bytetrade.io/cuda-supported: 'true'
+    gpu.bytetrade.io/cuda-supported: "true"
  tolerations: []
  # The updateStrategy for DevicePlugin DaemonSet.
  # If you want to update the DaemonSet by manual, set type as "OnDelete".
@ -189,6 +204,11 @@ devicePlugin:
 #        memory: 100Mi

 devices:
+  enflame:
+    enabled: false
+    customresources:
+      - enflame.com/vgcu
+      - enflame.com/vgcu-percentage
  mthreads:
    enabled: false
    customresources:
@ -271,13 +291,13 @@ dcgmExporter:

  # Annotations to be added to dcgm-exporter pods
  podAnnotations: {}
-  # Using this annotation which is required for prometheus scraping
+    # Using this annotation which is required for prometheus scraping
    # prometheus.io/scrape: "true"
-    # prometheus.io/port: "9400"
+    # prometheus.io/port: "9400"

  # The SecurityContext for the dcgm-exporter pods
  podSecurityContext: {}
-    # fsGroup: 2000
+  # fsGroup: 2000

  # The SecurityContext for the dcgm-exporter containers
  securityContext:
@ -305,24 +325,24 @@ dcgmExporter:
    #   memory: 128Mi
    # requests:
    #   cpu: 100m
-    #   memory: 128Mi
+    #   memory: 128Mi
  serviceMonitor:
    apiVersion: "monitoring.coreos.com/v1"
    enabled: true
    interval: 15s
    honorLabels: false
    additionalLabels: {}
-      # monitoring: prometheus
+    # monitoring: prometheus
    relabelings: []
      # - sourceLabels: [__meta_kubernetes_pod_node_name]
      #   separator: ;
      #   regex: ^(.*)$
      #   targetLabel: nodename
      #   replacement: $1
-      #   action: replace
+      #   action: replace

  nodeSelector: {}
-    # node: gpu
+  # node: gpu

  tolerations: []
  # - operator: Exists
@ -333,7 +353,7 @@ dcgmExporter:
    #    nodeSelectorTerms:
    #    - matchExpressions:
    #      - key: nvidia-gpu
-    #        operator: Exists
+    #        operator: Exists

  extraHostVolumes: []
  # - name: host-binaries
@ -344,8 +364,8 @@ dcgmExporter:
      configMap:
        name: exporter-metrics-config-map
        items:
-        - key: metrics
-          path: default-counters.csv
+          - key: metrics
+            path: default-counters.csv

  extraVolumeMounts:
    - name: exporter-metrics-volume
@ -391,9 +411,9 @@ dcgmExporter:
    # Object containing <user>:<passwords> key-value pairs for each user that will have access via basic authentication
    users: {}

-  # Customized list of metrics to emit. Expected to be in the same format (CSV) as the default list.
-  # Must be the complete list and is not additive. If unset, the default list will take effect.
-  # customMetrics: |
+  # Customized list of metrics to emit. Expected to be in the same format (CSV) as the default list.
+  # Must be the complete list and is not additive. If unset, the default list will take effect.
+  # customMetrics: |
    # Format
    # If line starts with a '#' it is considered a comment
    # DCGM FIELD, Prometheus metric type, help message
@ -442,7 +462,7 @@ webui:
    #   drop:
    #   - ALL
    # readOnlyRootFilesystem: true
-    # runAsNonRoot: true
+  # runAsNonRoot: true
  # runAsUser: 1000

  service:
@ -453,7 +473,7 @@ webui:
    enabled: false
    className: ""
    annotations: {}
-      # kubernetes.io/ingress.class: nginx
+    # kubernetes.io/ingress.class: nginx
    # kubernetes.io/tls-acme: "true"
    hosts:
      - host: chart-example.local
--- a/platform/hami/.olares/Olares.yaml
+++ b/platform/hami/.olares/Olares.yaml
@ -3,7 +3,7 @@ target: prebuilt
 output:
  containers:
    - 
-      name: beclab/hami:v2.5.2
+      name: beclab/hami:v2.5.2-share-01
    - 
      name: projecthami/hami-webui-fe-oss:v1.0.5
    -