Olares/cli/pkg/amdgpu/module.go
eball cf7125aac8
cli, daemon: enhance DGX Spark support and update GPU type handling (#2496)
* feat(gpu): enhance DGX Spark support and update GPU type handling

* feat(amdgpu): refactor AMD GPU detection and support for GB10 chip and APU

* feat(connector): enhance GB10 chip detection with environment variable support

* feat(gpu): enhance DGX Spark support and update GPU type handling

* feat(amdgpu): refactor AMD GPU detection and support for GB10 chip and APU

* feat(connector): enhance GB10 chip detection with environment variable support

* feat: add nvidia device plugin for gb10

* fix(gpu): update pod selector for hami-device-plugin based on GB10 chip detection

fix(deploy): bump app-service image version to 0.4.78

* feat: enable CGO for building on ARM architecture and adjust build constraints for Linux

* feat: enhance multi-architecture support for ARM64 in release workflow

* feat: update multi-arch setup for ARM64 in release workflow

* feat: enhance ARM64 multi-architecture support in release workflow

* feat: streamline ARM64 cross-compilation setup in release workflow

* feat: enhance ARM64 support by adding architecture-specific package installations

* feat: update ARM64 package sources in release workflow for improved compatibility

* feat: amd device plugin and container toolkit install

* refactor: remove GB10 chip type check from GPU info update

* feat(gpu): update hami version to v2.6.10-compatible for spark

* fix: remove gb10 device plugin checking

* fix: update klauspost/cpuid to v2.3.0

* fix: amd gpu check (#2522)

* feat: enhance storage device detection with USB serial properties

* feat: update hami version to v2.6.11-compatible-arm

* feat: add chip type support for AMD and NVIDIA GPUs in node label updates

* feat(gpu): supports auto binding GPU to app

* feat(gpu): remove chip type handling from GPU label updates

* feat(gpu): remove GPU type specification from DaemonSet and values.yaml

* feat(gpu): remove GB10 device plugin installation and related checks

* feat(gpu): update HAMi to v2.6.11

---------

Co-authored-by: dkeven <dkvvven@gmail.com>
Co-authored-by: hys <hysyeah@gmail.com>
2026-02-28 11:44:02 +08:00

121 lines
2.9 KiB
Go

package amdgpu
import (
"time"
"github.com/beclab/Olares/cli/pkg/common"
"github.com/beclab/Olares/cli/pkg/core/prepare"
"github.com/beclab/Olares/cli/pkg/core/task"
)
// InstallAmdContainerToolkitModule installs AMD container toolkit on supported Ubuntu if ROCm is installed.
type InstallAmdContainerToolkitModule struct {
common.KubeModule
Skip bool // conditional execution based on ROCm detection
SkipRocmCheck bool
}
func (m *InstallAmdContainerToolkitModule) IsSkip() bool {
return m.Skip
}
func (m *InstallAmdContainerToolkitModule) Init() {
m.Name = "InstallAmdContainerToolkit"
if m.IsSkip() {
return
}
prepareCollection := prepare.PrepareCollection{}
if !m.SkipRocmCheck {
prepareCollection = append(prepareCollection, new(RocmInstalled))
}
updateAmdSource := &task.RemoteTask{
Name: "UpdateAmdContainerToolkitSource",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Action: new(UpdateAmdContainerToolkitSource),
Prepare: &prepareCollection,
Parallel: false,
Retry: 1,
}
installAmdContainerToolkit := &task.RemoteTask{
Name: "InstallAmdContainerToolkit",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Prepare: &prepareCollection,
Action: new(InstallAmdContainerToolkit),
Parallel: false,
Retry: 1,
}
generateAndValidateCDI := &task.RemoteTask{
Name: "GenerateAndValidateAmdCDI",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Prepare: &prepareCollection,
Action: new(GenerateAndValidateAmdCDI),
Parallel: false,
Retry: 1,
}
m.Tasks = []task.Interface{
updateAmdSource,
installAmdContainerToolkit,
generateAndValidateCDI,
}
}
// InstallAmdPluginModule installs AMD GPU device plugin on Kubernetes.
type InstallAmdPluginModule struct {
common.KubeModule
Skip bool // conditional execution based on GPU enablement
}
func (m *InstallAmdPluginModule) IsSkip() bool {
return m.Skip
}
func (m *InstallAmdPluginModule) Init() {
m.Name = "InstallAmdPlugin"
// update node with AMD GPU labels
updateNode := &task.RemoteTask{
Name: "UpdateNodeAmdGPUInfo",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Prepare: &prepare.PrepareCollection{
new(common.OnlyFirstMaster),
},
Action: new(UpdateNodeAmdGPUInfo),
Parallel: false,
Retry: 1,
}
installPlugin := &task.RemoteTask{
Name: "InstallAmdPlugin",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Prepare: &prepare.PrepareCollection{
new(common.OnlyFirstMaster),
},
Action: new(InstallAmdPlugin),
Parallel: false,
Retry: 1,
}
checkGpuState := &task.RemoteTask{
Name: "CheckAmdGPUState",
Hosts: m.Runtime.GetHostsByRole(common.Master),
Prepare: &prepare.PrepareCollection{
new(common.OnlyFirstMaster),
new(RocmInstalled),
},
Action: new(CheckAmdGpuStatus),
Parallel: false,
Retry: 50,
Delay: 10 * time.Second,
}
m.Tasks = []task.Interface{
updateNode,
installPlugin,
checkGpuState,
}
}