mirror of
https://github.com/beclab/Olares
synced 2026-04-21 21:47:56 +00:00
* feat(gpu): enhance DGX Spark support and update GPU type handling * feat(amdgpu): refactor AMD GPU detection and support for GB10 chip and APU * feat(connector): enhance GB10 chip detection with environment variable support * feat(gpu): enhance DGX Spark support and update GPU type handling * feat(amdgpu): refactor AMD GPU detection and support for GB10 chip and APU * feat(connector): enhance GB10 chip detection with environment variable support * feat: add nvidia device plugin for gb10 * fix(gpu): update pod selector for hami-device-plugin based on GB10 chip detection fix(deploy): bump app-service image version to 0.4.78 * feat: enable CGO for building on ARM architecture and adjust build constraints for Linux * feat: enhance multi-architecture support for ARM64 in release workflow * feat: update multi-arch setup for ARM64 in release workflow * feat: enhance ARM64 multi-architecture support in release workflow * feat: streamline ARM64 cross-compilation setup in release workflow * feat: enhance ARM64 support by adding architecture-specific package installations * feat: update ARM64 package sources in release workflow for improved compatibility * feat: amd device plugin and container toolkit install * refactor: remove GB10 chip type check from GPU info update * feat(gpu): update hami version to v2.6.10-compatible for spark * fix: remove gb10 device plugin checking * fix: update klauspost/cpuid to v2.3.0 * fix: amd gpu check (#2522) * feat: enhance storage device detection with USB serial properties * feat: update hami version to v2.6.11-compatible-arm * feat: add chip type support for AMD and NVIDIA GPUs in node label updates * feat(gpu): supports auto binding GPU to app * feat(gpu): remove chip type handling from GPU label updates * feat(gpu): remove GPU type specification from DaemonSet and values.yaml * feat(gpu): remove GB10 device plugin installation and related checks * feat(gpu): update HAMi to v2.6.11 --------- Co-authored-by: dkeven <dkvvven@gmail.com> Co-authored-by: hys <hysyeah@gmail.com>
121 lines
2.9 KiB
Go
121 lines
2.9 KiB
Go
package amdgpu
|
|
|
|
import (
|
|
"time"
|
|
|
|
"github.com/beclab/Olares/cli/pkg/common"
|
|
"github.com/beclab/Olares/cli/pkg/core/prepare"
|
|
"github.com/beclab/Olares/cli/pkg/core/task"
|
|
)
|
|
|
|
// InstallAmdContainerToolkitModule installs AMD container toolkit on supported Ubuntu if ROCm is installed.
|
|
type InstallAmdContainerToolkitModule struct {
|
|
common.KubeModule
|
|
Skip bool // conditional execution based on ROCm detection
|
|
SkipRocmCheck bool
|
|
}
|
|
|
|
func (m *InstallAmdContainerToolkitModule) IsSkip() bool {
|
|
return m.Skip
|
|
}
|
|
|
|
func (m *InstallAmdContainerToolkitModule) Init() {
|
|
m.Name = "InstallAmdContainerToolkit"
|
|
if m.IsSkip() {
|
|
return
|
|
}
|
|
|
|
prepareCollection := prepare.PrepareCollection{}
|
|
if !m.SkipRocmCheck {
|
|
prepareCollection = append(prepareCollection, new(RocmInstalled))
|
|
}
|
|
|
|
updateAmdSource := &task.RemoteTask{
|
|
Name: "UpdateAmdContainerToolkitSource",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Action: new(UpdateAmdContainerToolkitSource),
|
|
Prepare: &prepareCollection,
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
installAmdContainerToolkit := &task.RemoteTask{
|
|
Name: "InstallAmdContainerToolkit",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepareCollection,
|
|
Action: new(InstallAmdContainerToolkit),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
generateAndValidateCDI := &task.RemoteTask{
|
|
Name: "GenerateAndValidateAmdCDI",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepareCollection,
|
|
Action: new(GenerateAndValidateAmdCDI),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
m.Tasks = []task.Interface{
|
|
updateAmdSource,
|
|
installAmdContainerToolkit,
|
|
generateAndValidateCDI,
|
|
}
|
|
}
|
|
|
|
// InstallAmdPluginModule installs AMD GPU device plugin on Kubernetes.
|
|
type InstallAmdPluginModule struct {
|
|
common.KubeModule
|
|
Skip bool // conditional execution based on GPU enablement
|
|
}
|
|
|
|
func (m *InstallAmdPluginModule) IsSkip() bool {
|
|
return m.Skip
|
|
}
|
|
|
|
func (m *InstallAmdPluginModule) Init() {
|
|
m.Name = "InstallAmdPlugin"
|
|
|
|
// update node with AMD GPU labels
|
|
updateNode := &task.RemoteTask{
|
|
Name: "UpdateNodeAmdGPUInfo",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
},
|
|
Action: new(UpdateNodeAmdGPUInfo),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
installPlugin := &task.RemoteTask{
|
|
Name: "InstallAmdPlugin",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
},
|
|
Action: new(InstallAmdPlugin),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
checkGpuState := &task.RemoteTask{
|
|
Name: "CheckAmdGPUState",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
new(RocmInstalled),
|
|
},
|
|
Action: new(CheckAmdGpuStatus),
|
|
Parallel: false,
|
|
Retry: 50,
|
|
Delay: 10 * time.Second,
|
|
}
|
|
|
|
m.Tasks = []task.Interface{
|
|
updateNode,
|
|
installPlugin,
|
|
checkGpuState,
|
|
}
|
|
}
|