mirror of
https://github.com/beclab/Olares
synced 2026-04-21 21:47:56 +00:00
349 lines
7.4 KiB
Go
349 lines
7.4 KiB
Go
package gpu
|
|
|
|
import (
|
|
"time"
|
|
|
|
"github.com/beclab/Olares/cli/pkg/container"
|
|
|
|
"github.com/beclab/Olares/cli/pkg/common"
|
|
"github.com/beclab/Olares/cli/pkg/core/prepare"
|
|
"github.com/beclab/Olares/cli/pkg/core/task"
|
|
"github.com/beclab/Olares/cli/pkg/manifest"
|
|
)
|
|
|
|
type InstallDriversModule struct {
|
|
common.KubeModule
|
|
manifest.ManifestModule
|
|
Skip bool // enableGPU && ubuntuVersionSupport
|
|
|
|
// log a failure message and then exit
|
|
// instead of silently skip the jobs when:
|
|
// 1. no card is found (which skips the driver installation)
|
|
// 2. no driver is found (which skips the container toolkit installation)
|
|
FailOnNoInstallation bool
|
|
}
|
|
|
|
func (m *InstallDriversModule) IsSkip() bool {
|
|
return m.Skip
|
|
}
|
|
|
|
func (m *InstallDriversModule) Init() {
|
|
m.Name = "InstallGPUDriver"
|
|
|
|
installCudaDriver := &task.RemoteTask{ // not for WSL
|
|
Name: "InstallNvidiaDriver",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(CudaNotInstalled),
|
|
&NvidiaGraphicsCard{ExitOnNotFound: m.FailOnNoInstallation},
|
|
},
|
|
Action: &InstallCudaDriver{
|
|
ManifestAction: manifest.ManifestAction{
|
|
Manifest: m.Manifest,
|
|
BaseDir: m.BaseDir,
|
|
},
|
|
},
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
m.Tasks = []task.Interface{
|
|
installCudaDriver,
|
|
}
|
|
}
|
|
|
|
type InstallContainerToolkitModule struct {
|
|
common.KubeModule
|
|
manifest.ManifestModule
|
|
Skip bool // enableGPU && ubuntuVersionSupport
|
|
SkipCudaCheck bool
|
|
}
|
|
|
|
func (m *InstallContainerToolkitModule) IsSkip() bool {
|
|
return m.Skip
|
|
}
|
|
|
|
func (m *InstallContainerToolkitModule) Init() {
|
|
m.Name = "InstallContainerToolkit"
|
|
prepareCollection := prepare.PrepareCollection{
|
|
new(ContainerdInstalled),
|
|
}
|
|
if !m.SkipCudaCheck {
|
|
prepareCollection = append(prepareCollection, new(CudaInstalled))
|
|
}
|
|
|
|
updateCudaSource := &task.RemoteTask{
|
|
Name: "UpdateNvidiaToolkitSource",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Action: &UpdateNvidiaContainerToolkitSource{
|
|
ManifestAction: manifest.ManifestAction{
|
|
Manifest: m.Manifest,
|
|
BaseDir: m.BaseDir,
|
|
},
|
|
},
|
|
Prepare: &prepareCollection,
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
installNvidiaContainerToolkit := &task.RemoteTask{
|
|
Name: "InstallNvidiaToolkit",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepareCollection,
|
|
Action: new(InstallNvidiaContainerToolkit),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
configureContainerdRuntime := &task.RemoteTask{
|
|
Name: "ConfigureContainerdRuntime",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepareCollection,
|
|
Action: new(ConfigureContainerdRuntime),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
m.Tasks = []task.Interface{
|
|
updateCudaSource,
|
|
installNvidiaContainerToolkit,
|
|
configureContainerdRuntime,
|
|
}
|
|
|
|
}
|
|
|
|
type RestartK3sServiceModule struct {
|
|
common.KubeModule
|
|
Skip bool // enableGPU && ubuntuVersionSupport
|
|
}
|
|
|
|
func (m *RestartK3sServiceModule) IsSkip() bool {
|
|
return m.Skip
|
|
}
|
|
|
|
func (m *RestartK3sServiceModule) Init() {
|
|
m.Name = "RestartK3sService"
|
|
|
|
patchK3sDriver := &task.RemoteTask{
|
|
Name: "PatchK3sDriver",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
},
|
|
Action: new(PatchK3sDriver),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
m.Tasks = []task.Interface{
|
|
patchK3sDriver,
|
|
}
|
|
}
|
|
|
|
type RestartContainerdModule struct {
|
|
common.KubeModule
|
|
Skip bool // enableGPU && ubuntuVersionSupport
|
|
}
|
|
|
|
func (m *RestartContainerdModule) IsSkip() bool {
|
|
return m.Skip
|
|
}
|
|
|
|
func (m *RestartContainerdModule) Init() {
|
|
m.Name = "RestartContainerd"
|
|
|
|
restartContainerd := &task.RemoteTask{
|
|
Name: "RestartContainerd",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(ContainerdInstalled),
|
|
},
|
|
Action: new(container.RestartContainerd),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
m.Tasks = []task.Interface{
|
|
restartContainerd,
|
|
}
|
|
}
|
|
|
|
type InstallPluginModule struct {
|
|
common.KubeModule
|
|
Skip bool // enableGPU && ubuntuVersionSupport
|
|
}
|
|
|
|
func (m *InstallPluginModule) IsSkip() bool {
|
|
return m.Skip
|
|
}
|
|
|
|
func (m *InstallPluginModule) Init() {
|
|
m.Name = "InstallPlugin"
|
|
|
|
// update node with gpu labels, to make plugins enabled
|
|
updateNode := &task.RemoteTask{
|
|
Name: "UpdateNode",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
},
|
|
Action: new(UpdateNodeGPUInfo),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
installPlugin := &task.RemoteTask{
|
|
Name: "InstallPlugin",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
},
|
|
Action: new(InstallPlugin),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
checkGpuState := &task.RemoteTask{
|
|
Name: "CheckGPUState",
|
|
Hosts: m.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
new(CudaInstalled),
|
|
},
|
|
Action: new(CheckGpuStatus),
|
|
Parallel: false,
|
|
Retry: 50,
|
|
Delay: 10 * time.Second,
|
|
}
|
|
|
|
m.Tasks = []task.Interface{
|
|
updateNode,
|
|
installPlugin,
|
|
checkGpuState,
|
|
}
|
|
}
|
|
|
|
type NodeLabelingModule struct {
|
|
common.KubeModule
|
|
}
|
|
|
|
func (l *NodeLabelingModule) Init() {
|
|
l.Name = "NodeLabeling"
|
|
|
|
updateNode := &task.LocalTask{
|
|
Name: "UpdateNode",
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(CudaInstalled),
|
|
new(CurrentNodeInK8s),
|
|
},
|
|
Action: new(UpdateNodeGPUInfo),
|
|
Retry: 1,
|
|
}
|
|
|
|
restartPlugin := &task.LocalTask{
|
|
Name: "RestartPlugin",
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
new(CudaInstalled),
|
|
new(CurrentNodeInK8s),
|
|
},
|
|
Action: new(RestartPlugin),
|
|
Retry: 1,
|
|
}
|
|
|
|
l.Tasks = []task.Interface{
|
|
updateNode,
|
|
restartPlugin,
|
|
}
|
|
}
|
|
|
|
type NodeUnlabelingModule struct {
|
|
common.KubeModule
|
|
}
|
|
|
|
func (l *NodeUnlabelingModule) Init() {
|
|
l.Name = "NodeUnlabeling"
|
|
|
|
removeNode := &task.RemoteTask{
|
|
Name: "RemoveNodeLabels",
|
|
Hosts: l.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
new(CurrentNodeInK8s),
|
|
},
|
|
Action: new(RemoveNodeLabels),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
restartPlugin := &task.RemoteTask{
|
|
Name: "RestartPlugin",
|
|
Hosts: l.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
new(CurrentNodeInK8s),
|
|
new(GpuDevicePluginInstalled),
|
|
},
|
|
Action: new(RestartPlugin),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
l.Tasks = []task.Interface{
|
|
removeNode,
|
|
restartPlugin,
|
|
}
|
|
}
|
|
|
|
type UninstallCudaModule struct {
|
|
common.KubeModule
|
|
}
|
|
|
|
func (l *UninstallCudaModule) Init() {
|
|
l.Name = "UninstallCuda"
|
|
|
|
uninstallCuda := &task.RemoteTask{
|
|
Name: "UninstallCuda",
|
|
Hosts: l.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
},
|
|
Action: new(UninstallNvidiaDrivers),
|
|
Parallel: false,
|
|
Retry: 1,
|
|
}
|
|
|
|
removeRuntime := &task.RemoteTask{
|
|
Name: "RemoveRuntime",
|
|
Hosts: l.Runtime.GetHostsByRole(common.Master),
|
|
Prepare: &prepare.PrepareCollection{
|
|
new(common.OnlyFirstMaster),
|
|
new(ContainerdInstalled),
|
|
},
|
|
Action: new(RemoveContainerRuntimeConfig),
|
|
}
|
|
|
|
l.Tasks = []task.Interface{
|
|
uninstallCuda,
|
|
removeRuntime,
|
|
}
|
|
|
|
}
|
|
|
|
type DisableNouveauModule struct {
|
|
common.KubeModule
|
|
}
|
|
|
|
func (m *DisableNouveauModule) Init() {
|
|
m.Name = "DisableNouveau"
|
|
|
|
writeBlacklist := &task.LocalTask{
|
|
Name: "WriteNouveauBlacklist",
|
|
Action: new(WriteNouveauBlacklist),
|
|
Retry: 1,
|
|
}
|
|
|
|
m.Tasks = []task.Interface{
|
|
writeBlacklist,
|
|
}
|
|
}
|