// Olares/cli/pkg/gpu/module.go
//
// 349 lines
// 7.4 KiB
// Go

package gpu
import (
"time"
"github.com/beclab/Olares/cli/pkg/container"
"github.com/beclab/Olares/cli/pkg/common"
"github.com/beclab/Olares/cli/pkg/core/prepare"
"github.com/beclab/Olares/cli/pkg/core/task"
"github.com/beclab/Olares/cli/pkg/manifest"
)
// InstallDriversModule is the module whose Init method registers the
// "InstallNvidiaDriver" task under the module name "InstallGPUDriver".
// It embeds common.KubeModule and manifest.ManifestModule.
type InstallDriversModule struct {
	common.KubeModule
	manifest.ManifestModule
	// Skip is returned unchanged by IsSkip.
	// NOTE(review): the original note below names the inputs this flag is
	// derived from; the exact polarity is set by the caller — confirm there.
	Skip bool // enableGPU && ubuntuVersionSupport
	// FailOnNoInstallation, when true, asks for a failure message to be
	// logged followed by an exit, instead of silently skipping the jobs when:
	// 1. no card is found (which skips the driver installation)
	// 2. no driver is found (which skips the container toolkit installation)
	// In this file it is forwarded to NvidiaGraphicsCard.ExitOnNotFound.
	FailOnNoInstallation bool
}
// IsSkip returns the value of the Skip field.
// Presumably consulted by the pipeline runner to decide whether the module
// runs — confirm against the caller.
func (m *InstallDriversModule) IsSkip() bool {
	return m.Skip
}
// Init names the module "InstallGPUDriver" and registers its only task,
// "InstallNvidiaDriver" (author's note: not for WSL).
//
// The task targets the hosts returned by Runtime.GetHostsByRole(common.Master)
// and is gated by two prepare checks, CudaNotInstalled and NvidiaGraphicsCard;
// FailOnNoInstallation is forwarded to the latter as ExitOnNotFound. The
// action, InstallCudaDriver, receives the module's Manifest and BaseDir.
func (m *InstallDriversModule) Init() {
	m.Name = "InstallGPUDriver"

	masters := m.Runtime.GetHostsByRole(common.Master)

	// Checks evaluated before the driver installation is attempted.
	checks := &prepare.PrepareCollection{
		new(CudaNotInstalled),
		&NvidiaGraphicsCard{ExitOnNotFound: m.FailOnNoInstallation},
	}

	driverAction := &InstallCudaDriver{
		ManifestAction: manifest.ManifestAction{
			Manifest: m.Manifest,
			BaseDir:  m.BaseDir,
		},
	}

	m.Tasks = []task.Interface{
		&task.RemoteTask{ // not for WSL
			Name:     "InstallNvidiaDriver",
			Hosts:    masters,
			Prepare:  checks,
			Action:   driverAction,
			Parallel: false,
			Retry:    1,
		},
	}
}
// InstallContainerToolkitModule is the module whose Init method registers the
// NVIDIA container toolkit tasks (source update, toolkit installation and
// containerd runtime configuration) under the name "InstallContainerToolkit".
// It embeds common.KubeModule and manifest.ManifestModule.
type InstallContainerToolkitModule struct {
	common.KubeModule
	manifest.ManifestModule
	// Skip is returned unchanged by IsSkip.
	// NOTE(review): the original note below names the inputs this flag is
	// derived from; the exact polarity is set by the caller — confirm there.
	Skip bool // enableGPU && ubuntuVersionSupport
	// SkipCudaCheck, when true, makes Init leave the CudaInstalled check out
	// of the prepare collection shared by the module's tasks.
	SkipCudaCheck bool
}
// IsSkip returns the value of the Skip field.
// Presumably consulted by the pipeline runner to decide whether the module
// runs — confirm against the caller.
func (m *InstallContainerToolkitModule) IsSkip() bool {
	return m.Skip
}
// Init names the module "InstallContainerToolkit" and registers three remote
// tasks, in order: "UpdateNvidiaToolkitSource", "InstallNvidiaToolkit" and
// "ConfigureContainerdRuntime".
//
// Every task targets the hosts returned by
// Runtime.GetHostsByRole(common.Master) and is gated by the same prepare
// collection: ContainerdInstalled, plus CudaInstalled unless SkipCudaCheck is
// set.
func (m *InstallContainerToolkitModule) Init() {
	m.Name = "InstallContainerToolkit"

	checks := prepare.PrepareCollection{new(ContainerdInstalled)}
	if !m.SkipCudaCheck {
		checks = append(checks, new(CudaInstalled))
	}
	// All three tasks share this one *PrepareCollection.
	gate := &checks

	m.Tasks = []task.Interface{
		// Refresh the package source for the NVIDIA container toolkit.
		&task.RemoteTask{
			Name:  "UpdateNvidiaToolkitSource",
			Hosts: m.Runtime.GetHostsByRole(common.Master),
			Action: &UpdateNvidiaContainerToolkitSource{
				ManifestAction: manifest.ManifestAction{
					Manifest: m.Manifest,
					BaseDir:  m.BaseDir,
				},
			},
			Prepare:  gate,
			Parallel: false,
			Retry:    1,
		},
		// Install the toolkit itself.
		&task.RemoteTask{
			Name:     "InstallNvidiaToolkit",
			Hosts:    m.Runtime.GetHostsByRole(common.Master),
			Prepare:  gate,
			Action:   new(InstallNvidiaContainerToolkit),
			Parallel: false,
			Retry:    1,
		},
		// Configure the containerd runtime.
		&task.RemoteTask{
			Name:     "ConfigureContainerdRuntime",
			Hosts:    m.Runtime.GetHostsByRole(common.Master),
			Prepare:  gate,
			Action:   new(ConfigureContainerdRuntime),
			Parallel: false,
			Retry:    1,
		},
	}
}
// RestartK3sServiceModule is the module named "RestartK3sService" by its Init
// method, which registers the "PatchK3sDriver" task. It embeds
// common.KubeModule.
type RestartK3sServiceModule struct {
	common.KubeModule
	// Skip is returned unchanged by IsSkip.
	// NOTE(review): the original note below names the inputs this flag is
	// derived from; the exact polarity is set by the caller — confirm there.
	Skip bool // enableGPU && ubuntuVersionSupport
}
// IsSkip returns the value of the Skip field.
// Presumably consulted by the pipeline runner to decide whether the module
// runs — confirm against the caller.
func (m *RestartK3sServiceModule) IsSkip() bool {
	return m.Skip
}
// Init names the module "RestartK3sService" and registers its only task,
// "PatchK3sDriver".
//
// The task targets the hosts returned by Runtime.GetHostsByRole(common.Master),
// is gated by common.OnlyFirstMaster, and runs the PatchK3sDriver action.
func (m *RestartK3sServiceModule) Init() {
	m.Name = "RestartK3sService"

	masters := m.Runtime.GetHostsByRole(common.Master)
	firstMasterOnly := &prepare.PrepareCollection{new(common.OnlyFirstMaster)}

	m.Tasks = []task.Interface{
		&task.RemoteTask{
			Name:     "PatchK3sDriver",
			Hosts:    masters,
			Prepare:  firstMasterOnly,
			Action:   new(PatchK3sDriver),
			Parallel: false,
			Retry:    1,
		},
	}
}
// RestartContainerdModule is the module named "RestartContainerd" by its Init
// method, which registers a task running container.RestartContainerd. It
// embeds common.KubeModule.
type RestartContainerdModule struct {
	common.KubeModule
	// Skip is returned unchanged by IsSkip.
	// NOTE(review): the original note below names the inputs this flag is
	// derived from; the exact polarity is set by the caller — confirm there.
	Skip bool // enableGPU && ubuntuVersionSupport
}
// IsSkip returns the value of the Skip field.
// Presumably consulted by the pipeline runner to decide whether the module
// runs — confirm against the caller.
func (m *RestartContainerdModule) IsSkip() bool {
	return m.Skip
}
// Init names the module "RestartContainerd" and registers its only task,
// also called "RestartContainerd".
//
// The task targets the hosts returned by Runtime.GetHostsByRole(common.Master),
// is gated by the ContainerdInstalled check, and runs
// container.RestartContainerd.
func (m *RestartContainerdModule) Init() {
	m.Name = "RestartContainerd"

	masters := m.Runtime.GetHostsByRole(common.Master)
	containerdPresent := &prepare.PrepareCollection{new(ContainerdInstalled)}

	m.Tasks = []task.Interface{
		&task.RemoteTask{
			Name:     "RestartContainerd",
			Hosts:    masters,
			Prepare:  containerdPresent,
			Action:   new(container.RestartContainerd),
			Parallel: false,
			Retry:    1,
		},
	}
}
// InstallPluginModule is the module named "InstallPlugin" by its Init method,
// which registers the "UpdateNode", "InstallPlugin" and "CheckGPUState" tasks.
// It embeds common.KubeModule.
type InstallPluginModule struct {
	common.KubeModule
	// Skip is returned unchanged by IsSkip.
	// NOTE(review): the original note below names the inputs this flag is
	// derived from; the exact polarity is set by the caller — confirm there.
	Skip bool // enableGPU && ubuntuVersionSupport
}
// IsSkip returns the value of the Skip field.
// Presumably consulted by the pipeline runner to decide whether the module
// runs — confirm against the caller.
func (m *InstallPluginModule) IsSkip() bool {
	return m.Skip
}
// Init names the module "InstallPlugin" and registers three remote tasks, in
// order: "UpdateNode", "InstallPlugin" and "CheckGPUState".
//
// Every task targets the hosts returned by
// Runtime.GetHostsByRole(common.Master) and is gated by common.OnlyFirstMaster;
// "CheckGPUState" additionally requires CudaInstalled and is configured with
// Retry 50 and a 10-second Delay.
func (m *InstallPluginModule) Init() {
	m.Name = "InstallPlugin"

	m.Tasks = []task.Interface{
		// Update the node with GPU labels so that the plugins get enabled.
		&task.RemoteTask{
			Name:  "UpdateNode",
			Hosts: m.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{
				new(common.OnlyFirstMaster),
			},
			Action:   new(UpdateNodeGPUInfo),
			Parallel: false,
			Retry:    1,
		},
		// Install the plugin.
		&task.RemoteTask{
			Name:  "InstallPlugin",
			Hosts: m.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{
				new(common.OnlyFirstMaster),
			},
			Action:   new(InstallPlugin),
			Parallel: false,
			Retry:    1,
		},
		// Check the GPU status, with a larger retry budget than the other tasks.
		&task.RemoteTask{
			Name:  "CheckGPUState",
			Hosts: m.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{
				new(common.OnlyFirstMaster),
				new(CudaInstalled),
			},
			Action:   new(CheckGpuStatus),
			Parallel: false,
			Retry:    50,
			Delay:    10 * time.Second,
		},
	}
}
// NodeLabelingModule is the module named "NodeLabeling" by its Init method,
// which registers two local tasks: "UpdateNode" and "RestartPlugin". It embeds
// common.KubeModule.
type NodeLabelingModule struct {
	common.KubeModule
}
// Init names the module "NodeLabeling" and registers two local tasks, in
// order: "UpdateNode" (action UpdateNodeGPUInfo) and "RestartPlugin" (action
// RestartPlugin).
//
// Both tasks require CudaInstalled and CurrentNodeInK8s; "RestartPlugin" is
// additionally gated by common.OnlyFirstMaster.
func (l *NodeLabelingModule) Init() {
	l.Name = "NodeLabeling"

	labelChecks := &prepare.PrepareCollection{
		new(CudaInstalled),
		new(CurrentNodeInK8s),
	}
	restartChecks := &prepare.PrepareCollection{
		new(common.OnlyFirstMaster),
		new(CudaInstalled),
		new(CurrentNodeInK8s),
	}

	l.Tasks = []task.Interface{
		&task.LocalTask{
			Name:    "UpdateNode",
			Prepare: labelChecks,
			Action:  new(UpdateNodeGPUInfo),
			Retry:   1,
		},
		&task.LocalTask{
			Name:    "RestartPlugin",
			Prepare: restartChecks,
			Action:  new(RestartPlugin),
			Retry:   1,
		},
	}
}
// NodeUnlabelingModule is the module named "NodeUnlabeling" by its Init
// method, which registers two remote tasks: "RemoveNodeLabels" and
// "RestartPlugin". It embeds common.KubeModule.
type NodeUnlabelingModule struct {
	common.KubeModule
}
// Init names the module "NodeUnlabeling" and registers two remote tasks, in
// order: "RemoveNodeLabels" (action RemoveNodeLabels) and "RestartPlugin"
// (action RestartPlugin).
//
// Both tasks target the hosts returned by
// Runtime.GetHostsByRole(common.Master) and require common.OnlyFirstMaster and
// CurrentNodeInK8s; "RestartPlugin" additionally requires
// GpuDevicePluginInstalled.
func (l *NodeUnlabelingModule) Init() {
	l.Name = "NodeUnlabeling"

	l.Tasks = []task.Interface{
		// Remove the labels from the node.
		&task.RemoteTask{
			Name:  "RemoveNodeLabels",
			Hosts: l.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{
				new(common.OnlyFirstMaster),
				new(CurrentNodeInK8s),
			},
			Action:   new(RemoveNodeLabels),
			Parallel: false,
			Retry:    1,
		},
		// Restart the plugin, but only where the GPU device plugin is installed.
		&task.RemoteTask{
			Name:  "RestartPlugin",
			Hosts: l.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{
				new(common.OnlyFirstMaster),
				new(CurrentNodeInK8s),
				new(GpuDevicePluginInstalled),
			},
			Action:   new(RestartPlugin),
			Parallel: false,
			Retry:    1,
		},
	}
}
// UninstallCudaModule is the module named "UninstallCuda" by its Init method,
// which registers two remote tasks: "UninstallCuda" and "RemoveRuntime". It
// embeds common.KubeModule.
type UninstallCudaModule struct {
	common.KubeModule
}
// Init names the module "UninstallCuda" and registers two remote tasks, in
// order: "UninstallCuda" (action UninstallNvidiaDrivers) and "RemoveRuntime"
// (action RemoveContainerRuntimeConfig).
//
// Both tasks target the hosts returned by
// Runtime.GetHostsByRole(common.Master) and are gated by
// common.OnlyFirstMaster; "RemoveRuntime" additionally requires
// ContainerdInstalled.
func (l *UninstallCudaModule) Init() {
	l.Name = "UninstallCuda"

	l.Tasks = []task.Interface{
		&task.RemoteTask{
			Name:  "UninstallCuda",
			Hosts: l.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{
				new(common.OnlyFirstMaster),
			},
			Action:   new(UninstallNvidiaDrivers),
			Parallel: false,
			Retry:    1,
		},
		// Parallel and Retry are intentionally left unset on this task, so it
		// keeps whatever the task framework applies for their zero values.
		&task.RemoteTask{
			Name:  "RemoveRuntime",
			Hosts: l.Runtime.GetHostsByRole(common.Master),
			Prepare: &prepare.PrepareCollection{
				new(common.OnlyFirstMaster),
				new(ContainerdInstalled),
			},
			Action: new(RemoveContainerRuntimeConfig),
		},
	}
}
// DisableNouveauModule is the module named "DisableNouveau" by its Init
// method, which registers the local "WriteNouveauBlacklist" task. It embeds
// common.KubeModule.
type DisableNouveauModule struct {
	common.KubeModule
}
// Init names the module "DisableNouveau" and registers its only task, the
// local task "WriteNouveauBlacklist", which runs the WriteNouveauBlacklist
// action with Retry set to 1 and no prepare checks.
func (m *DisableNouveauModule) Init() {
	m.Name = "DisableNouveau"

	m.Tasks = []task.Interface{
		&task.LocalTask{
			Name:   "WriteNouveauBlacklist",
			Action: new(WriteNouveauBlacklist),
			Retry:  1,
		},
	}
}