TDengine/test/new_test_framework/utils/server/dnode.py
2025-05-04 22:31:35 +08:00

734 lines
27 KiB
Python

###################################################################
# Copyright (c) 2016 by TAOS Technologies, Inc.
# All rights reserved.
#
# This file is proprietary and confidential to TAOS Technologies.
# No part of this file may be reproduced, stored, transmitted,
# disclosed or used in any form or by any means other than as
# expressly provided by the written permission from Jianhui Tao
#
###################################################################
# -*- coding: utf-8 -*-
import sys
import os
import os.path
import platform
import distro
import subprocess
from time import sleep
import base64
import json
import copy
from fabric2 import Connection
from shutil import which
# self
from ..log import *
class TDDnode:
def __init__(self, index=1, level=1, disk=1):
self.index = index
self.level = level
self.disk = disk
self.dataDir = []
self.running = 0
self.deployed = 0
self.testCluster = False
self.valgrind = 0
self.asan = False
self.remoteIP = ""
self.cfgDict = {
"fqdn": "localhost",
"monitor": "0",
"maxShellConns": "30000",
"locale": "en_US.UTF-8",
"charset": "UTF-8",
"asyncLog": "0",
"mDebugFlag": "135",
"dDebugFlag": "131",
"vDebugFlag": "131",
"tqDebugFlag": "135",
"cDebugFlag": "135",
"stDebugFlag": "135",
"smaDebugFlag": "135",
"jniDebugFlag": "131",
"qDebugFlag": "131",
"rpcDebugFlag": "135",
"tmrDebugFlag": "131",
"uDebugFlag": "131",
"sDebugFlag": "131",
"wDebugFlag": "131",
"numOfLogLines": "100000000",
"statusInterval": "1",
"enableQueryHb": "1",
"supportVnodes": "1024",
"telemetryReporting": "0",
}
self.binPath = "/usr/bin/taosd"
self.execPath = os.path.join(
os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "exec.sh"
)
def init(self, path, binPath, remoteIP=""):
self.path = path
self.binPath = binPath
self.remoteIP = remoteIP
if not self.remoteIP == "":
try:
self.config = eval(self.remoteIP)
self.remote_conn = Connection(
host=self.config["host"],
port=self.config["port"],
user=self.config["user"],
connect_kwargs={"password": self.config["password"]},
)
except Exception as r:
print(r)
def setTestCluster(self, value):
self.testCluster = value
def setValgrind(self, value):
self.valgrind = value
def setAsan(self, value):
self.asan = value
# if value:
# selfPath = os.path.dirname(os.path.realpath(__file__))
# if ("community" in selfPath):
# self.execPath = os.path.abspath(self.path + "/community/test/utils/exec.sh")
# else:
# self.execPath = os.path.abspath(self.path + "/test/utils/exec.sh")
def getDataSize(self):
totalSize = 0
if self.deployed == 1:
for dirpath, dirnames, filenames in os.walk(self.dataDir):
for f in filenames:
fp = os.path.join(dirpath, f)
if not os.path.islink(fp):
totalSize = totalSize + os.path.getsize(fp)
return totalSize
def addExtraCfg(self, option, value):
self.cfgDict.update({option: value})
def remoteExec(self, updateCfgDict, execCmd):
valgrindStr = ""
if self.valgrind == 1:
valgrindStr = "-g"
remoteCfgDict = copy.deepcopy(updateCfgDict)
if "logDir" in remoteCfgDict:
del remoteCfgDict["logDir"]
if "dataDir" in remoteCfgDict:
del remoteCfgDict["dataDir"]
if "cfgDir" in remoteCfgDict:
del remoteCfgDict["cfgDir"]
remoteCfgDictStr = base64.b64encode(json.dumps(remoteCfgDict).encode()).decode()
execCmdStr = base64.b64encode(execCmd.encode()).decode()
with self.remote_conn.cd(
(self.config["path"] + sys.path[0].replace(self.path, "")).replace(
"\\", "/"
)
):
self.remote_conn.run(
"python3 ./test.py %s -d %s -e %s"
% (valgrindStr, remoteCfgDictStr, execCmdStr)
)
def deploy(self, *updatecfgDict):
# logDir
self.logDir = os.path.join(self.path, "dnode%d" % self.index, "log")
# dataDir
simPath = os.path.join(self.path, "dnode%d" % self.index)
primary = 1
if self.level == 1 and self.disk == 1:
eDir = os.path.join(simPath, "data")
self.dataDir.append(eDir)
else:
for i in range(self.level):
for j in range(self.disk):
eDir = os.path.join(simPath, f"data{i}{j}")
self.dataDir.append(f"{eDir} {i} {primary}")
if primary == 1:
primary = 0
# taos.cfg
self.cfgDir = os.path.join(self.path, "dnode%d" % self.index, "cfg")
self.cfgPath = os.path.join(self.cfgDir, "taos.cfg")
for eDir in self.dataDir:
cmd = "rm -rf " + eDir
if os.system(cmd) != 0:
tdLog.exit(cmd)
cmd = "rm -rf " + self.logDir
if os.system(cmd) != 0:
tdLog.exit(cmd)
cmd = "rm -rf " + self.cfgDir
if os.system(cmd) != 0:
tdLog.exit(cmd)
# cmd = "mkdir -p " + self.dataDir
# if os.system(cmd) != 0:
# tdLog.exit(cmd)
for eDir in self.dataDir:
os.makedirs(eDir.split(" ")[0])
# cmd = "mkdir -p " + self.logDir
# if os.system(cmd) != 0:
# tdLog.exit(cmd)
os.makedirs(self.logDir)
# cmd = "mkdir -p " + self.cfgDir
# if os.system(cmd) != 0:
# tdLog.exit(cmd)
os.makedirs(self.cfgDir)
cmd = "touch " + self.cfgPath
if os.system(cmd) != 0:
tdLog.exit(cmd)
if self.testCluster:
self.startIP()
if self.testCluster:
self.cfg("masterIp", "192.168.0.1")
self.cfg("secondIp", "192.168.0.2")
self.cfg("publicIp", "192.168.0.%d" % (self.index))
self.cfg("internalIp", "192.168.0.%d" % (self.index))
self.cfg("privateIp", "192.168.0.%d" % (self.index))
self.cfgDict["dataDir"] = self.dataDir
self.cfgDict["logDir"] = self.logDir
# self.cfg("dataDir",self.dataDir)
# self.cfg("logDir",self.logDir)
# print(updatecfgDict)
isFirstDir = 1
if bool(updatecfgDict) and updatecfgDict[0] and updatecfgDict[0][0]:
for key, value in updatecfgDict[0][0].items():
if (
key == "clientCfg"
and self.remoteIP == ""
and not platform.system().lower() == "windows"
):
continue
if value == "dataDir":
if isFirstDir:
self.cfgDict.pop("dataDir")
self.cfg(value, key)
isFirstDir = 0
else:
self.cfg(value, key)
else:
self.addExtraCfg(key, value)
if self.remoteIP == "":
for key, value in self.cfgDict.items():
if type(value) == list:
for v in value:
self.cfg(key, v)
else:
self.cfg(key, value)
else:
self.remoteExec(
self.cfgDict, "tdDnodes.deploy(%d,updateCfgDict)" % self.index
)
self.deployed = 1
tdLog.debug(
"dnode:%d is deployed and configured by %s" % (self.index, self.cfgPath)
)
def getPath(self, tool="taosd"):
return os.path.join(os.path.dirname(self.binPath), tool)
# selfPath = os.path.dirname(os.path.realpath(__file__))
# if ("community" in selfPath):
# projPath = selfPath[:selfPath.find("community")]
# else:
# projPath = selfPath[:selfPath.find("tests")]
# paths = []
# for root, dirs, files in os.walk(projPath):
# if ((tool) in files or ("%s.exe"%tool) in files):
# rootRealPath = os.path.dirname(os.path.realpath(root))
# if ("packaging" not in rootRealPath):
# paths.append(os.path.join(root, tool))
# break
# if (len(paths) == 0):
# return ""
# return paths[0]
def starttaosd(self):
# binPath = self.getPath()
if os.path.exists(self.binPath):
tdLog.info("taosd found: %s" % self.binPath)
else:
tdLog.exit("taosd not found!")
if self.deployed == 0:
tdLog.exit("dnode:%d is not deployed" % (self.index))
if self.valgrind == 0:
if platform.system().lower() == "windows":
cmd = "mintty -h never %s -c %s" % (self.binPath, self.cfgDir)
else:
if self.asan:
asanDir = "%s/asan/dnode%d.asan" % (self.path, self.index)
cmd = "nohup %s -c %s > /dev/null 2> %s & " % (
self.binPath,
self.cfgDir,
asanDir,
)
else:
cmd = "nohup %s -c %s > /dev/null 2>&1 & " % (
self.binPath,
self.cfgDir,
)
else:
valgrindCmdline = (
'valgrind --log-file="%s/../log/valgrind.log" --tool=memcheck --leak-check=full --show-reachable=no --track-origins=yes --show-leak-kinds=all -v --workaround-gcc296-bugs=yes'
% self.cfgDir
)
if platform.system().lower() == "windows":
cmd = "mintty -h never %s %s -c %s" % (
valgrindCmdline,
self.binPath,
self.cfgDir,
)
else:
cmd = "nohup %s %s -c %s 2>&1 & " % (
valgrindCmdline,
self.binPath,
self.cfgDir,
)
print(cmd)
if not self.remoteIP == "":
self.remoteExec(
self.cfgDict,
'tdDnodes.dnodes[%d].deployed=1\ntdDnodes.dnodes[%d].logDir="%%s/dnode%%d/log"%%(tdDnodes.dnodes[%d].path,%d)\ntdDnodes.dnodes[%d].cfgDir="%%s/dnode%%d/cfg"%%(tdDnodes.dnodes[%d].path,%d)\ntdDnodes.start(%d)'
% (
self.index - 1,
self.index - 1,
self.index - 1,
self.index,
self.index - 1,
self.index - 1,
self.index,
self.index,
),
)
self.running = 1
else:
if os.system(cmd) != 0:
tdLog.exit(cmd)
self.running = 1
tdLog.debug("dnode:%d is running with %s " % (self.index, cmd))
if self.valgrind == 0:
time.sleep(0.1)
key1 = "from offline to online"
bkey1 = bytes(key1, encoding="utf8")
key2 = "TDengine initialized successfully"
bkey2 = bytes(key2, encoding="utf8")
logFile = self.logDir + "/taosdlog.0"
i = 0
# while not os.path.exists(logFile):
# sleep(0.1)
# i += 1
# if i > 10:
# break
# tailCmdStr = 'tail -f '
# if platform.system().lower() == 'windows':
# tailCmdStr = 'tail -n +0 -f '
# popen = subprocess.Popen(
# tailCmdStr + logFile,
# stdout=subprocess.PIPE,
# stderr=subprocess.PIPE,
# shell=True)
# pid = popen.pid
# # print('Popen.pid:' + str(pid))
# timeout = time.time() + 60 * 2
# while True:
# line = popen.stdout.readline().strip()
# print(line)
# if bkey1 in line:
# popen.kill()
# break
# elif bkey2 in line:
# popen.kill()
# break
# if time.time() > timeout:
# print(time.time(),timeout)
# tdLog.exit('wait too long for taosd start')
tdLog.debug("the dnode:%d has been started." % (self.index))
else:
tdLog.debug("wait 10 seconds for the dnode:%d to start." % (self.index))
time.sleep(10)
def start(self):
# binPath = self.getPath()
if os.path.exists(self.binPath):
tdLog.info("taosd found: %s" % self.binPath)
else:
tdLog.exit("taosd not found!")
if self.deployed == 0:
tdLog.exit("dnode:%d is not deployed" % (self.index))
if self.valgrind == 0:
if platform.system().lower() == "windows":
cmd = "mintty -h never %s -c %s" % (self.binPath, self.cfgDir)
else:
if self.asan:
asanDir = "%s/asan/dnode%d.asan" % (self.path, self.index)
cmd = "nohup %s -c %s > /dev/null 2> %s & " % (
self.binPath,
self.cfgDir,
asanDir,
)
else:
cmd = "nohup %s -c %s > /dev/null 2>&1 & " % (
self.binPath,
self.cfgDir,
)
else:
valgrindCmdline = (
'valgrind --log-file="%s/../log/valgrind.log" --tool=memcheck --leak-check=full --show-reachable=no --track-origins=yes --show-leak-kinds=all -v --workaround-gcc296-bugs=yes'
% self.cfgDir
)
if platform.system().lower() == "windows":
cmd = "mintty -h never %s %s -c %s" % (
valgrindCmdline,
self.binPath,
self.cfgDir,
)
else:
cmd = "nohup %s %s -c %s 2>&1 & " % (
valgrindCmdline,
self.binPath,
self.cfgDir,
)
print(cmd)
if not self.remoteIP == "":
self.remoteExec(
self.cfgDict,
'tdDnodes.dnodes[%d].deployed=1\ntdDnodes.dnodes[%d].logDir="%%s/dnode%%d/log"%%(tdDnodes.dnodes[%d].path,%d)\ntdDnodes.dnodes[%d].cfgDir="%%s/dnode%%d/cfg"%%(tdDnodes.dnodes[%d].path,%d)\ntdDnodes.start(%d)'
% (
self.index - 1,
self.index - 1,
self.index - 1,
self.index,
self.index - 1,
self.index - 1,
self.index,
self.index,
),
)
self.running = 1
else:
os.system("rm -rf %s/taosdlog.0" % self.logDir)
if os.system(cmd) != 0:
tdLog.exit(cmd)
self.running = 1
tdLog.debug("dnode:%d is running with %s " % (self.index, cmd))
if self.valgrind == 0:
time.sleep(0.1)
key = "from offline to online"
bkey = bytes(key, encoding="utf8")
logFile = self.logDir + "/taosdlog.0"
i = 0
while not os.path.exists(logFile):
sleep(0.1)
i += 1
if i > 50:
break
with open(logFile) as f:
timeout = time.time() + 10 * 2
while True:
line = f.readline().encode("utf-8")
if bkey in line:
break
if time.time() > timeout:
tdLog.exit("wait too long for taosd start")
tdLog.debug("the dnode:%d has been started." % (self.index))
else:
tdLog.debug("wait 10 seconds for the dnode:%d to start." % (self.index))
time.sleep(10)
def startWithoutSleep(self):
# binPath = self.getPath()
if os.path.exists(self.binPath):
tdLog.info("taosd found: %s" % self.binPath)
else:
tdLog.exit("taosd not found!")
if self.deployed == 0:
tdLog.exit("dnode:%d is not deployed" % (self.index))
if self.valgrind == 0:
if platform.system().lower() == "windows":
cmd = "mintty -h never %s -c %s" % (self.binPath, self.cfgDir)
else:
if self.asan:
asanDir = "%s/asan/dnode%d.asan" % (self.path, self.index)
cmd = "nohup %s -c %s > /dev/null 2> %s & " % (
self.binPath,
self.cfgDir,
asanDir,
)
else:
cmd = "nohup %s -c %s > /dev/null 2>&1 & " % (
self.binPath,
self.cfgDir,
)
else:
valgrindCmdline = (
'valgrind --log-file="%s/../log/valgrind.log" --tool=memcheck --leak-check=full --show-reachable=no --track-origins=yes --show-leak-kinds=all -v --workaround-gcc296-bugs=yes'
% self.cfgDir
)
if platform.system().lower() == "windows":
cmd = "mintty -h never %s %s -c %s" % (
valgrindCmdline,
self.binPath,
self.cfgDir,
)
else:
cmd = "nohup %s %s -c %s 2>&1 & " % (
valgrindCmdline,
self.binPath,
self.cfgDir,
)
print(cmd)
if self.remoteIP == "":
if os.system(cmd) != 0:
tdLog.exit(cmd)
else:
self.remoteExec(
self.cfgDict,
'tdDnodes.dnodes[%d].deployed=1\ntdDnodes.dnodes[%d].logDir="%%s/dnode%%d/log"%%(tdDnodes.dnodes[%d].path,%d)\ntdDnodes.dnodes[%d].cfgDir="%%s/dnode%%d/cfg"%%(tdDnodes.dnodes[%d].path,%d)\ntdDnodes.startWithoutSleep(%d)'
% (
self.index - 1,
self.index - 1,
self.index - 1,
self.index,
self.index - 1,
self.index - 1,
self.index,
self.index,
),
)
self.running = 1
tdLog.debug("dnode:%d is running with %s " % (self.index, cmd))
def stop(self):
if self.asan:
stopCmd = "%s -s stop -n dnode%d" % (self.execPath, self.index)
tdLog.info("execute script: " + stopCmd)
os.system(stopCmd)
return
if not self.remoteIP == "":
self.remoteExec(
self.cfgDict,
"tdDnodes.dnodes[%d].running=1\ntdDnodes.dnodes[%d].stop()"
% (self.index - 1, self.index - 1),
)
tdLog.info("stop dnode%d" % self.index)
return
if self.valgrind == 0:
toBeKilled = "taosd"
else:
toBeKilled = "valgrind.bin"
if self.running != 0:
psCmd = (
"ps -ef|grep -w %s| grep -v grep | awk '{print $2}' | xargs"
% toBeKilled
)
processID = (
subprocess.check_output(psCmd, shell=True).decode("utf-8").strip()
)
onlyKillOnceWindows = 0
while processID:
if not platform.system().lower() == "windows" or (
onlyKillOnceWindows == 0 and platform.system().lower() == "windows"
):
killCmd = "kill -INT %s > /dev/null 2>&1" % processID
if platform.system().lower() == "windows":
killCmd = "kill -INT %s > nul 2>&1" % processID
os.system(killCmd)
onlyKillOnceWindows = 1
time.sleep(1)
processID = (
subprocess.check_output(psCmd, shell=True).decode("utf-8").strip()
)
if not platform.system().lower() == "windows":
for port in range(6030, 6041):
fuserCmd = "fuser -k -n tcp %d > /dev/null" % port
os.system(fuserCmd)
if self.valgrind:
time.sleep(2)
self.running = 0
tdLog.info("dnode:%d is stopped by kill -INT" % (self.index))
def stoptaosd(self):
tdLog.info("start to stop taosd on dnode: %d " % (self.index))
# print(self.asan,self.running,self.remoteIP,self.valgrind)
if self.asan:
stopCmd = "%s -s stop -n dnode%d" % (self.execPath, self.index)
tdLog.info("execute script: " + stopCmd)
os.system(stopCmd)
return
if not self.remoteIP == "":
self.remoteExec(
self.cfgDict,
"tdDnodes.dnodes[%d].running=1\ntdDnodes.dnodes[%d].stop()"
% (self.index - 1, self.index - 1),
)
tdLog.info("stop dnode%d" % self.index)
return
if self.valgrind == 0:
toBeKilled = "taosd"
else:
toBeKilled = "valgrind.bin"
if self.running != 0:
if platform.system().lower() == "windows":
psCmd = (
"for /f %%a in ('wmic process where \"name='taosd.exe' and CommandLine like '%%dnode%d%%'\" get processId ^| xargs echo ^| awk ^'{print $2}^' ^&^& echo aa') do @(ps | grep %%a | awk '{print $1}' | xargs)"
% (self.index)
)
else:
psCmd = (
"ps -ef | grep -w %s | grep dnode%d | grep -v grep | awk '{print $2}' | xargs"
% (toBeKilled, self.index)
)
processID = subprocess.check_output(psCmd, shell=True).decode("utf-8").strip()
tdLog.info(f"psCmd:{psCmd}, processId:[{processID}]")
onlyKillOnceWindows = 0
while processID:
if not platform.system().lower() == "windows" or (
onlyKillOnceWindows == 0 and platform.system().lower() == "windows"
):
killCmd = "kill -INT %s > /dev/null 2>&1" % processID
if platform.system().lower() == "windows":
killCmd = "kill -INT %s > nul 2>&1" % processID
os.system(killCmd)
onlyKillOnceWindows = 1
# tdLog.info(f"kill cmd:{killCmd}")
time.sleep(1)
processID = (
subprocess.check_output(psCmd, shell=True).decode("utf-8").strip()
)
tdLog.info(f"killed processID:{processID}")
if self.valgrind:
time.sleep(2)
self.running = 0
tdLog.info("dnode:%d is stopped by kill -INT" % (self.index))
def forcestop(self):
tdLog.info("start to force stop taosd on dnode: %d " % (self.index))
if self.asan:
stopCmd = "%s -s stop -n dnode%d -x SIGKILL" % (self.execPath, self.index)
tdLog.info("execute script: " + stopCmd)
os.system(stopCmd)
return
if not self.remoteIP == "":
self.remoteExec(
self.cfgDict,
"tdDnodes.dnodes[%d].running=1\ntdDnodes.dnodes[%d].forcestop()"
% (self.index - 1, self.index - 1),
)
tdLog.info("force stop dnode%d" % self.index)
return
if self.valgrind == 0:
toBeKilled = "taosd"
else:
toBeKilled = "valgrind.bin"
if self.running != 0:
if platform.system().lower() == "windows":
psCmd = (
"for /f %%a in ('wmic process where \"name='taosd.exe' and CommandLine like '%%dnode%d%%'\" get processId ^| xargs echo ^| awk ^'{print $2}^' ^&^& echo aa') do @(ps | grep %%a | awk '{print $1}' | xargs)"
% (self.index)
)
else:
psCmd = (
"ps -ef | grep -w %s | grep dnode%d | grep -v grep | awk '{print $2}' | xargs"
% (toBeKilled, self.index)
)
processID = (
subprocess.check_output(psCmd, shell=True).decode("utf-8").strip()
)
tdLog.info(f"psCmd:{psCmd}, processId:[{processID}]")
onlyKillOnceWindows = 0
while processID:
if not platform.system().lower() == "windows" or (
onlyKillOnceWindows == 0 and platform.system().lower() == "windows"
):
killCmd = "kill -KILL %s > /dev/null 2>&1" % processID
if platform.system().lower() == "windows":
killCmd = "kill -KILL %s > nul 2>&1" % processID
os.system(killCmd)
onlyKillOnceWindows = 1
# tdLog.info(f"kill cmd:{killCmd}")
time.sleep(1)
processID = (
subprocess.check_output(psCmd, shell=True).decode("utf-8").strip()
)
tdLog.info(f"killed processID:{processID}")
# for port in range(6030, 6041):
# fuserCmd = "fuser -k -n tcp %d" % port
# os.system(fuserCmd)
if self.valgrind:
time.sleep(2)
self.running = 0
tdLog.info("dnode:%d is stopped by kill -KILL" % (self.index))
def startIP(self):
cmd = "sudo ifconfig lo:%d 192.168.0.%d up" % (self.index, self.index)
if os.system(cmd) != 0:
tdLog.exit(cmd)
def stopIP(self):
cmd = "sudo ifconfig lo:%d 192.168.0.%d down" % (self.index, self.index)
if os.system(cmd) != 0:
tdLog.exit(cmd)
def cfg(self, option, value):
cmd = "echo %s %s >> %s" % (option, value, self.cfgPath)
if os.system(cmd) != 0:
tdLog.exit(cmd)
def getDnodeRootDir(self, index):
dnodeRootDir = os.path.join(self.path, "psim", "dnode%d" % index)
return dnodeRootDir
def getDnodesRootDir(self):
dnodesRootDir = os.path.join(self.path, "psim")
return dnodesRootDir