# TDengine/test/new_test_framework/utils/clusterCommonCheck.py
###################################################################
# Copyright (c) 2016 by TAOS Technologies, Inc.
# All rights reserved.
#
# This file is proprietary and confidential to TAOS Technologies.
# No part of this file may be reproduced, stored, transmitted,
# disclosed or used in any form or by any means other than as
# expressly provided by the written permission from Jianhui Tao
#
###################################################################
# -*- coding: utf-8 -*-
from collections import defaultdict
import random
import string
import threading
import requests
import time
import taos
2025-04-29 09:29:59 +00:00
2025-03-28 07:01:16 +00:00
from .log import *
from .sql import *
from .server.dnodes import *
from .common import *
# class actionType(Enum):
# CREATE_DATABASE = 0
# CREATE_STABLE = 1
# CREATE_CTABLE = 2
# INSERT_DATA = 3
2025-04-29 09:29:59 +00:00
2025-03-28 07:01:16 +00:00
# Collection of cluster health checks. Each check issues SQL through the
# shared ``tdSql`` helper and reports its outcome through ``tdLog``.
class ClusterComCheck:
def init(self, conn, logSql=False):
    """Bind the shared ``tdSql`` helper to a cursor opened on *conn*.

    The cursor is also kept on the instance as ``self.cursor`` because
    ``close()`` reads that attribute; previously nothing ever assigned it.

    Args:
        conn: connection object exposing a ``cursor()`` method.
        logSql: accepted for backward compatibility but currently unused
            (passing it on to ``tdSql.init`` would write a sql.txt file).
    """
    self.cursor = conn.cursor()
    tdSql.init(self.cursor)
2025-05-04 14:31:35 +00:00
def checkDnodes(self, dnodeNum, timeout=100):
    """Poll ``information_schema.ins_dnodes`` until *dnodeNum* rows are ready.

    One query is issued per second. A row counts as ready when its column 4
    equals ``"ready"``.

    Args:
        dnodeNum: exact number of ready dnodes required.
        timeout: maximum number of one-second polls.

    Returns:
        True as soon as the ready count equals *dnodeNum*. On timeout the
        failure is reported through ``tdLog.exit``.
    """
    elapsed = 0
    while elapsed < timeout:
        tdSql.query("select * from information_schema.ins_dnodes")
        ready = sum(1 for row in tdSql.queryResult if row[4] == "ready")

        if ready == dnodeNum:
            tdLog.success(f"{dnodeNum} dnodes ready within {elapsed}s!")
            return True

        tdLog.info(f"{dnodeNum} dnodes not ready, {ready}:{tdSql.queryRows}")
        time.sleep(1)
        elapsed += 1

    # Timed out: dump the latest dnode table before reporting the failure.
    tdSql.query("select * from information_schema.ins_dnodes")
    tdLog.debug(tdSql.queryResult)
    tdLog.exit(f"{dnodeNum} dnodes not ready within {timeout}s!")
2025-04-30 02:35:45 +00:00
2025-04-29 09:29:59 +00:00
def checkClusterAlive(self, status, timeout=100):
    """Poll ``show cluster alive`` until it returns *status*.

    Args:
        status: integer value expected in the first column of the first row.
        timeout: maximum number of one-second polls.

    Returns:
        True once the expected status is observed. On timeout the failure
        is reported through ``tdLog.exit``.
    """
    waited = 0
    while waited < timeout:
        tdSql.query("show cluster alive")
        current = tdSql.queryResult[0][0]
        if current == status:
            message = "show cluster alive return %d within %ds!" % (status, waited)
            tdLog.success(message)
            return True
        time.sleep(1)
        waited += 1

    tdLog.exit(
        "show cluster alive does not return %d within %ds!" % (status, timeout)
    )
2025-05-05 06:08:10 +00:00
def checkDbAlive(self, dbname, status, timeout=100):
    """Poll ``show <dbname>.alive`` until it returns *status*.

    Args:
        dbname: database name interpolated into the ``show`` statement.
        status: integer value expected in the first column of the first row.
        timeout: maximum number of one-second polls.

    Returns:
        True once the expected status is observed. On timeout the failure
        is reported through ``tdLog.exit``.
    """
    waited = 0
    while waited < timeout:
        tdSql.query(f"show {dbname}.alive")
        current = tdSql.queryResult[0][0]
        if current == status:
            message = "show %s.alive return %d within %ds!" % (dbname, status, waited)
            tdLog.success(message)
            return True
        time.sleep(1)
        waited += 1

    tdLog.exit(
        "show %s.alive does not return %d within %ds!" % (dbname, status, timeout)
    )
2025-04-30 02:35:45 +00:00
def checkDnodeSupportVnodes(self, dnodeIndex, vnodes, timeout=100):
    """Poll one dnode's row until its column 3 equals *vnodes*.

    Args:
        dnodeIndex: dnode id used in the ``where id = ...`` filter.
        vnodes: value expected in column 3 of the returned row
            (presumably the supported-vnodes count, going by the log text).
        timeout: maximum number of one-second polls.

    Returns:
        True once the value matches. On timeout the failure is reported
        through ``tdLog.exit``.
    """
    waited = 0
    while waited < timeout:
        tdSql.query(
            f"select * from information_schema.ins_dnodes where id = {dnodeIndex}"
        )
        supported = tdSql.queryResult[0][3]
        if supported == vnodes:
            message = "dnode:%d supportVnodes==%d within %ds!" % (
                dnodeIndex,
                vnodes,
                waited,
            )
            tdLog.success(message)
            return True
        time.sleep(1)
        waited += 1

    tdLog.exit(
        "dnode:%d supportVnodes!=%d does not return within %ds!"
        % (dnodeIndex, vnodes, timeout)
    )
def checkTransactions(self, timeout=100):
    """Poll ``show transactions`` until it returns no rows.

    Args:
        timeout: maximum number of one-second polls.

    Returns:
        True once the transaction list is empty. On timeout the failure is
        reported through ``tdLog.exit``.
    """
    count = 0
    while count < timeout:
        tdSql.query("show transactions")
        if tdSql.queryRows == 0:
            # Needs the f-prefix: without it the literal text "{count}" is logged.
            tdLog.success(f"show transactions return 0 rows within {count}s!")
            return True
        # Log progress only every fifth poll to keep the output readable.
        if count % 5 == 0:
            tdLog.info(
                f"show transactions return {tdSql.queryRows} rows within {count}s!"
            )
        time.sleep(1)
        count += 1

    tdLog.exit(f"show transactions not return 0 rows within {timeout}s!")
def checkDbReady(self, dbname, timeout=100):
    """Poll ``show <dbname>.vgroups`` until every vgroup reports a leader.

    A vgroup has a leader when any of columns 4, 7 or 10 of its row equals
    ``"leader"``.

    Args:
        dbname: database whose vgroups are inspected.
        timeout: maximum number of one-second polls.

    Returns:
        True once all returned vgroups have a leader, matching the other
        polling checks in this class (the success path used to return
        None). On timeout the failure is reported through ``tdLog.exit``.
    """
    count = 0
    while count < timeout:
        tdSql.query(f"show {dbname}.vgroups")
        leaderNum = 0
        for i in range(tdSql.queryRows):
            row = tdSql.queryResult[i]
            if "leader" in (row[4], row[7], row[10]):
                leaderNum += 1
                tdLog.success(
                    f"db:{dbname} vgId:{row[0]} has leader within {count}s!, "
                    f"{row[3]}:{row[4]}, {row[6]}:{row[7]}, {row[9]}:{row[10]}"
                )
            else:
                tdLog.info(f"db:{dbname} vgId:{row[0]} no leader within {count}s!")

        if leaderNum == tdSql.queryRows:
            tdLog.info(
                f"db:{dbname} vgroups:{tdSql.queryRows} has leader within {count}s!"
            )
            return True
        time.sleep(1)
        count += 1

    tdLog.exit(f"{dbname} not ready within {timeout}s!")
2025-04-29 09:29:59 +00:00
def checkDbRows(self, dbNumbers):
    """Check the number of databases listed in ``ins_databases``.

    The query excludes a database named ``collectd`` and expects
    ``dbNumbers + 2`` rows (the extra two are presumably the built-in
    system databases -- confirm against the server version in use).

    Args:
        dbNumbers: number of user databases expected; coerced with ``int``.

    Returns:
        True when the row count matches within five attempts. Otherwise
        the failure is reported through ``tdLog.exit``.
    """
    expected = int(dbNumbers) + 2
    attempts = 5
    for attempt in range(attempts):
        tdSql.query(
            "select * from information_schema.ins_databases where name!='collectd' ;"
        )
        if tdSql.checkRows(expected):
            tdLog.success(
                "we find %d databases and expect %d in clusters! "
                % (tdSql.queryRows, expected)
            )
            return True
        # Pause between attempts; an immediate retry cannot see new state.
        if attempt < attempts - 1:
            time.sleep(1)

    tdLog.debug(tdSql.queryResult)
    # Report the count that was actually checked (dbNumbers + 2).
    tdLog.exit(
        "we find %d databases but expect %d in clusters! "
        % (tdSql.queryRows, expected)
    )
2025-03-28 07:01:16 +00:00
2025-04-29 09:29:59 +00:00
def checkDb(self, dbNumbers, restartNumber, dbNameIndex, timeout=100):
    """Poll until databases ``<dbNameIndex>_0 .. <dbNameIndex>_<n-1>`` are ready.

    ``ins_databases`` is queried once per poll. A database counts as ready
    when its row has the expected name in column 0 and ``"ready"`` in
    column 15.

    Args:
        dbNumbers: number of databases to look for.
        restartNumber: kept for interface compatibility. It was only used
            to bound a row-index loop, which could raise IndexError; all
            returned rows are scanned now.
        dbNameIndex: name prefix of the databases.
        timeout: maximum number of one-second polls.

    Returns:
        True once all *dbNumbers* databases are ready. On timeout the
        failure is reported through ``tdLog.exit``.
    """
    expectedNames = {"%s_%d" % (dbNameIndex, j) for j in range(dbNumbers)}
    query_status = 0  # defined up front so the timeout path works for timeout <= 0
    count = 0
    while count < timeout:
        # One query per poll instead of one per (database, row) pair.
        tdSql.query("select * from information_schema.ins_databases;")
        query_status = 0
        for row in tdSql.queryResult:
            if row[0] in expectedNames and row[15] == "ready":
                query_status += 1
                tdLog.debug("check %s that status is ready " % row[0])

        if query_status == dbNumbers:
            tdLog.success(
                " check %d database and all databases are ready within %ds! "
                % (dbNumbers, count + 1)
            )
            return True
        # Always wait between polls so a missing database is not busy-polled.
        time.sleep(1)
        count += 1

    tdLog.debug(tdSql.queryResult)
    tdLog.debug("query status is %d" % query_status)
    tdLog.exit("database is not ready within %ds" % (timeout + 1))
2025-03-28 07:01:16 +00:00
2025-04-29 09:29:59 +00:00
def checkData(
    self,
    dbname,
    stbname,
    stableCount,
    CtableCount,
    rowsPerSTable,
):
    """Check table counts and per-super-table row counts in *dbname*.

    Args:
        dbname: database to ``use`` and inspect.
        stbname: super-table name prefix; tables ``<stbname>0`` ..
            ``<stbname><stableCount-1>`` are counted.
        stableCount: expected row count of ``show <dbname>.stables``.
        CtableCount: expected row count of ``show <dbname>.tables``.
        rowsPerSTable: expected ``count(*)`` for each super table.
    """
    tdSql.execute("use %s" % dbname)

    # (statement, expected row count) pairs, checked in order.
    listings = (
        ("show %s.stables" % dbname, stableCount),
        ("show %s.tables" % dbname, CtableCount),
    )
    for statement, expectedRows in listings:
        tdSql.query(statement)
        tdSql.checkRows(expectedRows)

    for stableIndex in range(stableCount):
        tdSql.query("select count(*) from %s%d" % (stbname, stableIndex))
        tdSql.checkData(0, 0, rowsPerSTable)
2025-05-05 06:08:10 +00:00
def checkMnodeStatus(self, mnodeNum, checkFollower=True):
    """Poll ``ins_mnodes`` until the mnode roles look healthy.

    Roles are read from column 2 of the first *mnodeNum* rows. The cluster
    is ready when one of those rows is ``"leader"`` and, if *checkFollower*
    is true, every other row is ``"follower"``. For 1, 2 or 3 mnodes this
    accepts the same states as the former per-count branches. Other counts
    used to loop forever, because the poll counter only advanced inside
    those branches.

    Args:
        mnodeNum: number of mnodes expected.
        checkFollower: when False, a leader alone is sufficient.

    Returns:
        True once the roles are as expected. After 30 one-second polls the
        failure is reported through ``tdLog.exit``.
    """
    tdLog.debug(f"check mnodes:{mnodeNum} status")
    count = 0

    while count < 30:
        time.sleep(1)
        tdSql.query("select * from information_schema.ins_mnodes;")
        if tdSql.checkRows(mnodeNum):
            tdLog.success("cluster has %d mnodes" % mnodeNum)

        roles = [row[2] for row in tdSql.queryResult[:mnodeNum]]
        tdLog.info(", ".join(str(role) for role in roles))

        # Fewer rows than expected means "not ready yet", not an IndexError.
        if len(roles) == mnodeNum and "leader" in roles:
            followersReady = (
                roles.count("leader") == 1
                and roles.count("follower") == mnodeNum - 1
            )
            if not checkFollower or followersReady:
                tdLog.success(f"{mnodeNum} mnodes ready in {count}s")
                return True
        # Advance on every poll so the loop always terminates.
        count += 1

    tdLog.debug(tdSql.queryResult)
    tdLog.exit(f"{mnodeNum} mnodes not ready in {count}s")
2025-03-28 07:01:16 +00:00
2025-05-04 14:31:35 +00:00
def check3mnodeoff(self, offlineDnodeNo, mnodeNum=3):
    """Poll ``ins_mnodes`` until one given mnode is offline.

    The row at 1-based position *offlineDnodeNo* must have ``"offline"``
    in column 2, while the remaining rows must hold exactly one
    ``"leader"`` and otherwise ``"follower"``.

    Args:
        offlineDnodeNo: 1-based position of the mnode expected offline.
            A value that matches no row can never succeed and now times
            out instead of looping forever.
        mnodeNum: number of mnode rows expected.

    Returns:
        True once the expected roles are observed. After 30 one-second
        polls the failure is reported through ``tdLog.exit``.
    """
    count = 0
    while count < 30:
        time.sleep(1)
        tdSql.query("select * from information_schema.ins_mnodes;")
        if tdSql.checkRows(mnodeNum):
            tdLog.success("cluster has %d mnodes" % mnodeNum)
        else:
            # This branch is the mismatch case; the old text said "is correct".
            tdLog.exit("mnode number is not correct")

        stopped = []
        others = []
        for position, row in enumerate(tdSql.queryResult, start=1):
            if position == offlineDnodeNo:
                stopped.append(row[2])
            else:
                others.append(row[2])

        survivorsReady = (
            others.count("leader") == 1
            and others.count("follower") == len(others) - 1
        )
        if stopped == ["offline"] and survivorsReady:
            # Report the real elapsed time; the old text always said "10s".
            tdLog.success(
                "stop mnodes on dnode %d successfully in %ds"
                % (offlineDnodeNo, count + 1)
            )
            return True
        # Advance on every poll so the loop always terminates.
        count += 1

    tdLog.debug(tdSql.queryResult)
    tdLog.exit(f"stop mnodes on dnode {offlineDnodeNo} failed in {count}s ")
2025-05-04 14:31:35 +00:00
def check3mnode2off(self, mnodeNum=3):
    """Poll ``ins_mnodes`` until only the first mnode is still leading.

    Success requires column 2 of the first three rows to read
    ``"leader"``, ``"offline"``, ``"offline"`` in that order.

    Args:
        mnodeNum: number of mnode rows expected.

    Returns:
        True once that state is observed. After 30 one-second polls the
        failure is reported through ``tdLog.exit``.
    """
    count = 0
    while count < 30:
        time.sleep(1)
        tdSql.query("select * from information_schema.ins_mnodes;")
        if tdSql.checkRows(mnodeNum):
            tdLog.success("cluster has %d mnodes" % mnodeNum)
        else:
            # This branch is the mismatch case; the old text said "is correct".
            tdLog.exit("mnode number is not correct")

        roles = [row[2] for row in tdSql.queryResult[:3]]
        if roles == ["leader", "offline", "offline"]:
            # Report the real elapsed time; the old text always said "10s".
            tdLog.success(
                "stop mnodes of follower on dnode successfully in %ds" % (count + 1)
            )
            return True
        count += 1

    tdLog.debug(tdSql.queryResult)
    tdLog.exit("stop mnodes on dnode 2 or 3 failed in %ds" % count)
2025-04-29 09:29:59 +00:00
def check_vgroups_status_with_offline(
    self, vgroup_numbers=2, db_replica=3, count_number=10, db_name="db"
):
    """Classify the election state of a database's vgroups.

    ``show <db_name>.vgroups`` is polled once per second until it returns
    exactly *vgroup_numbers* rows; that first matching result is then
    classified.

    Args:
        vgroup_numbers: expected number of vgroup rows (coerced to int).
        db_replica: replica count, stored on ``self.db_replica`` as int.
        count_number: maximum number of one-second polls.
        db_name: database whose vgroups are inspected.

    Returns:
        1 if every vgroup row contains "leader";
        2 if some but not all rows contain "leader";
        0 if no row contains "leader";
        -1 if a leaderless row holds at least two "follower"/"candidate"
        entries (election still running);
        None if the row count never matched within *count_number* polls.
    """
    vgroup_numbers = int(vgroup_numbers)
    self.db_replica = int(db_replica)
    tdLog.debug("start to check status of vgroups")

    attempts = 0
    elected = 0
    while attempts < count_number:
        time.sleep(1)
        attempts += 1
        tdSql.query(f"show {db_name}.vgroups;")
        if tdSql.getRows() != vgroup_numbers:
            continue

        for idx in range(vgroup_numbers):
            vgroup = tdSql.queryResult[idx]
            print(vgroup)
            if "leader" in vgroup:
                elected += 1
                continue
            undecided = vgroup.count("follower") + vgroup.count("candidate")
            if undecided >= 2:
                tdLog.debug("Elections not yet completed")
                return -1
            # only one 'follower' or 'offline'
            tdLog.debug(
                "Not in compliance with Raft protocol, unable to complete election"
            )

        if elected == vgroup_numbers:
            tdLog.debug("Leader election for all vgroups completed")
            return 1
        if elected == 0:
            tdLog.debug("all vnodes is follower")
            return 0
        tdLog.debug(
            f"there is {vgroup_numbers} vgroups, and leader elections for {elected} vgroups competed"
        )
        return 2

    # The vgroup row count never matched within the allowed polls.
    tdLog.debug(tdSql.queryResult)
    tdLog.notice(
        f"elections of {db_name} all vgroups with replica {self.db_replica} are failed in {attempts} s "
    )
2025-03-28 07:01:16 +00:00
2025-04-29 09:29:59 +00:00
def check_vgroups_status(
    self, vgroup_numbers=2, db_replica=3, count_number=10, db_name="db"
):
    """Wait until the vgroups of *db_name* have finished leader election.

    ``show <db_name>.vgroups`` is polled once per second. Only the first
    and last vgroup rows are inspected:

    * replica 1: column 4 of both rows must be ``"leader"``;
    * replica 3: columns 4, 7 and 10 of each row must hold exactly one
      ``"leader"`` and two ``"follower"``.

    The database's ``replica`` value in ``ins_databases`` must then equal
    the requested replica count.

    Args:
        vgroup_numbers: expected number of vgroup rows (coerced to int).
        db_replica: expected replica count; stored as int on
            ``self.db_replica``. The database value is compared against
            that int, so a string such as ``"3"`` now matches too.
        count_number: maximum number of one-second polls.
        db_name: database to inspect.

    Returns:
        True once the vgroups are ready. On timeout the caller's file and
        line are reported through ``tdLog.exit``.
    """
    # Local import: this file does not import ``inspect`` explicitly; it
    # could only have arrived through one of the star imports.
    import inspect

    vgroup_numbers = int(vgroup_numbers)
    self.db_replica = int(db_replica)
    tdLog.debug("start to check status of vgroups")
    count = 0
    last_number = vgroup_numbers - 1

    while count < count_number:
        time.sleep(1)
        count += 1
        print("check vgroup count :", count)
        tdSql.query(f"show {db_name}.vgroups;")
        if tdSql.getRows() != vgroup_numbers:
            continue

        if self.db_replica == 1:
            elected = (
                tdSql.queryResult[0][4] == "leader"
                and tdSql.queryResult[last_number][4] == "leader"
            )
            ready_message = (
                f"all vgroups with replica {self.db_replica} of {db_name} "
                f"are leaders in {count} s"
            )
        elif self.db_replica == 3:
            first_row = tdSql.queryResult[0]
            last_row = tdSql.queryResult[last_number]
            vgroup_status_first = [first_row[4], first_row[7], first_row[10]]
            vgroup_status_last = [last_row[4], last_row[7], last_row[10]]
            elected = all(
                status.count("leader") == 1 and status.count("follower") == 2
                for status in (vgroup_status_first, vgroup_status_last)
            )
            ready_message = (
                f"elections of {db_name}.vgroups with replica {self.db_replica} "
                f"are ready in {count} s"
            )
        else:
            # Other replica counts are not handled; keep polling until timeout.
            elected = False
            ready_message = ""

        if not elected:
            continue

        tdSql.query(
            f"select `replica` from information_schema.ins_databases where `name`='{db_name}';"
        )
        print("db replica :", tdSql.queryResult[0][0])
        # Compare against the int-normalised value, not the raw argument.
        if tdSql.queryResult[0][0] == self.db_replica:
            tdLog.success(ready_message)
            return True

    tdLog.debug(tdSql.queryResult)
    tdLog.notice(
        f"elections of {db_name} all vgroups with replica {self.db_replica} are failed in {count} s "
    )
    caller = inspect.getframeinfo(inspect.stack()[1][0])
    args = (caller.filename, caller.lineno)
    tdLog.exit("%s(%d) failed " % args)
def close(self):
    """Close the cursor stored on the instance, if there is one.

    ``self.cursor`` may never have been assigned, so a missing or None
    cursor is treated as a no-op instead of raising AttributeError.
    """
    cursor = getattr(self, "cursor", None)
    if cursor is not None:
        cursor.close()
2025-04-29 09:29:59 +00:00
2025-03-28 07:01:16 +00:00
# Module-level ClusterComCheck instance, created once at import time.
clusterComCheck = ClusterComCheck()