TDengine/source/libs/sync/src/syncReplication.c

332 lines
11 KiB
C
Raw Normal View History

2022-02-22 03:28:15 +00:00
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
2022-02-26 18:24:50 +00:00
#include "syncReplication.h"
2022-03-14 06:05:40 +00:00
#include "syncIndexMgr.h"
2022-03-07 08:06:07 +00:00
#include "syncMessage.h"
2022-05-30 05:14:48 +00:00
#include "syncRaftCfg.h"
2022-03-14 06:05:40 +00:00
#include "syncRaftEntry.h"
2022-03-14 12:43:35 +00:00
#include "syncRaftLog.h"
2022-03-16 08:54:55 +00:00
#include "syncRaftStore.h"
#include "syncSnapshot.h"
2022-03-14 12:43:35 +00:00
#include "syncUtil.h"
2022-03-04 07:48:09 +00:00
2022-03-07 08:06:07 +00:00
// TLA+ Spec
// AppendEntries(i, j) ==
// /\ i /= j
// /\ state[i] = Leader
// /\ LET prevLogIndex == nextIndex[i][j] - 1
// prevLogTerm == IF prevLogIndex > 0 THEN
// log[i][prevLogIndex].term
// ELSE
// 0
// \* Send up to 1 entry, constrained by the end of the log.
// lastEntry == Min({Len(log[i]), nextIndex[i][j]})
// entries == SubSeq(log[i], nextIndex[i][j], lastEntry)
// IN Send([mtype |-> AppendEntriesRequest,
// mterm |-> currentTerm[i],
// mprevLogIndex |-> prevLogIndex,
// mprevLogTerm |-> prevLogTerm,
// mentries |-> entries,
// \* mlog is used as a history variable for the proof.
// \* It would not exist in a real implementation.
// mlog |-> log[i],
// mcommitIndex |-> Min({commitIndex[i], lastEntry}),
// msource |-> i,
// mdest |-> j])
// /\ UNCHANGED <<serverVars, candidateVars, leaderVars, logVars>>
2022-03-07 08:29:21 +00:00
//
2022-03-08 06:19:50 +00:00
/**
 * Leader-side replication step: send one AppendEntries request to each peer.
 *
 * For every peer the request carries the index/term of the entry preceding
 * the peer's next-index and, when the log store returns an entry at
 * next-index, the serialized form of that single entry. When no entry is
 * returned an empty (dataLen 0) request is sent instead.
 *
 * @param pSyncNode the local sync node; asserted to be in leader state.
 * @return always 0.
 */
int32_t syncNodeAppendEntriesPeers(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_LEADER);

  syncIndexMgrLog2("==syncNodeAppendEntriesPeers== pNextIndex", pSyncNode->pNextIndex);
  syncIndexMgrLog2("==syncNodeAppendEntriesPeers== pMatchIndex", pSyncNode->pMatchIndex);
  logStoreSimpleLog2("==syncNodeAppendEntriesPeers==", pSyncNode->pLogStore);

  for (int peerIdx = 0; peerIdx < pSyncNode->peersNum; ++peerIdx) {
    SRaftId* pPeer = &(pSyncNode->peersId[peerIdx]);

    // index of the entry we want the peer to append next, and its predecessor
    SyncIndex wantIndex = syncIndexMgrGetIndex(pSyncNode->pNextIndex, pPeer);
    SyncIndex prevIndex = wantIndex - 1;

    // term of the predecessor; stays 0 when the predecessor is before the log start
    SyncTerm prevTerm = 0;
    if (prevIndex >= SYNC_INDEX_BEGIN) {
      SSyncRaftEntry* pPrev = pSyncNode->pLogStore->getEntry(pSyncNode->pLogStore, prevIndex);
      ASSERT(pPrev != NULL);
      prevTerm = pPrev->term;
      syncEntryDestory(pPrev);
    }

    // size the request from the entry if there is one, otherwise send an empty request
    SSyncRaftEntry*    pNext = pSyncNode->pLogStore->getEntry(pSyncNode->pLogStore, wantIndex);
    SyncAppendEntries* pReq = syncAppendEntriesBuild((pNext != NULL) ? pNext->bytes : 0, pSyncNode->vgId);
    ASSERT(pReq != NULL);

    if (pNext != NULL) {
      // copy the serialized entry into the request payload
      uint32_t rawLen;
      char*    raw = syncEntrySerialize(pNext, &rawLen);
      ASSERT(rawLen == pNext->bytes);
      memcpy(pReq->data, raw, rawLen);

      taosMemoryFree(raw);
      syncEntryDestory(pNext);
    }

    // fill in the request header
    pReq->srcId = pSyncNode->myRaftId;
    pReq->destId = *pPeer;
    pReq->term = pSyncNode->pRaftStore->currentTerm;
    pReq->prevLogIndex = prevIndex;
    pReq->prevLogTerm = prevTerm;
    pReq->commitIndex = pSyncNode->commitIndex;

    syncAppendEntriesLog2("==syncNodeAppendEntriesPeers==", pReq);

    // hand the request to the transport, then release our copy
    syncNodeAppendEntries(pSyncNode, pPeer, pReq);
    syncAppendEntriesDestroy(pReq);
  }

  return 0;
}
/**
 * Leader-side batched replication: send one SyncAppendEntriesBatch request
 * to each peer.
 *
 * For every peer, up to pSyncNode->batchSize consecutive log entries are read
 * starting at the peer's next-index (stopping at the first index the log
 * store cannot return), packed into a single batch message and sent.
 *
 * If the pre-term lookup for a peer yields SYNC_TERM_INVALID, that peer's
 * next-index is reset to last-index + 1, its match-index is reset to
 * SYNC_INDEX_INVALID, an error is logged and the function returns
 * immediately without processing the remaining peers.
 *
 * @param pSyncNode the local sync node.
 * @return 0 when all peers were processed; -1 when this node is not the
 *         leader or a pre-term lookup failed.
 */
int32_t syncNodeAppendEntriesPeersSnapshot2(SSyncNode* pSyncNode) {
  if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) {
    return -1;
  }

  // entryPArr below has SYNC_MAX_BATCH_SIZE slots; never index past it
  int32_t batchLimit = pSyncNode->batchSize;
  if (batchLimit > SYNC_MAX_BATCH_SIZE) {
    batchLimit = SYNC_MAX_BATCH_SIZE;
  }

  for (int i = 0; i < pSyncNode->peersNum; ++i) {
    SRaftId* pDestId = &(pSyncNode->peersId[i]);

    // next index
    SyncIndex nextIndex = syncIndexMgrGetIndex(pSyncNode->pNextIndex, pDestId);

    // pre index, pre term
    SyncIndex preLogIndex = syncNodeGetPreIndex(pSyncNode, nextIndex);
    SyncTerm  preLogTerm = syncNodeGetPreTerm(pSyncNode, nextIndex);
    if (preLogTerm == SYNC_TERM_INVALID) {
      SSyncSnapshotSender* pSender = syncNodeGetSnapshotSender(pSyncNode, pDestId);
      ASSERT(pSender != NULL);
      ASSERT(!snapshotSenderIsStart(pSender));

      SyncIndex newNextIndex = syncNodeGetLastIndex(pSyncNode) + 1;
      syncIndexMgrSetIndex(pSyncNode->pNextIndex, pDestId, newNextIndex);
      syncIndexMgrSetIndex(pSyncNode->pMatchIndex, pDestId, SYNC_INDEX_INVALID);
      sError("vgId:%d sync get pre term error, nextIndex:%ld, update next-index:%ld, match-index:%d, raftid:%ld",
             pSyncNode->vgId, nextIndex, newNextIndex, SYNC_INDEX_INVALID, pDestId->addr);

      return -1;
    }

    // collect consecutive entries nextIndex, nextIndex + 1, ... into the batch
    SSyncRaftEntry* entryPArr[SYNC_MAX_BATCH_SIZE];
    memset(entryPArr, 0, sizeof(entryPArr));

    int32_t   getCount = 0;
    SyncIndex getEntryIndex = nextIndex;
    for (int32_t slot = 0; slot < batchLimit; ++slot) {
      SSyncRaftEntry* pEntry = NULL;
      int32_t code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, getEntryIndex, &pEntry);
      if (code != 0) {
        // first index that cannot be read ends the batch
        break;
      }

      ASSERT(pEntry != NULL);
      entryPArr[slot] = pEntry;
      getCount++;
      getEntryIndex++;  // advance, otherwise the same entry would fill every slot
    }

    SyncAppendEntriesBatch* pMsg = syncAppendEntriesBatchBuild(entryPArr, getCount, pSyncNode->vgId);
    ASSERT(pMsg != NULL);

    // the entries were only needed to build the message; slots [0, getCount) are the filled ones
    for (int32_t slot = 0; slot < getCount; ++slot) {
      syncEntryDestory(entryPArr[slot]);
      entryPArr[slot] = NULL;
    }

    // prepare msg
    pMsg->srcId = pSyncNode->myRaftId;
    pMsg->destId = *pDestId;
    pMsg->term = pSyncNode->pRaftStore->currentTerm;
    pMsg->prevLogIndex = preLogIndex;
    pMsg->prevLogTerm = preLogTerm;
    pMsg->commitIndex = pSyncNode->commitIndex;
    pMsg->privateTerm = 0;
    pMsg->dataCount = getCount;

    // send msg
    syncNodeAppendEntriesBatch(pSyncNode, pDestId, pMsg);
    syncAppendEntriesBatchDestroy(pMsg);
  }

  return 0;
}
/**
 * Leader-side replication step used when snapshots are enabled: send one
 * AppendEntries request to each peer.
 *
 * The previous index/term for each peer come from syncNodeGetPreIndex /
 * syncNodeGetPreTerm. If the pre-term is SYNC_TERM_INVALID the peer's
 * next-index is reset to last-index + 1, its match-index is reset to
 * SYNC_INDEX_INVALID, an error is logged and the function returns -1
 * without visiting the remaining peers.
 *
 * Otherwise the entry at next-index is serialized into the request; when the
 * log store reports TSDB_CODE_WAL_LOG_NOT_EXIST an empty request is sent.
 * Any other lookup failure is treated as fatal (ASSERT).
 *
 * @param pSyncNode the local sync node; asserted to be in leader state.
 * @return 0 when every peer was processed, -1 on a pre-term lookup failure.
 */
int32_t syncNodeAppendEntriesPeersSnapshot(SSyncNode* pSyncNode) {
  ASSERT(pSyncNode->state == TAOS_SYNC_STATE_LEADER);

  syncIndexMgrLog2("begin append entries peers pNextIndex:", pSyncNode->pNextIndex);
  syncIndexMgrLog2("begin append entries peers pMatchIndex:", pSyncNode->pMatchIndex);
  logStoreSimpleLog2("begin append entries peers LogStore:", pSyncNode->pLogStore);

  if (gRaftDetailLog) {
    // extra diagnostics: what the state machine reports as its snapshot position
    SSnapshot snap;
    pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snap);
    sTrace("begin append entries peers, snapshot.lastApplyIndex:%ld, snapshot.lastApplyTerm:%lu",
           snap.lastApplyIndex, snap.lastApplyTerm);
  }

  for (int peerIdx = 0; peerIdx < pSyncNode->peersNum; ++peerIdx) {
    SRaftId* pPeer = &(pSyncNode->peersId[peerIdx]);

    // where this peer is expected to continue, and what precedes it
    SyncIndex wantIndex = syncIndexMgrGetIndex(pSyncNode->pNextIndex, pPeer);
    SyncIndex prevIndex = syncNodeGetPreIndex(pSyncNode, wantIndex);
    SyncTerm  prevTerm = syncNodeGetPreTerm(pSyncNode, wantIndex);

    if (prevTerm == SYNC_TERM_INVALID) {
      // cannot describe the predecessor: reset the peer's progress and give up this round
      SyncIndex resetIndex = syncNodeGetLastIndex(pSyncNode) + 1;
      syncIndexMgrSetIndex(pSyncNode->pNextIndex, pPeer, resetIndex);
      syncIndexMgrSetIndex(pSyncNode->pMatchIndex, pPeer, SYNC_INDEX_INVALID);
      sError("vgId:%d sync get pre term error, nextIndex:%ld, update next-index:%ld, match-index:%d, raftid:%ld",
             pSyncNode->vgId, wantIndex, resetIndex, SYNC_INDEX_INVALID, pPeer->addr);
      return -1;
    }

    // build the request, with or without a payload
    SyncAppendEntries* pReq = NULL;
    SSyncRaftEntry*    pNext;
    int32_t            rc = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, wantIndex, &pNext);

    if (rc != 0) {
      if (terrno != TSDB_CODE_WAL_LOG_NOT_EXIST) {
        // unexpected log store failure
        syncNodeLog3("", pSyncNode);
        ASSERT(0);
      } else {
        // nothing stored at wantIndex: send a request with no payload
        pReq = syncAppendEntriesBuild(0, pSyncNode->vgId);
        ASSERT(pReq != NULL);
      }
    } else {
      ASSERT(pNext != NULL);

      pReq = syncAppendEntriesBuild(pNext->bytes, pSyncNode->vgId);
      ASSERT(pReq != NULL);

      // copy the serialized entry into the request payload
      uint32_t rawLen;
      char*    raw = syncEntrySerialize(pNext, &rawLen);
      ASSERT(rawLen == pNext->bytes);
      memcpy(pReq->data, raw, rawLen);

      taosMemoryFree(raw);
      syncEntryDestory(pNext);
    }

    // fill in the request header
    ASSERT(pReq != NULL);
    pReq->srcId = pSyncNode->myRaftId;
    pReq->destId = *pPeer;
    pReq->term = pSyncNode->pRaftStore->currentTerm;
    pReq->prevLogIndex = prevIndex;
    pReq->prevLogTerm = prevTerm;
    pReq->commitIndex = pSyncNode->commitIndex;
    pReq->privateTerm = 0;

    // hand the request to the transport, then release our copy
    syncNodeAppendEntries(pSyncNode, pPeer, pReq);
    syncAppendEntriesDestroy(pReq);
  }

  return 0;
}
2022-05-30 05:14:48 +00:00
2022-03-08 06:19:50 +00:00
/**
 * Start one replication round towards all peers.
 *
 * Dispatches to the snapshot-aware sender when the raft configuration has
 * snapshotEnable set, and to the plain sender otherwise.
 *
 * @param pSyncNode the local sync node.
 * @return whatever the selected sender returns.
 */
int32_t syncNodeReplicate(SSyncNode* pSyncNode) {
  if (pSyncNode->pRaftCfg->snapshotEnable) {
    return syncNodeAppendEntriesPeersSnapshot(pSyncNode);
  }
  return syncNodeAppendEntriesPeers(pSyncNode);
}
2022-03-07 08:06:07 +00:00
/**
 * Send a single AppendEntries request to one peer.
 *
 * Emits a debug log line describing the destination and the request header,
 * converts the request to an SRpcMsg and passes it to syncNodeSendMsgById.
 *
 * @param pSyncNode  the local sync node.
 * @param destRaftId raft id of the destination peer.
 * @param pMsg       the request to send; not modified here.
 * @return always 0.
 */
int32_t syncNodeAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, const SyncAppendEntries* pMsg) {
  {
    // scratch buffers live only for the duration of the log statement
    char     peerHost[128];
    uint16_t peerPort;
    syncUtilU642Addr(destRaftId->addr, peerHost, sizeof(peerHost), &peerPort);
    sDebug(
        "vgId:%d, send sync-append-entries to %s:%d, {term:%lu, pre-index:%ld, pre-term:%lu, pterm:%lu, commit:%ld, "
        "datalen:%d}",
        pSyncNode->vgId, peerHost, peerPort, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->privateTerm,
        pMsg->commitIndex, pMsg->dataLen);
  }

  SRpcMsg rpc;
  syncAppendEntries2RpcMsg(pMsg, &rpc);
  syncNodeSendMsgById(destRaftId, pSyncNode, &rpc);

  return 0;
}
/**
 * Send a batched AppendEntries request to one peer.
 *
 * Emits a debug log line describing the destination and the request header
 * (including the number of entries in the batch), converts the request to an
 * SRpcMsg and passes it to syncNodeSendMsgById.
 *
 * @param pSyncNode  the local sync node.
 * @param destRaftId raft id of the destination peer.
 * @param pMsg       the batch request to send; not modified here.
 * @return always 0.
 */
int32_t syncNodeAppendEntriesBatch(SSyncNode* pSyncNode, const SRaftId* destRaftId,
                                   const SyncAppendEntriesBatch* pMsg) {
  {
    // scratch buffers live only for the duration of the log statement
    char     peerHost[128];
    uint16_t peerPort;
    syncUtilU642Addr(destRaftId->addr, peerHost, sizeof(peerHost), &peerPort);
    sDebug(
        "vgId:%d, send sync-append-entries-batch to %s:%d, {term:%lu, pre-index:%ld, pre-term:%lu, pterm:%lu, "
        "commit:%ld, "
        "datalen:%d, dataCount:%d}",
        pSyncNode->vgId, peerHost, peerPort, pMsg->term, pMsg->prevLogIndex, pMsg->prevLogTerm, pMsg->privateTerm,
        pMsg->commitIndex, pMsg->dataLen, pMsg->dataCount);
  }

  SRpcMsg rpc;
  syncAppendEntriesBatch2RpcMsg(pMsg, &rpc);
  syncNodeSendMsgById(destRaftId, pSyncNode, &rpc);

  return 0;
}