/* * Copyright (c) 2019 TAOS Data, Inc. * * This program is free software: you can use, redistribute, and/or modify * it under the terms of the GNU Affero General Public License, version 3 * or later ("AGPL"), as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see . */ #include "tstreamFileState.h" #include "query.h" #include "streamBackendRocksdb.h" #include "taos.h" #include "tcommon.h" #include "tcompare.h" #include "thash.h" #include "tsimplehash.h" #define FLUSH_RATIO 0.5 #define FLUSH_NUM 4 #define DEFAULT_MAX_STREAM_BUFFER_SIZE (128 * 1024 * 1024) #define MIN_NUM_OF_ROW_BUFF 10240 #define MIN_NUM_OF_RECOVER_ROW_BUFF 128 #define MIN_NUM_SEARCH_BUCKET 128 #define MAX_ARRAY_SIZE 1024 #define MAX_GROUP_ID_NUM 200000 #define NUM_OF_CACHE_WIN 64 #define MAX_NUM_OF_CACHE_WIN 128 #define MIN_NUM_OF_SORT_CACHE_WIN 40960 #define BATCH_LIMIT 256 #define DEFAULT_STATE_MAP_CAPACITY 10240 #define MAX_STATE_MAP_SIZE 10240000 #define SET_TSDATA_FLAG(ptr, len) ((*(char*)POINTER_SHIFT(ptr, (len - 1))) |= 1) #define UNSET_TSDATA_FLAG(ptr, len) ((*(char*)POINTER_SHIFT(ptr, (len - 1))) &= 0) #define HAS_TSDATA_FLAG(ptr, len) ((*(char*)POINTER_SHIFT(ptr, (len - 1))) & 1) #define TASK_KEY "streamFileState" #define STREAM_STATE_INFO_NAME "StreamStateCheckPoint" struct SStreamFileState { SList* usedBuffs; SList* freeBuffs; void* rowStateBuff; void* pFileStore; int32_t rowSize; int32_t selectivityRowSize; int32_t keyLen; uint64_t preCheckPointVersion; uint64_t checkPointVersion; TSKEY maxTs; TSKEY deleteMark; TSKEY flushMark; uint64_t maxRowCount; uint64_t curRowCount; GetTsFun getTs; char* id; char* cfName; void* searchBuff; SSHashObj* pGroupIdMap; bool hasFillCatch; SSHashObj* pRecFlagMap; _state_buff_cleanup_fn stateBuffCleanupFn; _state_buff_remove_fn stateBuffRemoveFn; _state_buff_remove_by_pos_fn stateBuffRemoveByPosFn; _state_buff_create_statekey_fn stateBuffCreateStateKeyFn; _state_file_remove_fn stateFileRemoveFn; _state_file_get_fn stateFileGetFn; _state_fun_get_fn stateFunctionGetFn; }; typedef SRowBuffPos SRowBuffInfo; int fillStateKeyCompare(const void* pWin1, const void* pDatas, int pos) { SWinKey* pWin2 = taosArrayGet(pDatas, pos); return winKeyCmprImpl((SWinKey*)pWin1, pWin2); } int fillTSKeyCompare(const void* pKey1, const void* pDatas, int pos) { SWinKey* pWin1 = (SWinKey*)pKey1; SWinKey* pWin2 = taosArrayGet(pDatas, pos); if (pWin1->ts > pWin2->ts) { return 1; } else if (pWin1->ts < pWin2->ts) { return -1; } return 0; } int32_t stateHashBuffRemoveFn(void* pBuff, const void* pKey, size_t keyLen) { SRowBuffPos** pos = tSimpleHashGet(pBuff, pKey, keyLen); if (pos) { (*pos)->beFlushed = true; (*pos)->invalid = true; } return tSimpleHashRemove(pBuff, pKey, keyLen); } void stateHashBuffRemoveByPosFn(SStreamFileState* pFileState, SRowBuffPos* pPos) { size_t keyLen = pFileState->keyLen; SRowBuffPos** ppPos = tSimpleHashGet(pFileState->rowStateBuff, pPos->pKey, keyLen); if (ppPos) { if ((*ppPos) == pPos) { int32_t tmpRes = tSimpleHashRemove(pFileState->rowStateBuff, pPos->pKey, keyLen); qTrace("%s at line %d res:%d", __func__, __LINE__, tmpRes); } } } void stateHashBuffClearFn(void* pBuff) { tSimpleHashClear(pBuff); } void stateHashBuffCleanupFn(void* pBuff) { tSimpleHashCleanup(pBuff); } int32_t intervalFileRemoveFn(SStreamFileState* pFileState, const void* pKey) { return streamStateDel_rocksdb(pFileState->pFileStore, pKey); } int32_t intervalFileGetFn(SStreamFileState* pFileState, void* pKey, void** data, int32_t* pDataLen) { return streamStateGet_rocksdb(pFileState->pFileStore, pKey, data, pDataLen); } void* intervalCreateStateKey(SRowBuffPos* pPos, int64_t num) { SStateKey* pStateKey = taosMemoryCalloc(1, sizeof(SStateKey)); if (pStateKey == NULL) { qError("%s failed at line %d since %s", __func__, __LINE__, tstrerror(terrno)); return NULL; } SWinKey* pWinKey = pPos->pKey; pStateKey->key = *pWinKey; pStateKey->opNum = num; return pStateKey; } void* defaultCreateStateKey(SRowBuffPos* pPos, int64_t num) { SWinKey* pStateKey = taosMemoryCalloc(1, sizeof(SWinKey)); if (pStateKey == NULL) { qError("%s failed at line %d since %s", __func__, __LINE__, tstrerror(terrno)); return NULL; } SWinKey* pWinKey = pPos->pKey; *pStateKey = *pWinKey; return pStateKey; } int32_t sessionFileRemoveFn(SStreamFileState* pFileState, const void* pKey) { return streamStateSessionDel_rocksdb(pFileState->pFileStore, pKey); } int32_t sessionFileGetFn(SStreamFileState* pFileState, void* pKey, void** data, int32_t* pDataLen) { return streamStateSessionGet_rocksdb(pFileState->pFileStore, pKey, data, pDataLen); } void* sessionCreateStateKey(SRowBuffPos* pPos, int64_t num) { SStateSessionKey* pStateKey = taosMemoryCalloc(1, sizeof(SStateSessionKey)); if (pStateKey == NULL) { qError("%s failed at line %d since %s", __func__, __LINE__, tstrerror(terrno)); return NULL; } SSessionKey* pWinKey = pPos->pKey; pStateKey->key = *pWinKey; pStateKey->opNum = num; return pStateKey; } static void streamFileStateDecode(TSKEY* pKey, void* pBuff, int32_t len) { pBuff = taosDecodeFixedI64(pBuff, pKey); } static int32_t streamFileStateEncode(TSKEY* pKey, void** pVal, int32_t* pLen) { *pLen = sizeof(TSKEY); (*pVal) = taosMemoryCalloc(1, *pLen); if ((*pVal) == NULL) { qError("%s failed at line %d since %s", __func__, __LINE__, tstrerror(terrno)); return terrno; } void* buff = *pVal; int32_t tmp = taosEncodeFixedI64(&buff, *pKey); return TSDB_CODE_SUCCESS; } int32_t streamFileStateInit(int64_t memSize, uint32_t keySize, uint32_t rowSize, uint32_t selectRowSize, GetTsFun fp, void* pFile, TSKEY delMark, const char* taskId, int64_t checkpointId, int8_t type, SStreamFileState** ppFileState) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; if (memSize <= 0) { memSize = DEFAULT_MAX_STREAM_BUFFER_SIZE; } if (rowSize == 0) { code = TSDB_CODE_INVALID_PARA; QUERY_CHECK_CODE(code, lino, _end); } SStreamFileState* pFileState = taosMemoryCalloc(1, sizeof(SStreamFileState)); QUERY_CHECK_NULL(pFileState, code, lino, _end, terrno); rowSize += selectRowSize; pFileState->maxRowCount = TMAX((uint64_t)memSize / rowSize, FLUSH_NUM * 2); pFileState->usedBuffs = tdListNew(POINTER_BYTES); QUERY_CHECK_NULL(pFileState->usedBuffs, code, lino, _end, terrno); pFileState->freeBuffs = tdListNew(POINTER_BYTES); QUERY_CHECK_NULL(pFileState->freeBuffs, code, lino, _end, terrno); _hash_fn_t hashFn = taosGetDefaultHashFunction(TSDB_DATA_TYPE_BINARY); int32_t cap = TMIN(MIN_NUM_OF_ROW_BUFF, pFileState->maxRowCount); if (type == STREAM_STATE_BUFF_HASH || type == STREAM_STATE_BUFF_HASH_SEARCH) { pFileState->rowStateBuff = tSimpleHashInit(cap, hashFn); pFileState->stateBuffCleanupFn = stateHashBuffCleanupFn; pFileState->stateBuffRemoveFn = stateHashBuffRemoveFn; pFileState->stateBuffRemoveByPosFn = stateHashBuffRemoveByPosFn; pFileState->stateBuffCreateStateKeyFn = intervalCreateStateKey; pFileState->stateFileRemoveFn = intervalFileRemoveFn; pFileState->stateFileGetFn = intervalFileGetFn; pFileState->cfName = taosStrdup("state"); pFileState->stateFunctionGetFn = addRowBuffIfNotExist; } else if (type == STREAM_STATE_BUFF_SORT) { pFileState->rowStateBuff = tSimpleHashInit(cap, hashFn); pFileState->stateBuffCleanupFn = sessionWinStateCleanup; pFileState->stateBuffRemoveFn = deleteSessionWinStateBuffFn; pFileState->stateBuffRemoveByPosFn = deleteSessionWinStateBuffByPosFn; pFileState->stateBuffCreateStateKeyFn = sessionCreateStateKey; pFileState->stateFileRemoveFn = sessionFileRemoveFn; pFileState->stateFileGetFn = sessionFileGetFn; pFileState->cfName = taosStrdup("sess"); pFileState->stateFunctionGetFn = getSessionRowBuff; } else if (type == STREAM_STATE_BUFF_HASH_SORT) { pFileState->rowStateBuff = tSimpleHashInit(cap, hashFn); pFileState->searchBuff = tSimpleHashInit(MIN_NUM_SEARCH_BUCKET, hashFn); QUERY_CHECK_NULL(pFileState->searchBuff, code, lino, _end, terrno); pFileState->stateBuffCleanupFn = stateHashBuffCleanupFn; pFileState->stateBuffRemoveFn = stateHashBuffRemoveFn; pFileState->stateBuffRemoveByPosFn = stateHashBuffRemoveByPosFn; pFileState->stateBuffCreateStateKeyFn = defaultCreateStateKey; pFileState->stateFileRemoveFn = hashSortFileRemoveFn; pFileState->stateFileGetFn = hashSortFileGetFn; pFileState->cfName = taosStrdup("fill"); pFileState->stateFunctionGetFn = NULL; } QUERY_CHECK_NULL(pFileState->usedBuffs, code, lino, _end, terrno); QUERY_CHECK_NULL(pFileState->freeBuffs, code, lino, _end, terrno); QUERY_CHECK_NULL(pFileState->rowStateBuff, code, lino, _end, terrno); QUERY_CHECK_NULL(pFileState->cfName, code, lino, _end, terrno); if (type == STREAM_STATE_BUFF_HASH_SEARCH) { pFileState->searchBuff = tSimpleHashInit(MIN_NUM_SEARCH_BUCKET, hashFn); QUERY_CHECK_NULL(pFileState->searchBuff, code, lino, _end, terrno); } pFileState->keyLen = keySize; pFileState->rowSize = rowSize; pFileState->selectivityRowSize = selectRowSize; pFileState->preCheckPointVersion = 0; pFileState->checkPointVersion = 1; pFileState->pFileStore = pFile; pFileState->getTs = fp; pFileState->curRowCount = 0; pFileState->deleteMark = delMark; pFileState->flushMark = INT64_MIN; pFileState->maxTs = INT64_MIN; pFileState->id = taosStrdup(taskId); QUERY_CHECK_NULL(pFileState->id, code, lino, _end, terrno); pFileState->pGroupIdMap = tSimpleHashInit(1024, hashFn); QUERY_CHECK_NULL(pFileState->pGroupIdMap, code, lino, _end, terrno); pFileState->pRecFlagMap = tSimpleHashInit(1024, hashFn); QUERY_CHECK_NULL(pFileState->pRecFlagMap, code, lino, _end, terrno); pFileState->hasFillCatch = true; if (type == STREAM_STATE_BUFF_HASH || type == STREAM_STATE_BUFF_HASH_SEARCH) { code = recoverSnapshot(pFileState, checkpointId); } else if (type == STREAM_STATE_BUFF_SORT) { code = recoverSession(pFileState, checkpointId); } else if (type == STREAM_STATE_BUFF_HASH_SORT) { code = recoverFillSnapshot(pFileState, checkpointId); } QUERY_CHECK_CODE(code, lino, _end); void* valBuf = NULL; int32_t len = 0; int32_t tmpRes = streamDefaultGet_rocksdb(pFileState->pFileStore, STREAM_STATE_INFO_NAME, &valBuf, &len); if (tmpRes == TSDB_CODE_SUCCESS) { QUERY_CHECK_CONDITION((len == sizeof(TSKEY)), code, lino, _end, TSDB_CODE_QRY_EXECUTOR_INTERNAL_ERROR); streamFileStateDecode(&pFileState->flushMark, valBuf, len); qDebug("===stream===flushMark read:%" PRId64, pFileState->flushMark); } taosMemoryFreeClear(valBuf); (*ppFileState) = pFileState; _end: if (code != TSDB_CODE_SUCCESS) { streamFileStateDestroy(pFileState); qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } void destroyRowBuffPos(SRowBuffPos* pPos) { taosMemoryFreeClear(pPos->pKey); taosMemoryFreeClear(pPos->pRowBuff); taosMemoryFree(pPos); } void destroyRowBuffPosPtr(void* ptr) { if (!ptr) { return; } SRowBuffPos* pPos = *(SRowBuffPos**)ptr; if (!pPos->beUsed) { destroyRowBuffPos(pPos); } } void destroyRowBuffAllPosPtr(void* ptr) { if (!ptr) { return; } SRowBuffPos* pPos = *(SRowBuffPos**)ptr; destroyRowBuffPos(pPos); } void destroyRowBuff(void* ptr) { if (!ptr) { return; } taosMemoryFree(*(void**)ptr); } void streamFileStateDestroy(SStreamFileState* pFileState) { if (!pFileState) { return; } taosMemoryFree(pFileState->id); taosMemoryFree(pFileState->cfName); tdListFreeP(pFileState->usedBuffs, destroyRowBuffAllPosPtr); tdListFreeP(pFileState->freeBuffs, destroyRowBuff); pFileState->stateBuffCleanupFn(pFileState->rowStateBuff); sessionWinStateCleanup(pFileState->searchBuff); tSimpleHashCleanup(pFileState->pGroupIdMap); tSimpleHashCleanup(pFileState->pRecFlagMap); taosMemoryFree(pFileState); } int32_t putFreeBuff(SStreamFileState* pFileState, SRowBuffPos* pPos) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; if (pPos->pRowBuff) { code = tdListAppend(pFileState->freeBuffs, &(pPos->pRowBuff)); QUERY_CHECK_CODE(code, lino, _end); pPos->pRowBuff = NULL; } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } void clearExpiredRowBuff(SStreamFileState* pFileState, TSKEY ts, bool all) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SListIter iter = {0}; tdListInitIter(pFileState->usedBuffs, &iter, TD_LIST_FORWARD); SListNode* pNode = NULL; while ((pNode = tdListNext(&iter)) != NULL) { SRowBuffPos* pPos = *(SRowBuffPos**)(pNode->data); if (all || (pFileState->getTs(pPos->pKey) < ts && !pPos->beUsed)) { code = putFreeBuff(pFileState, pPos); QUERY_CHECK_CODE(code, lino, _end); if (!all) { pFileState->stateBuffRemoveByPosFn(pFileState, pPos); } destroyRowBuffPos(pPos); SListNode* tmp = tdListPopNode(pFileState->usedBuffs, pNode); taosMemoryFreeClear(tmp); } } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } } int32_t clearFlushedRowBuff(SStreamFileState* pFileState, SStreamSnapshot* pFlushList, uint64_t max, bool all) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; uint64_t i = 0; SListIter iter = {0}; tdListInitIter(pFileState->usedBuffs, &iter, TD_LIST_FORWARD); SListNode* pNode = NULL; while ((pNode = tdListNext(&iter)) != NULL && i < max) { SRowBuffPos* pPos = *(SRowBuffPos**)pNode->data; if (isFlushedState(pFileState, pFileState->getTs(pPos->pKey), 0)) { if (all || !pPos->beUsed) { if (all && !pPos->pRowBuff) { continue; } code = tdListAppend(pFlushList, &pPos); QUERY_CHECK_CODE(code, lino, _end); pFileState->flushMark = TMAX(pFileState->flushMark, pFileState->getTs(pPos->pKey)); pFileState->stateBuffRemoveByPosFn(pFileState, pPos); if (pPos->beUsed == false) { SListNode* tmp = tdListPopNode(pFileState->usedBuffs, pNode); taosMemoryFreeClear(tmp); } if (pPos->pRowBuff) { i++; } } } } qDebug("clear flushed row buff. %d rows to disk. is all:%d", listNEles(pFlushList), all); _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } void streamFileStateClear(SStreamFileState* pFileState) { pFileState->flushMark = INT64_MIN; pFileState->maxTs = INT64_MIN; tSimpleHashClear(pFileState->rowStateBuff); clearExpiredRowBuff(pFileState, 0, true); } bool needClearDiskBuff(SStreamFileState* pFileState) { return pFileState->flushMark > 0; } void streamFileStateReleaseBuff(SStreamFileState* pFileState, SRowBuffPos* pPos, bool used) { pPos->beUsed = used; } int32_t popUsedBuffs(SStreamFileState* pFileState, SStreamSnapshot* pFlushList, uint64_t max, bool used) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; uint64_t i = 0; SListIter iter = {0}; tdListInitIter(pFileState->usedBuffs, &iter, TD_LIST_FORWARD); SListNode* pNode = NULL; while ((pNode = tdListNext(&iter)) != NULL && i < max) { SRowBuffPos* pPos = *(SRowBuffPos**)pNode->data; if (pPos->beUsed == used) { if (used && !pPos->pRowBuff) { continue; } code = tdListAppend(pFlushList, &pPos); QUERY_CHECK_CODE(code, lino, _end); pFileState->flushMark = TMAX(pFileState->flushMark, pFileState->getTs(pPos->pKey)); pFileState->stateBuffRemoveByPosFn(pFileState, pPos); if (pPos->beUsed == false) { SListNode* tmp = tdListPopNode(pFileState->usedBuffs, pNode); taosMemoryFreeClear(tmp); } if (pPos->pRowBuff) { i++; } } } qInfo("%s stream state flush %d rows to disk. is used:%d", pFileState->id, listNEles(pFlushList), used); _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t flushRowBuff(SStreamFileState* pFileState) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SStreamSnapshot* pFlushList = tdListNew(POINTER_BYTES); if (!pFlushList) { code = TSDB_CODE_OUT_OF_MEMORY; QUERY_CHECK_CODE(code, lino, _end); } uint64_t num = (uint64_t)(pFileState->curRowCount * FLUSH_RATIO); num = TMAX(num, FLUSH_NUM); code = clearFlushedRowBuff(pFileState, pFlushList, num, false); QUERY_CHECK_CODE(code, lino, _end); if (isListEmpty(pFlushList)) { code = popUsedBuffs(pFileState, pFlushList, num, false); QUERY_CHECK_CODE(code, lino, _end); if (isListEmpty(pFlushList)) { code = popUsedBuffs(pFileState, pFlushList, num, true); QUERY_CHECK_CODE(code, lino, _end); } } if (pFileState->searchBuff) { code = clearFlushedRowBuff(pFileState, pFlushList, pFileState->curRowCount, true); QUERY_CHECK_CODE(code, lino, _end); } flushSnapshot(pFileState, pFlushList, false); SListIter fIter = {0}; tdListInitIter(pFlushList, &fIter, TD_LIST_FORWARD); SListNode* pNode = NULL; while ((pNode = tdListNext(&fIter)) != NULL) { SRowBuffPos* pPos = *(SRowBuffPos**)pNode->data; code = putFreeBuff(pFileState, pPos); QUERY_CHECK_CODE(code, lino, _end); } tdListFreeP(pFlushList, destroyRowBuffPosPtr); _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t clearRowBuff(SStreamFileState* pFileState) { if (pFileState->deleteMark != INT64_MAX) { clearExpiredRowBuff(pFileState, pFileState->maxTs - pFileState->deleteMark, false); } do { int32_t code = flushRowBuff(pFileState); if (code != TSDB_CODE_SUCCESS) { return code; } } while (isListEmpty(pFileState->freeBuffs) && pFileState->curRowCount == pFileState->maxRowCount); return TSDB_CODE_SUCCESS; } int32_t clearFlushedRowBuffByFlag(SStreamFileState* pFileState, uint64_t max) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; uint64_t i = 0; SListIter iter = {0}; tdListInitIter(pFileState->usedBuffs, &iter, TD_LIST_FORWARD); SListNode* pNode = NULL; while ((pNode = tdListNext(&iter)) != NULL && i < max) { SRowBuffPos* pPos = *(SRowBuffPos**)pNode->data; if (pPos->invalid) { if (!pPos->beUsed) { SListNode* tmp = tdListPopNode(pFileState->usedBuffs, pNode); taosMemoryFreeClear(tmp); if (pPos->pRowBuff) { i++; } code = putFreeBuff(pFileState, pPos); QUERY_CHECK_CODE(code, lino, _end); destroyRowBuffPos(pPos); } } } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t clearRowBuffNonFlush(SStreamFileState* pFileState) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; if (pFileState->deleteMark != INT64_MAX) { clearExpiredRowBuff(pFileState, pFileState->maxTs - pFileState->deleteMark, false); } uint64_t num = (uint64_t)(pFileState->curRowCount * FLUSH_RATIO); num = TMAX(num, FLUSH_NUM); code = clearFlushedRowBuffByFlag(pFileState, num); QUERY_CHECK_CODE(code, lino, _end); _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } void* getFreeBuff(SStreamFileState* pFileState) { SList* lists = pFileState->freeBuffs; int32_t buffSize = pFileState->rowSize; SListNode* pNode = tdListPopHead(lists); if (!pNode) { return NULL; } void* ptr = *(void**)pNode->data; memset(ptr, 0, buffSize); taosMemoryFree(pNode); return ptr; } void streamFileStateClearBuff(SStreamFileState* pFileState, SRowBuffPos* pPos) { if (pPos->pRowBuff) { memset(pPos->pRowBuff, 0, pFileState->rowSize); } } SRowBuffPos* getNewRowPos(SStreamFileState* pFileState) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SRowBuffPos* pPos = taosMemoryCalloc(1, sizeof(SRowBuffPos)); if (!pPos) { code = terrno; QUERY_CHECK_CODE(code, lino, _error); } pPos->pKey = taosMemoryCalloc(1, pFileState->keyLen); if (!pPos->pKey) { code = terrno; QUERY_CHECK_CODE(code, lino, _error); } void* pBuff = getFreeBuff(pFileState); if (pBuff) { pPos->pRowBuff = pBuff; goto _end; } if (pFileState->curRowCount < pFileState->maxRowCount) { pBuff = taosMemoryCalloc(1, pFileState->rowSize); QUERY_CHECK_NULL(pBuff, code, lino, _error, terrno); pPos->pRowBuff = pBuff; pFileState->curRowCount++; goto _end; } code = clearRowBuff(pFileState); QUERY_CHECK_CODE(code, lino, _error); pPos->pRowBuff = getFreeBuff(pFileState); _end: code = tdListAppend(pFileState->usedBuffs, &pPos); QUERY_CHECK_CODE(code, lino, _error); QUERY_CHECK_CONDITION((pPos->pRowBuff != NULL), code, lino, _error, TSDB_CODE_QRY_EXECUTOR_INTERNAL_ERROR); _error: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); return NULL; } return pPos; } SRowBuffPos* getNewRowPosForWrite(SStreamFileState* pFileState) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SRowBuffPos* newPos = getNewRowPos(pFileState); if (!newPos) { code = TSDB_CODE_OUT_OF_MEMORY; QUERY_CHECK_CODE(code, lino, _error); } newPos->beUsed = true; newPos->beFlushed = false; newPos->needFree = false; newPos->beUpdated = true; newPos->invalid = false; return newPos; _error: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return NULL; } int32_t addRowBuffIfNotExist(SStreamFileState* pFileState, void* pKey, int32_t keyLen, void** pVal, int32_t* pVLen, int32_t* pWinCode) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; (*pWinCode) = TSDB_CODE_SUCCESS; pFileState->maxTs = TMAX(pFileState->maxTs, pFileState->getTs(pKey)); SRowBuffPos** pos = tSimpleHashGet(pFileState->rowStateBuff, pKey, keyLen); if (pos) { if (pVal != NULL) { *pVLen = pFileState->rowSize; *pVal = *pos; (*pos)->beUsed = true; (*pos)->beFlushed = false; } goto _end; } SRowBuffPos* pNewPos = getNewRowPosForWrite(pFileState); if (!pNewPos || !pNewPos->pRowBuff) { code = TSDB_CODE_OUT_OF_MEMORY; QUERY_CHECK_CODE(code, lino, _end); } memcpy(pNewPos->pKey, pKey, keyLen); (*pWinCode) = TSDB_CODE_FAILED; TSKEY ts = pFileState->getTs(pKey); if (!isDeteled(pFileState, ts) && isFlushedState(pFileState, ts, 0)) { int32_t len = 0; void* p = NULL; (*pWinCode) = pFileState->stateFileGetFn(pFileState, pKey, &p, &len); qDebug("===stream===get %" PRId64 " from disc, res %d", ts, (*pWinCode)); if ((*pWinCode) == TSDB_CODE_SUCCESS) { memcpy(pNewPos->pRowBuff, p, len); } taosMemoryFree(p); } code = tSimpleHashPut(pFileState->rowStateBuff, pKey, keyLen, &pNewPos, POINTER_BYTES); QUERY_CHECK_CODE(code, lino, _end); if (pVal) { *pVLen = pFileState->rowSize; *pVal = pNewPos; } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t createRowBuff(SStreamFileState* pFileState, void* pKey, int32_t keyLen, void** pVal, int32_t* pVLen) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; pFileState->maxTs = TMAX(pFileState->maxTs, pFileState->getTs(pKey)); SRowBuffPos* pNewPos = getNewRowPosForWrite(pFileState); if (!pNewPos || !pNewPos->pRowBuff) { code = TSDB_CODE_OUT_OF_MEMORY; QUERY_CHECK_CODE(code, lino, _end); } memcpy(pNewPos->pKey, pKey, keyLen); code = tSimpleHashPut(pFileState->rowStateBuff, pKey, keyLen, &pNewPos, POINTER_BYTES); QUERY_CHECK_CODE(code, lino, _end); if (pVal) { *pVLen = pFileState->rowSize; *pVal = pNewPos; } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } void deleteRowBuff(SStreamFileState* pFileState, const void* pKey, int32_t keyLen) { int32_t code_buff = pFileState->stateBuffRemoveFn(pFileState->rowStateBuff, pKey, keyLen); qTrace("%s at line %d res:%d", __func__, __LINE__, code_buff); int32_t code_file = pFileState->stateFileRemoveFn(pFileState, pKey); qTrace("%s at line %d res:%d", __func__, __LINE__, code_file); if (pFileState->searchBuff != NULL) { deleteHashSortRowBuff(pFileState, pKey); } } void deleteRowBuffByGroupId(SStreamFileState* pFileState, uint64_t groupId) { SSHashObj* pRowMap = pFileState->rowStateBuff; void* pIte = NULL; int32_t iter = 0; while ((pIte = tSimpleHashIterate(pRowMap, pIte, &iter)) != NULL) { size_t keyLen = 0; SWinKey* pKey = tSimpleHashGetKey(pIte, &keyLen); if (pKey->groupId == groupId) { int32_t tmpRes = tSimpleHashIterateRemove(pRowMap, pKey, keyLen, &pIte, &iter); qTrace("%s at line %d res:%d", __func__, __LINE__, tmpRes); } } while (1) { SWinKey tmp = {.ts = INT64_MIN, .groupId = groupId}; SStreamStateCur* pCur = streamStateSeekKeyNext_rocksdb(pFileState->pFileStore, &tmp); SWinKey delKey = {.groupId = groupId}; int32_t code = streamStateGetGroupKVByCur_rocksdb(pFileState->pFileStore, pCur, &delKey, NULL, 0); if (code != TSDB_CODE_SUCCESS) { break; } code = streamStateDel_rocksdb(pFileState->pFileStore, &delKey); qTrace("%s at line %d res:%d", __func__, __LINE__, code); } } static int32_t recoverSessionRowBuff(SStreamFileState* pFileState, SRowBuffPos* pPos) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; int32_t len = 0; void* pBuff = NULL; code = pFileState->stateFileGetFn(pFileState, pPos->pKey, &pBuff, &len); QUERY_CHECK_CODE(code, lino, _end); memcpy(pPos->pRowBuff, pBuff, len); taosMemoryFree(pBuff); _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } static int32_t recoverStateRowBuff(SStreamFileState* pFileState, SRowBuffPos* pPos) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; pPos->pRowBuff = getFreeBuff(pFileState); if (!pPos->pRowBuff) { if (pFileState->curRowCount < pFileState->maxRowCount) { pPos->pRowBuff = taosMemoryCalloc(1, pFileState->rowSize); if (!pPos->pRowBuff) { code = terrno; QUERY_CHECK_CODE(code, lino, _end); } pFileState->curRowCount++; } else { code = clearRowBuff(pFileState); QUERY_CHECK_CODE(code, lino, _end); pPos->pRowBuff = getFreeBuff(pFileState); } QUERY_CHECK_CONDITION((pPos->pRowBuff != NULL), code, lino, _end, TSDB_CODE_QRY_EXECUTOR_INTERNAL_ERROR); } code = recoverSessionRowBuff(pFileState, pPos); QUERY_CHECK_CODE(code, lino, _end); _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t getRowBuffByPos(SStreamFileState* pFileState, SRowBuffPos* pPos, void** pVal) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; if (pPos->pRowBuff) { if (pPos->needFree) { code = recoverSessionRowBuff(pFileState, pPos); QUERY_CHECK_CODE(code, lino, _end); } (*pVal) = pPos->pRowBuff; goto _end; } code = recoverStateRowBuff(pFileState, pPos); QUERY_CHECK_CODE(code, lino, _end); (*pVal) = pPos->pRowBuff; // if (!pPos->needFree) { // code = tdListPrepend(pFileState->usedBuffs, &pPos); // QUERY_CHECK_CODE(code, lino, _end); // } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } bool hasRowBuff(SStreamFileState* pFileState, const SWinKey* pKey, bool hasLimit, bool* pIsLast) { bool res = false; if (pIsLast != NULL) { (*pIsLast) = false; } SRowBuffPos** pos = tSimpleHashGet(pFileState->rowStateBuff, pKey, sizeof(SWinKey)); if (pos) { res = true; } void* pSearchBuff = getSearchBuff(pFileState); if (pSearchBuff != NULL) { void** ppBuff = (void**)tSimpleHashGet(pSearchBuff, &pKey->groupId, sizeof(uint64_t)); if (ppBuff != NULL) { SArray* pWinStates = (SArray*)(*ppBuff); if (pIsLast != NULL) { SWinKey* pLastKey = (SWinKey*)taosArrayGetLast(pWinStates); *pIsLast = (winKeyCmprImpl(pKey, pLastKey) == 0); } if (hasLimit && taosArrayGetSize(pWinStates) <= MIN_NUM_OF_SORT_CACHE_WIN) { res = true; } if (qDebugFlag & DEBUG_DEBUG) { if (taosArrayGetSize(pWinStates) > 0) { SWinKey* fistKey = (SWinKey*)taosArrayGet(pWinStates, 0); qDebug("===stream===check window state. buff min ts:%" PRId64 ",groupId:%" PRIu64 ".key ts:%" PRId64 ",groupId:%" PRIu64, fistKey->ts, fistKey->groupId, pKey->ts, pKey->groupId); } } } else { res = true; } } return res; } SStreamSnapshot* getSnapshot(SStreamFileState* pFileState) { int64_t mark = (pFileState->deleteMark == INT64_MAX || pFileState->maxTs == INT64_MIN) ? INT64_MIN : pFileState->maxTs - pFileState->deleteMark; clearExpiredRowBuff(pFileState, mark, false); return pFileState->usedBuffs; } void flushSnapshot(SStreamFileState* pFileState, SStreamSnapshot* pSnapshot, bool flushState) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SListIter iter = {0}; tdListInitIter(pSnapshot, &iter, TD_LIST_FORWARD); int64_t st = taosGetTimestampMs(); SListNode* pNode = NULL; int idx = streamStateGetCfIdx(pFileState->pFileStore, pFileState->cfName); int32_t len = (pFileState->rowSize + sizeof(uint64_t) + sizeof(int32_t) + 64) * 2; char* buf = taosMemoryCalloc(1, len); if (!buf) { code = terrno; QUERY_CHECK_CODE(code, lino, _end); } void* batch = streamStateCreateBatch(); if (!batch) { code = TSDB_CODE_OUT_OF_MEMORY; QUERY_CHECK_CODE(code, lino, _end); } while ((pNode = tdListNext(&iter)) != NULL && code == TSDB_CODE_SUCCESS) { SRowBuffPos* pPos = *(SRowBuffPos**)pNode->data; if (pPos->beFlushed || !pPos->pRowBuff) { continue; } pPos->beFlushed = true; pFileState->flushMark = TMAX(pFileState->flushMark, pFileState->getTs(pPos->pKey)); qDebug("===stream===flushed start:%" PRId64, pFileState->getTs(pPos->pKey)); if (streamStateGetBatchSize(batch) >= BATCH_LIMIT) { code = streamStatePutBatch_rocksdb(pFileState->pFileStore, batch); streamStateClearBatch(batch); QUERY_CHECK_CODE(code, lino, _end); } void* pSKey = pFileState->stateBuffCreateStateKeyFn(pPos, ((SStreamState*)pFileState->pFileStore)->number); QUERY_CHECK_NULL(pSKey, code, lino, _end, terrno); code = streamStatePutBatchOptimize(pFileState->pFileStore, idx, batch, pSKey, pPos->pRowBuff, pFileState->rowSize, 0, buf); taosMemoryFreeClear(pSKey); QUERY_CHECK_CODE(code, lino, _end); // todo handle failure memset(buf, 0, len); } taosMemoryFreeClear(buf); int32_t numOfElems = streamStateGetBatchSize(batch); if (numOfElems > 0) { code = streamStatePutBatch_rocksdb(pFileState->pFileStore, batch); QUERY_CHECK_CODE(code, lino, _end); } else { goto _end; } streamStateClearBatch(batch); clearSearchBuff(pFileState); int64_t elapsed = taosGetTimestampMs() - st; qDebug("%s flush to disk in batch model completed, rows:%d, batch size:%d, elapsed time:%" PRId64 "ms", pFileState->id, numOfElems, BATCH_LIMIT, elapsed); if (flushState) { void* valBuf = NULL; int32_t len = 0; code = streamFileStateEncode(&pFileState->flushMark, &valBuf, &len); QUERY_CHECK_CODE(code, lino, _end); qDebug("===stream===flushMark write:%" PRId64, pFileState->flushMark); code = streamStatePutBatch(pFileState->pFileStore, "default", batch, STREAM_STATE_INFO_NAME, valBuf, len, 0); taosMemoryFree(valBuf); QUERY_CHECK_CODE(code, lino, _end); code = streamStatePutBatch_rocksdb(pFileState->pFileStore, batch); QUERY_CHECK_CODE(code, lino, _end); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } taosMemoryFree(buf); streamStateDestroyBatch(batch); } int32_t forceRemoveCheckpoint(SStreamFileState* pFileState, int64_t checkpointId) { char keyBuf[128] = {0}; TAOS_UNUSED(tsnprintf(keyBuf, sizeof(keyBuf), "%s:%" PRId64, TASK_KEY, checkpointId)); return streamDefaultDel_rocksdb(pFileState->pFileStore, keyBuf); } int32_t deleteExpiredCheckPoint(SStreamFileState* pFileState, TSKEY mark) { int32_t code = TSDB_CODE_SUCCESS; int64_t maxCheckPointId = 0; { char buf[128] = {0}; void* val = NULL; int32_t len = 0; memcpy(buf, TASK_KEY, strlen(TASK_KEY)); code = streamDefaultGet_rocksdb(pFileState->pFileStore, buf, &val, &len); if (code != 0 || len == 0 || val == NULL) { return TSDB_CODE_FAILED; } memcpy(buf, val, len); buf[len] = 0; maxCheckPointId = taosStr2Int64((char*)buf, NULL, 10); taosMemoryFree(val); } for (int64_t i = maxCheckPointId; i > 0; i--) { char buf[128] = {0}; void* val = 0; int32_t len = 0; TAOS_UNUSED(tsnprintf(buf, sizeof(buf), "%s:%" PRId64, TASK_KEY, i)); code = streamDefaultGet_rocksdb(pFileState->pFileStore, buf, &val, &len); if (code != 0) { return TSDB_CODE_FAILED; } memcpy(buf, val, len); buf[len] = 0; taosMemoryFree(val); TSKEY ts; ts = taosStr2Int64((char*)buf, NULL, 10); if (ts < mark) { // statekey winkey.ts < mark int32_t tmpRes = forceRemoveCheckpoint(pFileState, i); qTrace("%s at line %d res:%d", __func__, __LINE__, tmpRes); break; } } return code; } int32_t recoverSession(SStreamFileState* pFileState, int64_t ckId) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; int32_t winRes = TSDB_CODE_SUCCESS; if (pFileState->maxTs != INT64_MIN) { int64_t mark = (INT64_MIN + pFileState->deleteMark >= pFileState->maxTs) ? INT64_MIN : pFileState->maxTs - pFileState->deleteMark; int32_t tmpRes = deleteExpiredCheckPoint(pFileState, mark); qTrace("%s at line %d res:%d", __func__, __LINE__, tmpRes); } SStreamStateCur* pCur = streamStateSessionSeekToLast_rocksdb(pFileState->pFileStore, INT64_MAX); int32_t recoverNum = TMIN(MIN_NUM_OF_RECOVER_ROW_BUFF, pFileState->maxRowCount); while (winRes == TSDB_CODE_SUCCESS) { if (pFileState->curRowCount >= recoverNum) { break; } void* pVal = NULL; int32_t vlen = 0; SSessionKey key = {0}; winRes = streamStateSessionGetKVByCur_rocksdb(getStateFileStore(pFileState), pCur, &key, &pVal, &vlen); if (winRes != TSDB_CODE_SUCCESS) { break; } if (vlen != pFileState->rowSize) { code = TSDB_CODE_QRY_EXECUTOR_INTERNAL_ERROR; qError("[InternalERR] read key:[skey:%"PRId64 ",ekey:%"PRId64 ",groupId:%"PRIu64 "],vlen:%d, rowSize:%d", key.win.skey, key.win.ekey, key.groupId, vlen, pFileState->rowSize); QUERY_CHECK_CODE(code, lino, _end); } SRowBuffPos* pPos = createSessionWinBuff(pFileState, &key, pVal, &vlen); pPos->beUsed = false; winRes = putSessionWinResultBuff(pFileState, pPos); if (winRes != TSDB_CODE_SUCCESS) { break; } winRes = streamStateSessionCurPrev_rocksdb(pCur); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } streamStateFreeCur(pCur); return code; } int32_t recoverSnapshot(SStreamFileState* pFileState, int64_t ckId) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; int32_t winCode = TSDB_CODE_SUCCESS; if (pFileState->maxTs != INT64_MIN) { int64_t mark = (INT64_MIN + pFileState->deleteMark >= pFileState->maxTs) ? INT64_MIN : pFileState->maxTs - pFileState->deleteMark; int32_t tmpRes = deleteExpiredCheckPoint(pFileState, mark); qTrace("%s at line %d res:%d", __func__, __LINE__, tmpRes); } SStreamStateCur* pCur = streamStateSeekToLast_rocksdb(pFileState->pFileStore); int32_t recoverNum = TMIN(MIN_NUM_OF_RECOVER_ROW_BUFF, pFileState->maxRowCount); while (winCode == TSDB_CODE_SUCCESS) { if (pFileState->curRowCount >= recoverNum) { break; } void* pVal = NULL; int32_t vlen = 0; SRowBuffPos* pNewPos = getNewRowPosForWrite(pFileState); if (!pNewPos || !pNewPos->pRowBuff) { code = TSDB_CODE_OUT_OF_MEMORY; QUERY_CHECK_CODE(code, lino, _end); } winCode = streamStateGetKVByCur_rocksdb(getStateFileStore(pFileState), pCur, pNewPos->pKey, (const void**)&pVal, &vlen); qDebug("===stream=== get state by cur winres:%d. %s", winCode, __func__); if (winCode != TSDB_CODE_SUCCESS || pFileState->getTs(pNewPos->pKey) < pFileState->flushMark) { destroyRowBuffPos(pNewPos); SListNode* pNode = tdListPopTail(pFileState->usedBuffs); taosMemoryFreeClear(pNode); taosMemoryFreeClear(pVal); break; } if (vlen != pFileState->rowSize) { qError("row size mismatch, expect:%d, actual:%d", pFileState->rowSize, vlen); code = TSDB_CODE_QRY_EXECUTOR_INTERNAL_ERROR; taosMemoryFreeClear(pVal); QUERY_CHECK_CODE(code, lino, _end); } memcpy(pNewPos->pRowBuff, pVal, vlen); taosMemoryFreeClear(pVal); pNewPos->beFlushed = true; pNewPos->beUsed = false; qDebug("===stream=== read checkpoint state from disc. %s", __func__); code = tSimpleHashPut(pFileState->rowStateBuff, pNewPos->pKey, pFileState->keyLen, &pNewPos, POINTER_BYTES); if (code != TSDB_CODE_SUCCESS) { destroyRowBuffPos(pNewPos); break; } streamStateCurPrev_rocksdb(pCur); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } streamStateFreeCur(pCur); return code; } int32_t streamFileStateGetSelectRowSize(SStreamFileState* pFileState) { return pFileState->selectivityRowSize; } void streamFileStateReloadInfo(SStreamFileState* pFileState, TSKEY ts) { pFileState->flushMark = TMAX(pFileState->flushMark, ts); pFileState->maxTs = TMAX(pFileState->maxTs, ts); } void* getRowStateBuff(SStreamFileState* pFileState) { return pFileState->rowStateBuff; } void* getSearchBuff(SStreamFileState* pFileState) { return pFileState->searchBuff; } void* getStateFileStore(SStreamFileState* pFileState) { return pFileState->pFileStore; } bool isDeteled(SStreamFileState* pFileState, TSKEY ts) { return pFileState->deleteMark != INT64_MAX && pFileState->maxTs > 0 && ts < (pFileState->maxTs - pFileState->deleteMark); } bool isFlushedState(SStreamFileState* pFileState, TSKEY ts, TSKEY gap) { return ts <= (pFileState->flushMark + gap); } TSKEY getFlushMark(SStreamFileState* pFileState) { return pFileState->flushMark; }; int32_t getRowStateRowSize(SStreamFileState* pFileState) { return pFileState->rowSize; } int32_t getFunctionRowBuff(SStreamFileState* pFileState, void* pKey, int32_t keyLen, void** pVal, int32_t* pVLen) { int32_t winCode = TSDB_CODE_SUCCESS; return pFileState->stateFunctionGetFn(pFileState, pKey, keyLen, pVal, pVLen, &winCode); } int32_t recoverFillSnapshot(SStreamFileState* pFileState, int64_t ckId) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; if (pFileState->maxTs != INT64_MIN) { int64_t mark = (INT64_MIN + pFileState->deleteMark >= pFileState->maxTs) ? INT64_MIN : pFileState->maxTs - pFileState->deleteMark; code = deleteExpiredCheckPoint(pFileState, mark); QUERY_CHECK_CODE(code, lino, _end); } SStreamStateCur* pCur = streamStateFillSeekToLast_rocksdb(pFileState->pFileStore); if (pCur == NULL) { return code; } int32_t recoverNum = TMIN(MIN_NUM_OF_RECOVER_ROW_BUFF, pFileState->maxRowCount); int32_t winRes = TSDB_CODE_SUCCESS; while (winRes == TSDB_CODE_SUCCESS) { if (pFileState->curRowCount >= recoverNum) { break; } void* pVal = NULL; int32_t vlen = 0; SRowBuffPos* pNewPos = getNewRowPosForWrite(pFileState); winRes = streamStateFillGetKVByCur_rocksdb(pCur, pNewPos->pKey, (const void**)&pVal, &vlen); qDebug("===stream=== get state by cur winres:%d. %s", winRes, __func__); if (winRes != TSDB_CODE_SUCCESS || isFlushedState(pFileState, pFileState->getTs(pNewPos->pKey), 0)) { destroyRowBuffPos(pNewPos); SListNode* pNode = tdListPopTail(pFileState->usedBuffs); taosMemoryFreeClear(pNode); taosMemoryFreeClear(pVal); break; } if (vlen != pFileState->rowSize) { qError("row size mismatch, expect:%d, actual:%d", pFileState->rowSize, vlen); destroyRowBuffPos(pNewPos); code = TSDB_CODE_QRY_EXECUTOR_INTERNAL_ERROR; taosMemoryFreeClear(pVal); QUERY_CHECK_CODE(code, lino, _end); } memcpy(pNewPos->pRowBuff, pVal, vlen); taosMemoryFreeClear(pVal); pNewPos->beFlushed = true; pNewPos->beUsed = false; qDebug("===stream=== read checkpoint state from disc. %s", __func__); winRes = tSimpleHashPut(pFileState->rowStateBuff, pNewPos->pKey, pFileState->keyLen, &pNewPos, POINTER_BYTES); if (winRes != TSDB_CODE_SUCCESS) { destroyRowBuffPos(pNewPos); break; } streamStateCurPrev_rocksdb(pCur); } streamStateFreeCur(pCur); _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t getRowBuff(SStreamFileState* pFileState, void* pKey, int32_t keyLen, void** pVal, int32_t* pVLen, int32_t* pWinCode) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; (*pWinCode) = TSDB_CODE_FAILED; pFileState->maxTs = TMAX(pFileState->maxTs, pFileState->getTs(pKey)); SRowBuffPos** ppPos = tSimpleHashGet(pFileState->rowStateBuff, pKey, keyLen); if (ppPos) { *pVLen = pFileState->rowSize; *pVal = *ppPos; (*ppPos)->beUsed = true; (*ppPos)->beFlushed = false; (*pWinCode) = TSDB_CODE_SUCCESS; if ((*ppPos)->pRowBuff == NULL) { code = recoverStateRowBuff(pFileState, *ppPos); QUERY_CHECK_CODE(code, lino, _end); } goto _end; } TSKEY ts = pFileState->getTs(pKey); if (!isDeteled(pFileState, ts) && isFlushedState(pFileState, ts, 0)) { int32_t len = 0; void* p = NULL; (*pWinCode) = pFileState->stateFileGetFn(pFileState, pKey, &p, &len); qDebug("===stream===get %" PRId64 " from disc, res %d", ts, (*pWinCode)); if ((*pWinCode) == TSDB_CODE_SUCCESS) { SRowBuffPos* pNewPos = getNewRowPosForWrite(pFileState); if (!pNewPos || !pNewPos->pRowBuff) { code = TSDB_CODE_OUT_OF_MEMORY; QUERY_CHECK_CODE(code, lino, _end); } memcpy(pNewPos->pKey, pKey, keyLen); memcpy(pNewPos->pRowBuff, p, len); code = tSimpleHashPut(pFileState->rowStateBuff, pKey, keyLen, &pNewPos, POINTER_BYTES); QUERY_CHECK_CODE(code, lino, _end); if (pVal) { *pVLen = pFileState->rowSize; *pVal = pNewPos; } } taosMemoryFree(p); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t streamFileStateGroupPut(SStreamFileState* pFileState, int64_t groupId, void* value, int32_t vLen) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; if (value != NULL) { code = TSDB_CODE_INVALID_PARA; QUERY_CHECK_CODE(code, lino, _end); } if (tSimpleHashGet(pFileState->pGroupIdMap, &groupId, sizeof(int64_t)) == NULL) { if (tSimpleHashGetSize(pFileState->pGroupIdMap) <= MAX_GROUP_ID_NUM) { code = tSimpleHashPut(pFileState->pGroupIdMap, &groupId, sizeof(int64_t), NULL, 0); QUERY_CHECK_CODE(code, lino, _end); } code = streamStatePutParTag_rocksdb(pFileState->pFileStore, groupId, value, vLen); QUERY_CHECK_CODE(code, lino, _end); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } void streamFileStateGroupCurNext(SStreamStateCur* pCur) { SStreamFileState* pFileState = (SStreamFileState*)pCur->pStreamFileState; if (pCur->hashIter == -1) { streamStateCurNext(pFileState->pFileStore, pCur); return; } int64_t gpId = *(int64_t*)tSimpleHashGetKey(pCur->pHashData, NULL); pCur->minGpId = TMAX(pCur->minGpId, gpId); SSHashObj* pHash = pFileState->pGroupIdMap; pCur->pHashData = tSimpleHashIterate(pHash, pCur->pHashData, &pCur->hashIter); if (!pCur->pHashData) { pCur->hashIter = -1; streamStateParTagSeekKeyNext_rocksdb(pFileState->pFileStore, pCur->minGpId, pCur); return; } } int32_t streamFileStateGroupGetKVByCur(SStreamStateCur* pCur, int64_t* pKey, void** pVal, int32_t* pVLen) { int32_t code = TSDB_CODE_SUCCESS; if (pCur->pHashData) { *pKey = *(int64_t*)tSimpleHashGetKey(pCur->pHashData, NULL); return code; } return streamStateParTagGetKVByCur_rocksdb(pCur, pKey, NULL, NULL); } SSHashObj* getGroupIdCache(SStreamFileState* pFileState) { return pFileState->pGroupIdMap; } void clearExpiredState(SStreamFileState* pFileState, int32_t numOfKeep, TSKEY minTs) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SSHashObj* pSearchBuff = pFileState->searchBuff; void* pIte = NULL; int32_t iter = 0; while ((pIte = tSimpleHashIterate(pSearchBuff, pIte, &iter)) != NULL) { SArray* pWinStates = *((void**)pIte); int32_t arraySize = TARRAY_SIZE(pWinStates); if (minTs != INT64_MAX && arraySize > numOfKeep) { SWinKey key = {.ts = minTs}; key.groupId = *(uint64_t*)tSimpleHashGetKey(pIte, NULL); int32_t index = binarySearch(pWinStates, arraySize, &key, fillStateKeyCompare); numOfKeep = TMAX(arraySize - index, MIN_NUM_OF_SORT_CACHE_WIN); qDebug("modify numOfKeep, numOfKeep:%d. %s at line %d", numOfKeep, __func__, __LINE__); } int32_t size = arraySize - numOfKeep; for (int32_t i = 0; i < size; i++) { SWinKey* pKey = taosArrayGet(pWinStates, i); int32_t code_buff = pFileState->stateBuffRemoveFn(pFileState->rowStateBuff, pKey, sizeof(SWinKey)); qTrace("clear expired buff, ts:%" PRId64 ",groupid:%" PRIu64 ". %s at line %d res:%d", pKey->ts, pKey->groupId, __func__, __LINE__, code_buff); if (isFlushedState(pFileState, pKey->ts, 0)) { int32_t code_file = pFileState->stateFileRemoveFn(pFileState, pKey); qTrace("clear expired file, ts:%" PRId64 ". %s at line %d res:%d", pKey->ts, __func__, __LINE__, code_file); } if (tSimpleHashGetSize(pFileState->pRecFlagMap) > 0) { tSimpleHashRemove(pFileState->pRecFlagMap, pKey, sizeof(SWinKey)); } } taosArrayRemoveBatch(pWinStates, 0, size, NULL); } code = clearRowBuffNonFlush(pFileState); QUERY_CHECK_CODE(code, lino, _end); _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } } #ifdef BUILD_NO_CALL int32_t getStateSearchRowBuff(SStreamFileState* pFileState, const SWinKey* pKey, void** pVal, int32_t* pVLen, int32_t* pWinCode) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; code = addRowBuffIfNotExist(pFileState, (void*)pKey, sizeof(SWinKey), pVal, pVLen, pWinCode); QUERY_CHECK_CODE(code, lino, _end); SArray* pWinStates = NULL; SSHashObj* pSearchBuff = getSearchBuff(pFileState); void** ppBuff = tSimpleHashGet(pSearchBuff, &pKey->groupId, sizeof(uint64_t)); if (ppBuff) { pWinStates = (SArray*)(*ppBuff); } else { pWinStates = taosArrayInit(16, sizeof(SWinKey)); QUERY_CHECK_NULL(pWinStates, code, lino, _end, terrno); code = tSimpleHashPut(pSearchBuff, &pKey->groupId, sizeof(uint64_t), &pWinStates, POINTER_BYTES); QUERY_CHECK_CODE(code, lino, _end); } // recover if (taosArrayGetSize(pWinStates) == 0 && needClearDiskBuff(pFileState)) { recoverHashSortBuff(pFileState, pWinStates, pKey->groupId); } int32_t size = taosArrayGetSize(pWinStates); int32_t index = binarySearch(pWinStates, size, pKey, fillStateKeyCompare); if (!isFlushedState(pFileState, pKey->ts, 0) || index >= 0) { // find the first position which is smaller than the pKey if (index >= 0) { SWinKey* pTmpKey = taosArrayGet(pWinStates, index); if (winKeyCmprImpl(pTmpKey, pKey) == 0) { goto _end; } } index++; void* tmp = taosArrayInsert(pWinStates, index, pKey); QUERY_CHECK_NULL(tmp, code, lino, _end, terrno); } if (size >= MAX_NUM_OF_CACHE_WIN) { int32_t num = size - NUM_OF_CACHE_WIN; taosArrayRemoveBatch(pWinStates, 0, num, NULL); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } #endif int32_t getRowStatePrevRow(SStreamFileState* pFileState, const SWinKey* pKey, SWinKey* pResKey, void** ppVal, int32_t* pVLen, int32_t* pWinCode) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SArray* pWinStates = NULL; SSHashObj* pSearchBuff = getSearchBuff(pFileState); void* pState = getStateFileStore(pFileState); void** ppBuff = (void**)tSimpleHashGet(pSearchBuff, &pKey->groupId, sizeof(uint64_t)); if (ppBuff) { pWinStates = (SArray*)(*ppBuff); } else if (needClearDiskBuff(pFileState)) { qDebug("===stream=== search buff is empty.group id:%" PRId64, pKey->groupId); SStreamStateCur* pCur = streamStateSeekKeyPrev_rocksdb(pState, pKey); void* tmpVal = NULL; int32_t len = 0; (*pWinCode) = streamStateGetGroupKVByCur_rocksdb(pState, pCur, pResKey, (const void**)&tmpVal, &len); if ((*pWinCode) == TSDB_CODE_SUCCESS) { SRowBuffPos* pNewPos = getNewRowPosForWrite(pFileState); if (!pNewPos || !pNewPos->pRowBuff) { code = TSDB_CODE_OUT_OF_MEMORY; QUERY_CHECK_CODE(code, lino, _end); } memcpy(pNewPos->pRowBuff, tmpVal, len); taosMemoryFreeClear(tmpVal); *pVLen = getRowStateRowSize(pFileState); (*ppVal) = pNewPos; } streamStateFreeCur(pCur); return code; } else { (*pWinCode) = TSDB_CODE_FAILED; return code; } int32_t size = taosArrayGetSize(pWinStates); int32_t index = binarySearch(pWinStates, size, pKey, fillStateKeyCompare); if (index >= 0) { SWinKey* pCurKey = taosArrayGet(pWinStates, index); if (winKeyCmprImpl(pCurKey, pKey) == 0) { index--; } else { qDebug("%s failed at line %d since do not find cur SWinKey. trigger may be force window close", __func__, __LINE__); } } if (index == -1) { SStreamStateCur* pCur = streamStateSeekKeyPrev_rocksdb(pState, pKey); void* tmpVal = NULL; int32_t len = 0; (*pWinCode) = streamStateGetGroupKVByCur_rocksdb(pState, pCur, pResKey, (const void**)&tmpVal, &len); if ((*pWinCode) == TSDB_CODE_SUCCESS) { SRowBuffPos* pNewPos = getNewRowPosForWrite(pFileState); if (!pNewPos || !pNewPos->pRowBuff) { code = TSDB_CODE_OUT_OF_MEMORY; QUERY_CHECK_CODE(code, lino, _end); } memcpy(pNewPos->pRowBuff, tmpVal, len); taosMemoryFreeClear(tmpVal); *pVLen = getRowStateRowSize(pFileState); (*ppVal) = pNewPos; } streamStateFreeCur(pCur); return code; } else { SWinKey* pPrevKey = taosArrayGet(pWinStates, index); *pResKey = *pPrevKey; return addRowBuffIfNotExist(pFileState, (void*)pPrevKey, sizeof(SWinKey), ppVal, pVLen, pWinCode); } (*pWinCode) = TSDB_CODE_FAILED; _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t addSearchItem(SStreamFileState* pFileState, SArray* pWinStates, const SWinKey* pKey, bool* pIsEnd) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; int32_t size = taosArrayGetSize(pWinStates); int32_t index = binarySearch(pWinStates, size, pKey, fillTSKeyCompare); if (!isFlushedState(pFileState, pKey->ts, 0) || index >= 0 || size == 0) { if (index >= 0) { SWinKey* pTmpKey = taosArrayGet(pWinStates, index); if (winKeyCmprImpl(pTmpKey, pKey) == 0) { goto _end; } } index++; (*pIsEnd) = (index >= size); void* tmp = taosArrayInsert(pWinStates, index, pKey); QUERY_CHECK_NULL(tmp, code, lino, _end, terrno); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t addArrayBuffIfNotExist(SSHashObj* pSearchBuff, uint64_t groupId, SArray** ppResStates) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SArray* pWinStates = NULL; void** ppBuff = tSimpleHashGet(pSearchBuff, &groupId, sizeof(uint64_t)); if (ppBuff) { pWinStates = (SArray*)(*ppBuff); } else { pWinStates = taosArrayInit(16, sizeof(SWinKey)); QUERY_CHECK_NULL(pWinStates, code, lino, _end, terrno); code = tSimpleHashPut(pSearchBuff, &groupId, sizeof(uint64_t), &pWinStates, POINTER_BYTES); QUERY_CHECK_CODE(code, lino, _end); } (*ppResStates) = pWinStates; _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } static void setValueBuff(TSKEY ts, char* pVal, int32_t len, char* pBuff, int32_t buffLen) { SET_TSDATA_FLAG(pBuff, buffLen); if (len == 0) { *(TSKEY*)pBuff = ts; return; } memset(pBuff, 0, buffLen - 1); *(TSKEY*)pBuff = ts; memcpy(pBuff + sizeof(TSKEY), pVal, len); } int32_t getAndSetTsData(STableTsDataState* pTsDataState, uint64_t tableUid, TSKEY* pCurTs, void** ppCurPkVal, TSKEY lastTs, void* pLastPkVal, int32_t lastPkLen, int32_t* pWinCode) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; bool hasPk = (lastPkLen != 0); TSKEY* pDataVal = tSimpleHashGet(pTsDataState->pTableTsDataMap, &tableUid, sizeof(uint64_t)); if (pDataVal != NULL) { (*pWinCode) = TSDB_CODE_SUCCESS; *pCurTs = *pDataVal; if ((*pCurTs) < lastTs) { setValueBuff(lastTs, pLastPkVal, lastPkLen, (char*)pDataVal, pTsDataState->pkValLen); } else { if (hasPk) { (*ppCurPkVal) = POINTER_SHIFT(pDataVal, sizeof(TSKEY)); if ((*pCurTs) == lastTs && pTsDataState->comparePkColFn((*ppCurPkVal), pLastPkVal) < 0) { setValueBuff(lastTs, pLastPkVal, lastPkLen, (char*)pDataVal, pTsDataState->pkValLen); } } } } else { setValueBuff(lastTs, pLastPkVal, lastPkLen, pTsDataState->pPkValBuff, pTsDataState->pkValLen); int32_t size = tSimpleHashGetSize(pTsDataState->pTableTsDataMap); if (size < MAX_STATE_MAP_SIZE) { (*pWinCode) = TSDB_CODE_FAILED; code = tSimpleHashPut(pTsDataState->pTableTsDataMap, &tableUid, sizeof(uint64_t), pTsDataState->pPkValBuff, pTsDataState->pkValLen); QUERY_CHECK_CODE(code, lino, _end); } else { (*pWinCode) = streamStateGetParTag_rocksdb(pTsDataState->pState, tableUid, &pTsDataState->pPkValBuff, &pTsDataState->pkValLen); if ((*pWinCode) == TSDB_CODE_SUCCESS) { *pCurTs = *(TSKEY*)pTsDataState->pPkValBuff; if (hasPk) { (*ppCurPkVal) = POINTER_SHIFT(pTsDataState->pPkValBuff, sizeof(TSKEY)); } } int32_t tmpCode = streamStatePutParTag_rocksdb(pTsDataState->pState, tableUid, pTsDataState->pPkValBuff, pTsDataState->pkValLen); } } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t doTsDataCommit(STableTsDataState* pTsDataState) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; void* batch = NULL; char* pTempBuf = NULL; batch = streamStateCreateBatch(); QUERY_CHECK_NULL(batch, code, lino, _end, terrno); int idx = streamStateGetCfIdx(pTsDataState->pState, "partag"); int32_t len = (pTsDataState->pkValLen + sizeof(uint64_t) + sizeof(int32_t) + 64) * 2; pTempBuf = taosMemoryCalloc(1, len); QUERY_CHECK_NULL(pTempBuf, code, lino, _end, terrno); void* pIte = NULL; int32_t iter = 0; while ((pIte = tSimpleHashIterate(pTsDataState->pTableTsDataMap, pIte, &iter)) != NULL) { if (streamStateGetBatchSize(batch) >= BATCH_LIMIT) { code = streamStatePutBatch_rocksdb(pTsDataState->pState, batch); streamStateClearBatch(batch); QUERY_CHECK_CODE(code, lino, _end); } if (HAS_TSDATA_FLAG(pIte, pTsDataState->pkValLen)) { void* pKey = tSimpleHashGetKey(pIte, NULL); UNSET_TSDATA_FLAG(pIte, pTsDataState->pkValLen); code = streamStatePutBatchOptimize(pTsDataState->pState, idx, batch, pKey, pIte, pTsDataState->pkValLen, 0, pTempBuf); QUERY_CHECK_CODE(code, lino, _end); memset(pTempBuf, 0, len); qDebug("flush ts data,table id:%" PRIu64 , *(uint64_t*)pKey); } } int32_t numOfElems = streamStateGetBatchSize(batch); if (numOfElems > 0) { code = streamStatePutBatch_rocksdb(pTsDataState->pState, batch); QUERY_CHECK_CODE(code, lino, _end); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } taosMemoryFree(pTempBuf); streamStateDestroyBatch(batch); return code; } int32_t doRangeDataCommit(STableTsDataState* pTsDataState) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; void* batch = NULL; batch = streamStateCreateBatch(); QUERY_CHECK_NULL(batch, code, lino, _end, terrno); int idx = streamStateGetCfIdx(pTsDataState->pState, "sess"); int32_t len = (pTsDataState->pkValLen + sizeof(uint64_t) + sizeof(int32_t) + 64) * 2; int32_t size = taosArrayGetSize(pTsDataState->pScanRanges); for (int32_t i = 0; i < size; i++) { SScanRange* pRange = taosArrayGet(pTsDataState->pScanRanges, i); if (streamStateGetBatchSize(batch) >= BATCH_LIMIT) { code = streamStatePutBatch_rocksdb(pTsDataState->pState, batch); streamStateClearBatch(batch); QUERY_CHECK_CODE(code, lino, _end); } SSessionKey key = {.win = pRange->win, .groupId = 0}; int32_t uidSize = tSimpleHashGetSize(pRange->pUIds); int32_t gpIdSize = tSimpleHashGetSize(pRange->pGroupIds); int32_t size = uidSize + gpIdSize; uint64_t* pIdBuf = (uint64_t*)taosMemoryCalloc(1, size); void* pIte = NULL; int32_t iter = 0; int32_t i = 0; while ((pIte = tSimpleHashIterate(pTsDataState->pTableTsDataMap, pIte, &iter)) != NULL) { void* pTempKey = tSimpleHashGetKey(pIte, NULL); pIdBuf[i] = *(uint64_t*)pTempKey; i++; } code = streamStatePutBatchOptimize(pTsDataState->pState, idx, batch, &key, (void*)pIdBuf, size, 0, NULL); QUERY_CHECK_CODE(code, lino, _end); } int32_t numOfElems = streamStateGetBatchSize(batch); if (numOfElems > 0) { code = streamStatePutBatch_rocksdb(pTsDataState->pState, batch); QUERY_CHECK_CODE(code, lino, _end); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } streamStateDestroyBatch(batch); return code; } int32_t initTsDataState(STableTsDataState** ppTsDataState, int8_t pkType, int32_t pkLen, void* pState, void* pOtherState) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; STableTsDataState* pTsDataState = taosMemoryCalloc(1, sizeof(STableTsDataState)); QUERY_CHECK_NULL(pTsDataState, code, lino, _end, terrno); _hash_fn_t hashFn = taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT); pTsDataState->pTableTsDataMap = tSimpleHashInit(DEFAULT_STATE_MAP_CAPACITY, hashFn); QUERY_CHECK_NULL(pTsDataState->pTableTsDataMap, code, lino, _end, terrno); pTsDataState->pkValLen = sizeof(TSKEY) + pkLen + sizeof(char); pTsDataState->pPkValBuff = taosMemoryCalloc(1, pTsDataState->pkValLen); QUERY_CHECK_NULL(pTsDataState->pPkValBuff, code, lino, _end, terrno); if (pkLen != 0) { pTsDataState->comparePkColFn = getKeyComparFunc(pkType, TSDB_ORDER_ASC); } else { pTsDataState->comparePkColFn = NULL; } pTsDataState->pScanRanges = taosArrayInit(64, sizeof(SScanRange)); QUERY_CHECK_NULL(pTsDataState->pScanRanges, code, lino, _end, terrno); pTsDataState->pState = pState; pTsDataState->recValueLen = sizeof(SRecDataInfo) + pkLen; pTsDataState->pRecValueBuff = taosMemoryCalloc(1, pTsDataState->recValueLen); QUERY_CHECK_NULL(pTsDataState->pRecValueBuff, code, lino, _end, terrno); pTsDataState->curRecId = -1; pTsDataState->pStreamTaskState = pOtherState; pTsDataState->cfgIndex = streamStateGetCfIdx(pTsDataState->pState, "sess"); pTsDataState->pBatch = streamStateCreateBatch(); QUERY_CHECK_NULL(pTsDataState->pBatch, code, lino, _end, TSDB_CODE_FAILED); pTsDataState->batchBufflen = (pTsDataState->recValueLen + sizeof(uint64_t) + sizeof(int32_t) + 64) * 2; pTsDataState->pBatchBuff = taosMemoryCalloc(1, pTsDataState->batchBufflen); (*ppTsDataState) = pTsDataState; _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } static void destroyScanRange(SScanRange* pRange) { pRange->win.skey = INT64_MIN; pRange->win.ekey = INT64_MIN; tSimpleHashCleanup(pRange->pUIds); pRange->pUIds = NULL; tSimpleHashCleanup(pRange->pGroupIds); pRange->pGroupIds = NULL; } void destroyTsDataState(STableTsDataState* pTsDataState) { SArray* pScanRanges = pTsDataState->pScanRanges; int32_t size = taosArrayGetSize(pScanRanges); for (int32_t i = 0; i < size; i++) { SScanRange* pRange = taosArrayGet(pScanRanges, i); destroyScanRange(pRange); } taosArrayDestroy(pTsDataState->pScanRanges); tSimpleHashCleanup(pTsDataState->pTableTsDataMap); taosMemoryFreeClear(pTsDataState->pPkValBuff); taosMemoryFreeClear(pTsDataState->pState); taosMemoryFreeClear(pTsDataState->pRecValueBuff); pTsDataState->pStreamTaskState = NULL; streamStateClearBatch(pTsDataState->pBatch); streamStateDestroyBatch(pTsDataState->pBatch); pTsDataState->pBatch = NULL; taosMemoryFreeClear(pTsDataState->pBatchBuff); taosMemoryFreeClear(pTsDataState); } int32_t recoverTsData(STableTsDataState* pTsDataState) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SStreamStateCur* pCur = createStateCursor(NULL); streamStateParTagSeekKeyNext_rocksdb(pTsDataState->pState, INT64_MIN, pCur); while (1) { uint64_t tableUid = 0; void* pVal = NULL; int32_t len = 0; int32_t winCode = streamStateParTagGetKVByCur_rocksdb(pCur, &tableUid, &pVal, &len); if (winCode != TSDB_CODE_SUCCESS) { break; } if (pTsDataState->pkValLen != len) { taosMemoryFree(pVal); streamStateCurNext_rocksdb(pCur); continue; } UNSET_TSDATA_FLAG(pVal, len); code = tSimpleHashPut(pTsDataState->pTableTsDataMap, &tableUid, sizeof(uint64_t), pVal, len); taosMemoryFree(pVal); QUERY_CHECK_CODE(code, lino, _end); streamStateCurNext_rocksdb(pCur); } _end: streamStateFreeCur(pCur); if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } SStreamStateCur* getLastStateCur(SStreamFileState* pFileState, getStateBuffFn fn) { SStreamStateCur* pCur = createStateCursor(pFileState); if (pCur == NULL) { return NULL; } SSHashObj* pSearchBuff = fn(pFileState); pCur->buffIndex = 0; pCur->hashIter = 0; pCur->pHashData = NULL; pCur->pHashData = tSimpleHashIterate(pSearchBuff, pCur->pHashData, &pCur->hashIter); return pCur; } void moveLastStateCurNext(SStreamStateCur* pCur, getStateBuffFn fn) { SSHashObj* pSearchBuff = fn(pCur->pStreamFileState); pCur->pHashData = tSimpleHashIterate(pSearchBuff, pCur->pHashData, &pCur->hashIter); } int32_t getNLastStateKVByCur(SStreamStateCur* pCur, int32_t num, SArray* pRes) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SArray* pWinStates = NULL; int32_t size = 0; while(1) { if (pCur->pHashData == NULL) { return TSDB_CODE_FAILED; } pWinStates = *((void**)pCur->pHashData); size = taosArrayGetSize(pWinStates); if (size > 0) { break; } moveLastStateCurNext(pCur, getSearchBuff); } int32_t i = TMAX(size - num, 0); for ( ; i < size; i++) { SWinKey* pKey = taosArrayGet(pWinStates, i); int32_t len = 0; void* pVal = NULL; int32_t winCode = TSDB_CODE_SUCCESS; code = addRowBuffIfNotExist(pCur->pStreamFileState, (void*)pKey, sizeof(SWinKey), &pVal, &len, &winCode); QUERY_CHECK_CODE(code, lino, _end); if (winCode != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since window not exist. ts:%" PRId64 ",groupId:%" PRIu64, __func__, __LINE__, pKey->ts, pKey->groupId); } void* pTempRes = taosArrayPush(pRes, &pVal); QUERY_CHECK_NULL(pTempRes, code, lino, _end, terrno); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t reloadTsDataState(STableTsDataState* pTsDataState) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; STableTsDataState tmpState = *pTsDataState; _hash_fn_t hashFn = taosGetDefaultHashFunction(TSDB_DATA_TYPE_UBIGINT); tmpState.pTableTsDataMap = tSimpleHashInit(DEFAULT_STATE_MAP_CAPACITY, hashFn); QUERY_CHECK_NULL(tmpState.pTableTsDataMap, code, lino, _end, terrno); code = recoverTsData(&tmpState); QUERY_CHECK_CODE(code, lino, _end); void* pIte = NULL; int32_t iter = 0; while ((pIte = tSimpleHashIterate(pTsDataState->pTableTsDataMap, pIte, &iter)) != NULL) { size_t keyLen = 0; void* pKey = tSimpleHashGetKey(pIte, &keyLen); code = tSimpleHashPut(tmpState.pTableTsDataMap, pKey, keyLen, pIte, pTsDataState->pkValLen); QUERY_CHECK_CODE(code, lino, _end); } tSimpleHashCleanup(pTsDataState->pTableTsDataMap); pTsDataState->pTableTsDataMap = tmpState.pTableTsDataMap; _end: if (code != TSDB_CODE_SUCCESS) { tSimpleHashCleanup(tmpState.pTableTsDataMap); qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t saveRecInfoToDisk(STableTsDataState* pTsDataState, SSessionKey* pKey, SRecDataInfo* pVal, int32_t vLen) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SStateSessionKey stateKey = {.key = *pKey, .opNum = ((SStreamState*)pTsDataState->pState)->number}; code = streamStatePutBatchOptimize(pTsDataState->pState, pTsDataState->cfgIndex, pTsDataState->pBatch, &stateKey, pVal, vLen, 0, pTsDataState->pBatchBuff); QUERY_CHECK_CODE(code, lino, _end); memset(pTsDataState->pBatchBuff, 0, pTsDataState->batchBufflen); if (streamStateGetBatchSize(pTsDataState->pBatch) >= BATCH_LIMIT) { code = streamStatePutBatch_rocksdb(pTsDataState->pState, pTsDataState->pBatch); streamStateClearBatch(pTsDataState->pBatch); QUERY_CHECK_CODE(code, lino, _end); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t flushRemainRecInfoToDisk(STableTsDataState* pTsDataState) { int32_t code = streamStatePutBatch_rocksdb(pTsDataState->pState, pTsDataState->pBatch); streamStateClearBatch(pTsDataState->pBatch); return code; } int32_t recoverHashSortBuff(SStreamFileState* pFileState, SArray* pWinStates, uint64_t groupId) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SWinKey start = {.groupId = groupId, .ts = INT64_MAX}; void* pState = getStateFileStore(pFileState); SStreamStateCur* pCur = streamStateSeekKeyPrev_rocksdb(pState, &start); for (int32_t i = 0; i < NUM_OF_CACHE_WIN; i++) { SWinKey tmpKey = {.groupId = groupId}; int32_t tmpRes = streamStateGetGroupKVByCur_rocksdb(pState, pCur, &tmpKey, NULL, 0); if (tmpRes != TSDB_CODE_SUCCESS) { break; } void* tmp = taosArrayPush(pWinStates, &tmpKey); QUERY_CHECK_NULL(tmp, code, lino, _end, terrno); streamStateCurPrev_rocksdb(pCur); } taosArraySort(pWinStates, winKeyCmprImpl); streamStateFreeCur(pCur); _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t getRowStateAllPrevRow(SStreamFileState* pFileState, const SWinKey* pKey, SArray* pResArray, int32_t maxNum) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SWinKey* pPrevKey = NULL; SSHashObj* pSearchBuff = getSearchBuff(pFileState); void* pState = getStateFileStore(pFileState); void** ppBuff = (void**)tSimpleHashGet(pSearchBuff, &pKey->groupId, sizeof(uint64_t)); int32_t num = 0; if (ppBuff) { SArray* pWinStates = (SArray*)(*ppBuff); int32_t size = taosArrayGetSize(pWinStates); int32_t index = binarySearch(pWinStates, size, pKey, fillStateKeyCompare); for (; index >= 0 && num <= maxNum; index--) { pPrevKey = taosArrayGet(pWinStates, index); if (winKeyCmprImpl(pPrevKey, pKey) == 0) { continue; } void* pVal = NULL; int32_t len = 0; int32_t winCode = TSDB_CODE_SUCCESS; code = addRowBuffIfNotExist(pFileState, (void*)pPrevKey, sizeof(SWinKey), &pVal, &len, &winCode); QUERY_CHECK_CODE(code, lino, _end); void* tempRes = taosArrayPush(pResArray, &pVal); QUERY_CHECK_NULL(tempRes, code, lino, _end, terrno); num++; } } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } int32_t setStateRecFlag(SStreamFileState* pFileState, const void* pKey, int32_t keyLen, int32_t mode) { return tSimpleHashPut(pFileState->pRecFlagMap, pKey, keyLen, &mode, sizeof(int32_t)); } int32_t getStateRecFlag(SStreamFileState* pFileState, const void* pKey, int32_t keyLen, int32_t* pMode) { void* pVal = tSimpleHashGet(pFileState->pRecFlagMap, pKey, keyLen); if (pVal == NULL) { return TSDB_CODE_FAILED; } *pMode = *(int32_t*) pVal; return TSDB_CODE_SUCCESS; } void clearExpiredSessionState(SStreamFileState* pFileState, int32_t numOfKeep, TSKEY minTs, SSHashObj* pFlushGroup) { int32_t code = TSDB_CODE_SUCCESS; int32_t lino = 0; SSHashObj* pSessionBuff = pFileState->rowStateBuff; SStreamSnapshot* pFlushList = NULL; if (pFlushGroup != NULL) { pFlushList = tdListNew(POINTER_BYTES); } void* pIte = NULL; int32_t iter = 0; while ((pIte = tSimpleHashIterate(pSessionBuff, pIte, &iter)) != NULL) { SArray* pWinStates = *((void**)pIte); int32_t arraySize = TARRAY_SIZE(pWinStates); if (minTs != INT64_MAX && arraySize > numOfKeep) { SSessionKey key = {.win.skey = minTs, .win.ekey = minTs}; key.groupId = *(uint64_t*)tSimpleHashGetKey(pIte, NULL); int32_t index = binarySearch(pWinStates, arraySize, &key, fillStateKeyCompare); numOfKeep = TMAX(arraySize - index, MIN_NUM_OF_SORT_CACHE_WIN); qDebug("modify numOfKeep, numOfKeep:%d. %s at line %d", numOfKeep, __func__, __LINE__); } int32_t size = arraySize - numOfKeep; for (int32_t i = 0; i < size; i++) { SRowBuffPos* pPos = taosArrayGetP(pWinStates, i); SSessionKey* pKey = pPos->pKey; if (tSimpleHashGetSize(pFileState->pRecFlagMap) > 0) { tSimpleHashRemove(pFileState->pRecFlagMap, pKey, sizeof(SSessionKey)); } pPos->invalid = true; if (i == 0 && pFlushGroup != NULL) { void* pGpVal = tSimpleHashGet(pFlushGroup, &pKey->groupId, sizeof(uint64_t)); if (pGpVal == NULL) { code = tdListAppend(pFlushList, &pPos); QUERY_CHECK_CODE(code, lino, _end); code = tSimpleHashPut(pFlushGroup, &pKey->groupId, sizeof(uint64_t), NULL, 0); QUERY_CHECK_CODE(code, lino, _end); continue; } } pPos->beFlushed = true; qTrace("clear expired session buff, ts:%" PRId64 ",groupid:%" PRIu64 ". %s at line %d", pKey->win.skey, pKey->groupId, __func__, __LINE__); if (isFlushedState(pFileState, pKey->win.skey, 0)) { int32_t code_file = pFileState->stateFileRemoveFn(pFileState, pKey); qTrace("clear expired file, ts:%" PRId64 ". %s at line %d res:%d", pKey->win.skey, __func__, __LINE__, code_file); } } taosArrayRemoveBatch(pWinStates, 0, size, NULL); } if (pFlushList != NULL) { flushSnapshot(pFileState, pFlushList, false); code = clearRowBuffNonFlush(pFileState); QUERY_CHECK_CODE(code, lino, _end); tdListFreeP(pFlushList, destroyRowBuffPosPtr); } _end: if (code != TSDB_CODE_SUCCESS) { qError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } }