Summary: PointLockManager manages point locks per key. The old implementation partitions the per-key locks into 16 stripes. Each stripe handles the point locks for a subset of keys. Each stripe has only one condition variable. This condition variable is used by all the transactions that are waiting for their turn to acquire a lock on a key that belongs to this stripe. In production, we noticed that when there are multiple transactions trying to write to the same key, all of them wait on the same condition variable. When the previous lock holder releases the key, all of the transactions are woken up, but only one of them can proceed, and the rest go back to sleep. This wastes a lot of CPU cycles. In addition, when there are other keys being locked/unlocked on the same lock stripe, the problem becomes even worse. In order to solve this issue, we implemented a new PerKeyPointLockManager that keeps a transaction waiter queue at the per-key level. When a transaction cannot acquire a lock immediately, it joins the waiter queue of the key and waits on a dedicated condition variable. When the previous lock holder releases the lock, it wakes up the next set of transactions in the waiting queue that are eligible to acquire the lock. The queue respects FIFO order, except that it prioritizes lock upgrade/downgrade operations. However, this waiter queue change increases the deadlock detection cost, because the transactions waiting in the queue also need to be considered during deadlock detection. To resolve this issue, a new deadlock_timeout_us (microseconds) configuration is introduced in the transaction options. Essentially, when a transaction is waiting on a lock, it joins the wait queue and waits for the duration configured by deadlock_timeout_us without performing deadlock detection. If the transaction has not acquired the lock by the time deadlock_timeout_us elapses, it then performs deadlock detection and waits until lock_timeout is reached.
This optimization relies on the heuristic that the majority of transactions will be able to get the lock without performing deadlock detection. The deadlock_timeout_us configuration needs to be tuned for different workloads: if the likelihood of deadlock is very low, deadlock_timeout_us can be set a bit higher than the average transaction execution time, so that the majority of transactions are able to acquire the lock without performing deadlock detection. If the likelihood of deadlock is high, deadlock_timeout_us can be configured with a lower value, so that deadlocks get detected faster. The new PerKeyPointLockManager is disabled by default. It can be enabled by TransactionDBOptions.use_per_key_point_lock_mgr. The deadlock_timeout_us is only effective when PerKeyPointLockManager is used. When deadlock_timeout_us is set to 0, the transaction performs deadlock detection immediately before waiting. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13731 Test Plan: Unit test. Stress unit test that validates deadlock detection and exclusive, shared lock guarantee. A new point_lock_bench binary is created to help perform performance test. Reviewed By: pdillinger Differential Revision: D77353607 Pulled By: xingbowang fbshipit-source-id: 21cf93354f9a367a78c8666596ed14013ac7240b
103 lines
4.1 KiB
C++
103 lines
4.1 KiB
C++
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
//
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "utilities/transactions/lock/point/point_lock_manager_test.h"
|
|
#include "utilities/transactions/lock/point/point_lock_validation_test_runner.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
struct PointLockCorrectnessCheckTestParam {
|
|
bool is_per_key_point_lock_manager;
|
|
uint32_t thread_count;
|
|
uint32_t key_count;
|
|
uint32_t max_num_keys_to_lock_per_txn;
|
|
uint32_t execution_time_sec;
|
|
LockTypeToTest lock_type;
|
|
int64_t lock_timeout_us;
|
|
int64_t lock_expiration_us;
|
|
bool allow_non_deadlock_error;
|
|
// to simulate some useful work
|
|
uint32_t max_sleep_after_lock_acquisition_ms;
|
|
};
|
|
|
|
class PointLockCorrectnessCheckTest
|
|
: public PointLockManagerTest,
|
|
public testing::WithParamInterface<PointLockCorrectnessCheckTestParam> {
|
|
public:
|
|
void SetUp() override {
|
|
init();
|
|
auto const& param = GetParam();
|
|
auto per_key_lock_manager = param.is_per_key_point_lock_manager;
|
|
if (per_key_lock_manager) {
|
|
locker_ = std::make_shared<PerKeyPointLockManager>(
|
|
static_cast<PessimisticTransactionDB*>(db_), txndb_opt_);
|
|
} else {
|
|
locker_ = std::make_shared<PointLockManager>(
|
|
static_cast<PessimisticTransactionDB*>(db_), txndb_opt_);
|
|
}
|
|
|
|
txn_opt_.deadlock_detect = true;
|
|
txn_opt_.lock_timeout = param.lock_timeout_us;
|
|
txn_opt_.expiration = param.lock_expiration_us;
|
|
}
|
|
|
|
protected:
|
|
TransactionOptions txn_opt_;
|
|
};
|
|
|
|
// Builds a PointLockValidationTestRunner from the fixture state and the
// current test parameter, then runs it.
TEST_P(PointLockCorrectnessCheckTest, LockCorrectnessValidation) {
  const PointLockCorrectnessCheckTestParam& cfg = GetParam();

  // cfg.lock_type already has type LockTypeToTest, so it is passed directly.
  PointLockValidationTestRunner runner(
      env_, txndb_opt_, locker_, db_, txn_opt_, cfg.thread_count,
      cfg.key_count, cfg.max_num_keys_to_lock_per_txn, cfg.execution_time_sec,
      cfg.lock_type, cfg.allow_non_deadlock_error,
      cfg.max_sleep_after_lock_acquisition_ms);

  runner.run();
}
|
|
|
|
constexpr auto X_S_LOCK = LockTypeToTest::EXCLUSIVE_AND_SHARED;
|
|
constexpr auto X_LOCK = LockTypeToTest::EXCLUSIVE_ONLY;
|
|
constexpr auto S_LOCK = LockTypeToTest::SHARED_ONLY;
|
|
|
|
INSTANTIATE_TEST_CASE_P(
|
|
PointLockCorrectnessCheckTestSuite, PointLockCorrectnessCheckTest,
|
|
::testing::ValuesIn(std::vector<PointLockCorrectnessCheckTestParam>{
|
|
// 2 second timeout and no expiration simulates myrocks default
|
|
// configuration
|
|
{true, 16, 16, 8, 10, X_S_LOCK, 2000, -1, true, 0},
|
|
{false, 16, 16, 8, 10, X_S_LOCK, 2000, -1, true, 0},
|
|
{true, 16, 16, 8, 10, X_LOCK, 2000, -1, true, 0},
|
|
{false, 16, 16, 8, 10, X_LOCK, 2000, -1, true, 0},
|
|
{true, 16, 16, 8, 10, S_LOCK, 2000, -1, true, 0},
|
|
{false, 16, 16, 8, 10, S_LOCK, 2000, -1, true, 0},
|
|
// short timeout and expiration to test lock stealing
|
|
{true, 16, 16, 8, 10, X_S_LOCK, 10, 10, true, 10},
|
|
{false, 16, 16, 8, 10, X_S_LOCK, 10, 10, true, 10},
|
|
{true, 16, 16, 8, 10, X_LOCK, 10, 10, true, 10},
|
|
{false, 16, 16, 8, 10, X_LOCK, 10, 10, true, 10},
|
|
{true, 16, 16, 8, 10, S_LOCK, 10, 10, true, 10},
|
|
{false, 16, 16, 8, 10, S_LOCK, 10, 10, true, 10},
|
|
// long timeout and expiration to test deadlock detection without
|
|
// timeout
|
|
{true, 16, 16, 8, 10, X_S_LOCK, 100000, 100000, false, 0},
|
|
{false, 16, 16, 8, 10, X_S_LOCK, 100000, 100000, false, 0},
|
|
{true, 16, 16, 8, 10, X_LOCK, 100000, 100000, false, 0},
|
|
{false, 16, 16, 8, 10, X_LOCK, 100000, 100000, false, 0},
|
|
{true, 16, 16, 8, 10, S_LOCK, 100000, 100000, false, 0},
|
|
{false, 16, 16, 8, 10, S_LOCK, 100000, 100000, false, 0},
|
|
// Low lock contention
|
|
{true, 4, 1024 * 1024, 2, 10, S_LOCK, 100000, 100000, false, 0},
|
|
{false, 4, 1024 * 1024, 2, 10, S_LOCK, 100000, 100000, false, 0},
|
|
}));
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
int main(int argc, char** argv) {
|
|
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
|
|
::testing::InitGoogleTest(&argc, argv);
|
|
return RUN_ALL_TESTS();
|
|
}
|