Summary: Add file size validation in the ReadFooterFromFile function. Deprecate the skip_checking_sst_file_sizes_on_db_open option. This change addresses the issue https://github.com/facebook/rocksdb/issues/13619 by supporting file size validation in ReadFooterFromFile. In favor of this change, the CheckConsistency function and the skip_checking_sst_file_sizes_on_db_open flag are deprecated. The CheckConsistency function checks that each file size matches what was recorded in the manifest during DB open. Meanwhile, ReadFooterFromFile is called for each file in the LoadTables function. Since ReadFooterFromFile always validates the file size, CheckConsistency is redundant. In addition, CheckConsistency is executed in a single thread. This could slow down DB open when a network file system is used. Therefore, the flag skip_checking_sst_file_sizes_on_db_open was added to skip this check. After this change, ReadFooterFromFile is executed in parallel by multiple threads. Therefore, the concern about DB open slowness is eliminated, and the flag can be deprecated. When the paranoid check flag is set to true, a corrupted file will cause DB open to fail. When the paranoid check flag is set to false, the DB will still be able to open; the healthy files can be accessed, while the corrupted ones cannot. There are two slight concerns with this change. *If max_open_files is set to a smaller value, the engine will not open all the files during DB open. This means that a file size corruption will not be detected during DB open, but rather at a later time. Since the default is -1, which means open all the files, and it is rarely overridden and a lot of new features rely on it being -1, the risk is very low. *If FIFO compaction is used, the engine could fail to open the DB unnecessarily because of corrupted files that would never be used again. However, this is a very rare case as well. The error could still be ignored by setting paranoid_checks operationally. The risk is very low. To retain backward compatibility,
the public-facing flag was kept and marked as a no-op internally. Another change is required to fully remove the flag. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13676 Test Plan: make check A new unit test was added to validate that the file size check API works as expected. Reviewed By: pdillinger Differential Revision: D76168033 Pulled By: xingbowang fbshipit-source-id: 8ceacf39bcfe02ff7aa289868c341366ee9f3a8e
343 lines
12 KiB
C++
343 lines
12 KiB
C++
// Copyright (c) 2024-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#include "db/db_impl/db_impl_follower.h"
|
|
|
|
#include <algorithm>
|
|
#include <cinttypes>
|
|
|
|
#include "db/arena_wrapped_db_iter.h"
|
|
#include "db/merge_context.h"
|
|
#include "env/composite_env_wrapper.h"
|
|
#include "env/fs_on_demand.h"
|
|
#include "logging/auto_roll_logger.h"
|
|
#include "logging/logging.h"
|
|
#include "monitoring/perf_context_imp.h"
|
|
#include "rocksdb/configurable.h"
|
|
#include "rocksdb/db.h"
|
|
#include "util/cast_util.h"
|
|
#include "util/write_batch_util.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Builds a follower instance on top of DBImplSecondary.
//
// db_options: options forwarded to the DBImplSecondary base.
// env:        Env whose ownership is moved into env_guard_, so it stays
//             alive for as long as this object does.
// dbname:     follower DB directory, forwarded to the base class.
// src_path:   path of the leader DB; stored in src_path_.
//
// The catch-up thread is NOT started here; stop_requested_ starts out false
// and cv_ is bound to mu_ so that the thread can later be woken up.
DBImplFollower::DBImplFollower(const DBOptions& db_options,
                               std::unique_ptr<Env>&& env,
                               const std::string& dbname, std::string src_path)
    : DBImplSecondary(db_options, dbname, ""),
      env_guard_(std::move(env)),
      stop_requested_(false),
      src_path_(std::move(src_path)),
      cv_(&mu_) {
  // Record the open mode in the info log and flush it right away.
  ROCKS_LOG_INFO(immutable_db_options_.info_log,
                 "Opening the db in follower mode");
  LogFlush(immutable_db_options_.info_log);
}
|
|
|
|
// Closes the DB on destruction. A destructor cannot report a failure to the
// caller, so a non-OK close status is only written to the info log.
DBImplFollower::~DBImplFollower() {
  const Status close_status = Close();
  if (close_status.ok()) {
    return;
  }
  ROCKS_LOG_INFO(immutable_db_options_.info_log, "Error closing DB : %s",
                 close_status.ToString().c_str());
}
|
|
|
|
// Recover a follower DB instance by reading the MANIFEST. The verification
|
|
// as part of the MANIFEST replay will ensure that local links to the
|
|
// leader's files are created, thus ensuring we can continue reading them
|
|
// even if the leader deletes those files due to compaction.
|
|
// TODO:
|
|
// 1. Devise a mechanism to prevent misconfiguration by, for example,
|
|
// keeping a local copy of the IDENTITY file and cross checking
|
|
// 2. Make the recovery more robust by retrying if the first attempt
|
|
// fails.
|
|
Status DBImplFollower::Recover(
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
bool /*readonly*/, bool /*error_if_wal_file_exists*/,
|
|
bool /*error_if_data_exists_in_wals*/, bool /*is_retry*/, uint64_t*,
|
|
RecoveryContext* /*recovery_ctx*/, bool* /*can_retry*/) {
|
|
mutex_.AssertHeld();
|
|
|
|
JobContext job_context(0);
|
|
Status s;
|
|
s = static_cast<ReactiveVersionSet*>(versions_.get())
|
|
->Recover(column_families, &manifest_reader_, &manifest_reporter_,
|
|
&manifest_reader_status_);
|
|
if (!s.ok()) {
|
|
if (manifest_reader_status_) {
|
|
manifest_reader_status_->PermitUncheckedError();
|
|
}
|
|
return s;
|
|
}
|
|
if (s.ok()) {
|
|
default_cf_handle_ = new ColumnFamilyHandleImpl(
|
|
versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
|
|
default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
|
|
|
|
// Start the periodic catch-up thread
|
|
// TODO: See if it makes sense to have a threadpool, rather than a thread
|
|
// per follower DB instance
|
|
catch_up_thread_.reset(
|
|
new port::Thread(&DBImplFollower::PeriodicRefresh, this));
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
// Try to catch up by tailing the MANIFEST.
|
|
// TODO:
|
|
// 1. Cleanup obsolete files afterward
|
|
// 2. Add some error notifications and statistics
|
|
Status DBImplFollower::TryCatchUpWithLeader() {
|
|
assert(versions_.get() != nullptr);
|
|
assert(manifest_reader_.get() != nullptr);
|
|
Status s;
|
|
|
|
TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:Begin1");
|
|
TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:Begin2");
|
|
// read the manifest and apply new changes to the follower instance
|
|
std::unordered_set<ColumnFamilyData*> cfds_changed;
|
|
JobContext job_context(0, true /*create_superversion*/);
|
|
{
|
|
InstrumentedMutexLock lock_guard(&mutex_);
|
|
std::vector<std::string> files_to_delete;
|
|
s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
|
|
->ReadAndApply(&mutex_, &manifest_reader_,
|
|
manifest_reader_status_.get(), &cfds_changed,
|
|
&files_to_delete);
|
|
ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem_);
|
|
pending_outputs_inserted_elem_.reset(new std::list<uint64_t>::iterator(
|
|
CaptureCurrentFileNumberInPendingOutputs()));
|
|
|
|
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
|
|
static_cast<uint64_t>(versions_->LastSequence()));
|
|
ROCKS_LOG_INFO(
|
|
immutable_db_options_.info_log, "Next file number is %" PRIu64,
|
|
static_cast<uint64_t>(versions_->current_next_file_number()));
|
|
for (ColumnFamilyData* cfd : cfds_changed) {
|
|
if (cfd->IsDropped()) {
|
|
ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
|
|
cfd->GetName().c_str());
|
|
continue;
|
|
}
|
|
VersionStorageInfo::LevelSummaryStorage tmp;
|
|
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
|
|
"[%s] Level summary: %s\n", cfd->GetName().c_str(),
|
|
cfd->current()->storage_info()->LevelSummary(&tmp));
|
|
}
|
|
|
|
if (s.ok()) {
|
|
for (auto cfd : cfds_changed) {
|
|
if (cfd->mem()->GetEarliestSequenceNumber() <
|
|
versions_->LastSequence()) {
|
|
// Construct a new memtable with earliest sequence number set to the
|
|
// last sequence number in the VersionSet. This matters when
|
|
// DBImpl::MultiCFSnapshot tries to get consistent references
|
|
// to super versions in a lock free manner, it checks the earliest
|
|
// sequence number to detect if there was a change in version in
|
|
// the meantime.
|
|
MemTable* new_mem = cfd->ConstructNewMemtable(
|
|
cfd->GetLatestMutableCFOptions(), versions_->LastSequence());
|
|
cfd->mem()->SetNextLogNumber(cfd->GetLogNumber());
|
|
cfd->mem()->ConstructFragmentedRangeTombstones();
|
|
cfd->imm()->Add(cfd->mem(), &job_context.memtables_to_free);
|
|
new_mem->Ref();
|
|
cfd->SetMemtable(new_mem);
|
|
}
|
|
|
|
// This will check if the old memtable is still referenced
|
|
cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
|
|
&job_context.memtables_to_free);
|
|
auto& sv_context = job_context.superversion_contexts.back();
|
|
cfd->InstallSuperVersion(&sv_context, &mutex_);
|
|
sv_context.NewSuperVersion();
|
|
}
|
|
}
|
|
|
|
for (auto& file : files_to_delete) {
|
|
IOStatus io_s = fs_->DeleteFile(file, IOOptions(), nullptr);
|
|
if (!io_s.ok()) {
|
|
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
"Cannot delete file %s: %s", file.c_str(),
|
|
io_s.ToString().c_str());
|
|
}
|
|
}
|
|
}
|
|
job_context.Clean();
|
|
|
|
// Cleanup unused, obsolete files.
|
|
JobContext purge_files_job_context(0);
|
|
{
|
|
InstrumentedMutexLock lock_guard(&mutex_);
|
|
// Currently, follower instance does not create any database files, thus
|
|
// is unnecessary for the follower to force full scan.
|
|
FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
|
|
}
|
|
if (purge_files_job_context.HaveSomethingToDelete()) {
|
|
PurgeObsoleteFiles(purge_files_job_context);
|
|
}
|
|
purge_files_job_context.Clean();
|
|
|
|
TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:End");
|
|
|
|
return s;
|
|
}
|
|
|
|
void DBImplFollower::PeriodicRefresh() {
|
|
while (!stop_requested_.load()) {
|
|
MutexLock l(&mu_);
|
|
int64_t wait_until =
|
|
immutable_db_options_.clock->NowMicros() +
|
|
immutable_db_options_.follower_refresh_catchup_period_ms * 1000;
|
|
immutable_db_options_.clock->TimedWait(
|
|
&cv_, std::chrono::microseconds(wait_until));
|
|
if (stop_requested_.load()) {
|
|
break;
|
|
}
|
|
Status s;
|
|
for (uint64_t i = 0;
|
|
i < immutable_db_options_.follower_catchup_retry_count &&
|
|
!stop_requested_.load();
|
|
++i) {
|
|
s = TryCatchUpWithLeader();
|
|
|
|
if (s.ok()) {
|
|
ROCKS_LOG_INFO(immutable_db_options_.info_log,
|
|
"Successful catch up on attempt %llu",
|
|
static_cast<unsigned long long>(i));
|
|
break;
|
|
}
|
|
wait_until = immutable_db_options_.clock->NowMicros() +
|
|
immutable_db_options_.follower_catchup_retry_wait_ms * 1000;
|
|
immutable_db_options_.clock->TimedWait(
|
|
&cv_, std::chrono::microseconds(wait_until));
|
|
}
|
|
if (!s.ok()) {
|
|
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Catch up unsuccessful");
|
|
}
|
|
}
|
|
}
|
|
|
|
// Shuts the follower down: stops and joins the catch-up thread if one was
// started, releases the captured pending-outputs entry, and then delegates
// to DBImpl::Close(), whose status is returned.
Status DBImplFollower::Close() {
  if (catch_up_thread_ != nullptr) {
    // Raise the stop flag first, then wake the thread while holding mu_ so
    // that a thread blocked on cv_ notices the request.
    stop_requested_.store(true);
    {
      MutexLock wake_lock(&mu_);
      cv_.SignalAll();
    }
    catch_up_thread_->join();
    // Dropping the thread object makes a repeated Close() skip this branch.
    catch_up_thread_.reset();
  }

  ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem_);

  return DBImpl::Close();
}
|
|
|
|
// Convenience overload that opens a follower with only the default column
// family.
//
// options:     split into DBOptions and ColumnFamilyOptions for the call to
//              the multi-column-family overload.
// dbname:      follower DB directory.
// leader_path: path of the leader DB.
// dbptr:       reset on entry; receives the opened DB on success.
//
// Returns the status of the multi-column-family overload.
Status DB::OpenAsFollower(const Options& options, const std::string& dbname,
                          const std::string& leader_path,
                          std::unique_ptr<DB>* dbptr) {
  dbptr->reset();

  const DBOptions db_opts(options);
  std::vector<ColumnFamilyDescriptor> default_cf_only;
  default_cf_only.emplace_back(kDefaultColumnFamilyName,
                               ColumnFamilyOptions(options));
  std::vector<ColumnFamilyHandle*> cf_handles;

  Status s = DB::OpenAsFollower(db_opts, dbname, leader_path, default_cf_only,
                                &cf_handles, dbptr);
  if (!s.ok()) {
    return s;
  }

  // Exactly one handle (for the default column family) comes back. It is
  // not handed to the caller, so it is freed here.
  assert(cf_handles.size() == 1);
  delete cf_handles[0];
  return s;
}
|
|
|
|
// Opens `dbname` as a follower of the leader DB located at `src_path`.
//
// db_options:      base options; a copy is made with `env` replaced by a
//                  wrapper around an on-demand file system built from the
//                  original file system, src_path and dbname.
// column_families: column families to open; each must exist after recovery,
//                  otherwise InvalidArgument is returned.
// handles:         cleared, then filled with one handle per entry of
//                  column_families (in the same order) on success. On
//                  failure any handles created so far are deleted and the
//                  vector is left empty.
// dbptr:           reset on entry; owns the new DB on success.
//
// Returns OK on success, otherwise the first error from the directory check,
// logger creation, recovery or column family lookup.
Status DB::OpenAsFollower(
    const DBOptions& db_options, const std::string& dbname,
    const std::string& src_path,
    const std::vector<ColumnFamilyDescriptor>& column_families,
    std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr) {
  dbptr->reset();

  // Make sure the follower directory exists (creating it when permitted).
  FileSystem* base_fs = db_options.env->GetFileSystem().get();
  {
    IOStatus dir_status =
        db_options.create_if_missing
            ? base_fs->CreateDirIfMissing(dbname, IOOptions(), nullptr)
            : base_fs->FileExists(dbname, IOOptions(), nullptr);
    if (!dir_status.ok()) {
      return static_cast<Status>(dir_status);
    }
  }

  // Env used by the follower; ownership is handed to the DBImplFollower
  // further down.
  std::unique_ptr<Env> follower_env(new CompositeEnvWrapper(
      db_options.env, NewOnDemandFileSystem(db_options.env->GetFileSystem(),
                                            src_path, dbname)));

  DBOptions follower_opts(db_options);
  Status status;
  follower_opts.env = follower_env.get();
  if (follower_opts.info_log == nullptr) {
    status = CreateLoggerFromOptions(dbname, follower_opts,
                                     &follower_opts.info_log);
    if (!status.ok()) {
      follower_opts.info_log = nullptr;
      return status;
    }
  }

  handles->clear();
  DBImplFollower* follower = new DBImplFollower(
      follower_opts, std::move(follower_env), dbname, src_path);
  follower->versions_.reset(new ReactiveVersionSet(
      dbname, &follower->immutable_db_options_, follower->file_options_,
      follower->table_cache_.get(), follower->write_buffer_manager_,
      &follower->write_controller_, follower->io_tracer_));
  follower->column_family_memtables_.reset(new ColumnFamilyMemTablesImpl(
      follower->versions_->GetColumnFamilySet()));
  follower->wal_in_db_path_ =
      follower->immutable_db_options_.IsWalDirSameAsDBPath();

  // Recovery, handle creation and super version installation all happen
  // under the DB mutex.
  follower->mutex_.Lock();
  status = follower->Recover(column_families, /*read_only=*/true,
                             /*error_if_wal_file_exists=*/false,
                             /*error_if_data_exists_in_wals=*/false);
  if (status.ok()) {
    for (const auto& descriptor : column_families) {
      ColumnFamilyData* cfd =
          follower->versions_->GetColumnFamilySet()->GetColumnFamily(
              descriptor.name);
      if (cfd == nullptr) {
        status =
            Status::InvalidArgument("Column family not found", descriptor.name);
        break;
      }
      handles->push_back(
          new ColumnFamilyHandleImpl(cfd, follower, &follower->mutex_));
    }
  }
  SuperVersionContext sv_context(false /* create_superversion */);
  if (status.ok()) {
    for (ColumnFamilyData* cfd : *follower->versions_->GetColumnFamilySet()) {
      sv_context.NewSuperVersion();
      cfd->InstallSuperVersion(&sv_context, &follower->mutex_);
    }
  }
  follower->mutex_.Unlock();
  sv_context.Clean();

  if (!status.ok()) {
    // Undo everything: free the handles created so far, then the DB itself.
    for (ColumnFamilyHandle* handle : *handles) {
      delete handle;
    }
    handles->clear();
    delete follower;
    return status;
  }

  dbptr->reset(follower);
  for (ColumnFamilyHandle* handle : *handles) {
    follower->NewThreadStatusCfInfo(
        static_cast_with_check<ColumnFamilyHandleImpl>(handle)->cfd());
  }
  return status;
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|