rocksdb/db/db_impl/db_impl_follower.cc
Xingbo Wang 11a259a5f0 Support GetFileSize API in FSRandomAccessFile (#13676)
Summary:
Add file size validation in the ReadFooterFromFile function.
    Deprecate the skip_checking_sst_file_sizes_on_db_open option.
    This change addresses
    https://github.com/facebook/rocksdb/issues/13619
    by validating the file size in ReadFooterFromFile. With this change, the
    CheckConsistency function and the skip_checking_sst_file_sizes_on_db_open
    flag are deprecated.

    The CheckConsistency function checks, during DB open, that each file's size
    matches what was recorded in the MANIFEST. Meanwhile, ReadFooterFromFile is
    already called for each file in the LoadTables function. Since
    ReadFooterFromFile now always validates the file size, CheckConsistency is
    redundant.

    In addition, CheckConsistency runs in a single thread, which can slow down
    DB open when a network file system is used; the
    skip_checking_sst_file_sizes_on_db_open flag was added to skip this check.
    After this change, ReadFooterFromFile is executed in parallel across
    multiple threads, so the concern about DB open slowness is eliminated and
    the flag can be deprecated.

    When paranoid_checks is set to true, a corrupted file causes DB open to
    fail. When paranoid_checks is set to false, the DB can still be opened: the
    healthy files can be accessed, while the corrupted ones cannot.
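
    For example, a minimal sketch of the second case (the DB path is
    hypothetical and assumes an already existing DB):

        #include "rocksdb/db.h"

        rocksdb::Options options;
        options.paranoid_checks = false;  // tolerate corrupted SST files at open
        rocksdb::DB* db = nullptr;
        rocksdb::Status s = rocksdb::DB::Open(options, "/path/to/existing_db", &db);
        // Open succeeds even if some SST files are corrupted; healthy files stay
        // readable, while reads that touch a corrupted file return an error.
        delete db;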

    There are two slight concerns with this change.

    * If max_open_files is set to a smaller value, the engine will not open all
    the files during DB open. This means a file size corruption will not be
    detected during DB open, but rather at a later time. Since the default is
    -1 (open all files), the option is rarely overridden, and many new features
    rely on it being -1, the risk is very low.

    * If FIFO compaction is used, the engine could unnecessarily fail to open
    the DB on corrupted files that would never be used again. However, this is
    also a very rare case, and the error can still be ignored operationally by
    disabling paranoid_checks. The risk is very low.

    To maintain backward compatibility, the public-facing flag was kept and
    marked as a no-op internally. Another change is required to fully remove
    the flag.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/13676

Test Plan:
make check
    A new unit test was added to validate that the file size check API works as
    expected.

Reviewed By: pdillinger

Differential Revision: D76168033

Pulled By: xingbowang

fbshipit-source-id: 8ceacf39bcfe02ff7aa289868c341366ee9f3a8e
2025-07-09 10:40:28 -07:00


// Copyright (c) 2024-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/db_impl/db_impl_follower.h"
#include <algorithm>
#include <cinttypes>
#include "db/arena_wrapped_db_iter.h"
#include "db/merge_context.h"
#include "env/composite_env_wrapper.h"
#include "env/fs_on_demand.h"
#include "logging/auto_roll_logger.h"
#include "logging/logging.h"
#include "monitoring/perf_context_imp.h"
#include "rocksdb/configurable.h"
#include "rocksdb/db.h"
#include "util/cast_util.h"
#include "util/write_batch_util.h"
namespace ROCKSDB_NAMESPACE {
DBImplFollower::DBImplFollower(const DBOptions& db_options,
std::unique_ptr<Env>&& env,
const std::string& dbname, std::string src_path)
: DBImplSecondary(db_options, dbname, ""),
env_guard_(std::move(env)),
stop_requested_(false),
src_path_(std::move(src_path)),
cv_(&mu_) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Opening the db in follower mode");
LogFlush(immutable_db_options_.info_log);
}
DBImplFollower::~DBImplFollower() {
Status s = Close();
if (!s.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Error closing DB : %s",
s.ToString().c_str());
}
}
// Recover a follower DB instance by reading the MANIFEST. The verification
// as part of the MANIFEST replay will ensure that local links to the
// leader's files are created, thus ensuring we can continue reading them
// even if the leader deletes those files due to compaction.
// TODO:
// 1. Devise a mechanism to prevent misconfiguration by, for example,
// keeping a local copy of the IDENTITY file and cross checking
// 2. Make the recovery more robust by retrying if the first attempt
// fails.
Status DBImplFollower::Recover(
const std::vector<ColumnFamilyDescriptor>& column_families,
bool /*readonly*/, bool /*error_if_wal_file_exists*/,
bool /*error_if_data_exists_in_wals*/, bool /*is_retry*/, uint64_t*,
RecoveryContext* /*recovery_ctx*/, bool* /*can_retry*/) {
mutex_.AssertHeld();
JobContext job_context(0);
Status s;
s = static_cast<ReactiveVersionSet*>(versions_.get())
->Recover(column_families, &manifest_reader_, &manifest_reporter_,
&manifest_reader_status_);
if (!s.ok()) {
if (manifest_reader_status_) {
manifest_reader_status_->PermitUncheckedError();
}
return s;
}
if (s.ok()) {
default_cf_handle_ = new ColumnFamilyHandleImpl(
versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
// Start the periodic catch-up thread
// TODO: See if it makes sense to have a threadpool, rather than a thread
// per follower DB instance
catch_up_thread_.reset(
new port::Thread(&DBImplFollower::PeriodicRefresh, this));
}
return s;
}
// Try to catch up by tailing the MANIFEST.
// TODO:
// 1. Cleanup obsolete files afterward
// 2. Add some error notifications and statistics
Status DBImplFollower::TryCatchUpWithLeader() {
assert(versions_.get() != nullptr);
assert(manifest_reader_.get() != nullptr);
Status s;
TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:Begin1");
TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:Begin2");
// read the manifest and apply new changes to the follower instance
std::unordered_set<ColumnFamilyData*> cfds_changed;
JobContext job_context(0, true /*create_superversion*/);
{
InstrumentedMutexLock lock_guard(&mutex_);
std::vector<std::string> files_to_delete;
s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
->ReadAndApply(&mutex_, &manifest_reader_,
manifest_reader_status_.get(), &cfds_changed,
&files_to_delete);
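// Refresh the pending-outputs marker: release the file number captured during
// the previous catch-up and capture the current one, so that files appearing
// after this point are not treated as obsolete by FindObsoleteFiles.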
ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem_);
pending_outputs_inserted_elem_.reset(new std::list<uint64_t>::iterator(
CaptureCurrentFileNumberInPendingOutputs()));
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
static_cast<uint64_t>(versions_->LastSequence()));
ROCKS_LOG_INFO(
immutable_db_options_.info_log, "Next file number is %" PRIu64,
static_cast<uint64_t>(versions_->current_next_file_number()));
for (ColumnFamilyData* cfd : cfds_changed) {
if (cfd->IsDropped()) {
ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
cfd->GetName().c_str());
continue;
}
VersionStorageInfo::LevelSummaryStorage tmp;
ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
"[%s] Level summary: %s\n", cfd->GetName().c_str(),
cfd->current()->storage_info()->LevelSummary(&tmp));
}
if (s.ok()) {
for (auto cfd : cfds_changed) {
if (cfd->mem()->GetEarliestSequenceNumber() <
versions_->LastSequence()) {
// Construct a new memtable with earliest sequence number set to the
// last sequence number in the VersionSet. This matters when
// DBImpl::MultiCFSnapshot tries to get consistent references
// to super versions in a lock free manner, it checks the earliest
// sequence number to detect if there was a change in version in
// the meantime.
MemTable* new_mem = cfd->ConstructNewMemtable(
cfd->GetLatestMutableCFOptions(), versions_->LastSequence());
cfd->mem()->SetNextLogNumber(cfd->GetLogNumber());
cfd->mem()->ConstructFragmentedRangeTombstones();
cfd->imm()->Add(cfd->mem(), &job_context.memtables_to_free);
new_mem->Ref();
cfd->SetMemtable(new_mem);
}
// This will check if the old memtable is still referenced
cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
&job_context.memtables_to_free);
auto& sv_context = job_context.superversion_contexts.back();
cfd->InstallSuperVersion(&sv_context, &mutex_);
sv_context.NewSuperVersion();
}
}
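// Best-effort deletion of files that ReadAndApply reported as no longer
// needed; failures are only logged.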
for (auto& file : files_to_delete) {
IOStatus io_s = fs_->DeleteFile(file, IOOptions(), nullptr);
if (!io_s.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Cannot delete file %s: %s", file.c_str(),
io_s.ToString().c_str());
}
}
}
job_context.Clean();
// Cleanup unused, obsolete files.
JobContext purge_files_job_context(0);
{
InstrumentedMutexLock lock_guard(&mutex_);
// Currently, follower instance does not create any database files, thus
// is unnecessary for the follower to force full scan.
FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
}
if (purge_files_job_context.HaveSomethingToDelete()) {
PurgeObsoleteFiles(purge_files_job_context);
}
purge_files_job_context.Clean();
TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:End");
return s;
}
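// Body of the background catch-up thread. Wakes up every
// follower_refresh_catchup_period_ms and attempts to catch up with the leader,
// retrying up to follower_catchup_retry_count times with
// follower_catchup_retry_wait_ms between attempts, until a stop is requested.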
void DBImplFollower::PeriodicRefresh() {
while (!stop_requested_.load()) {
MutexLock l(&mu_);
int64_t wait_until =
immutable_db_options_.clock->NowMicros() +
immutable_db_options_.follower_refresh_catchup_period_ms * 1000;
immutable_db_options_.clock->TimedWait(
&cv_, std::chrono::microseconds(wait_until));
if (stop_requested_.load()) {
break;
}
Status s;
for (uint64_t i = 0;
i < immutable_db_options_.follower_catchup_retry_count &&
!stop_requested_.load();
++i) {
s = TryCatchUpWithLeader();
if (s.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Successful catch up on attempt %llu",
static_cast<unsigned long long>(i));
break;
}
wait_until = immutable_db_options_.clock->NowMicros() +
immutable_db_options_.follower_catchup_retry_wait_ms * 1000;
immutable_db_options_.clock->TimedWait(
&cv_, std::chrono::microseconds(wait_until));
}
if (!s.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Catch up unsuccessful");
}
}
}
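// Stop and join the catch-up thread, release the pending-outputs marker, and
// delegate the rest of the shutdown to DBImpl::Close().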
Status DBImplFollower::Close() {
if (catch_up_thread_) {
stop_requested_.store(true);
{
MutexLock l(&mu_);
cv_.SignalAll();
}
catch_up_thread_->join();
catch_up_thread_.reset();
}
ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem_);
return DBImpl::Close();
}
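// Convenience overload: open the follower with only the default column family
// and discard the returned handle.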
Status DB::OpenAsFollower(const Options& options, const std::string& dbname,
const std::string& leader_path,
std::unique_ptr<DB>* dbptr) {
dbptr->reset();
DBOptions db_options(options);
ColumnFamilyOptions cf_options(options);
std::vector<ColumnFamilyDescriptor> column_families;
column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
std::vector<ColumnFamilyHandle*> handles;
Status s = DB::OpenAsFollower(db_options, dbname, leader_path,
column_families, &handles, dbptr);
if (s.ok()) {
assert(handles.size() == 1);
delete handles[0];
}
return s;
}
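// Open a DB instance in follower mode. The follower reads the leader's files
// under src_path through an on-demand file system, replays the leader's
// MANIFEST to build its own view, and periodically catches up with the leader
// in a background thread.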
Status DB::OpenAsFollower(
const DBOptions& db_options, const std::string& dbname,
const std::string& src_path,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, std::unique_ptr<DB>* dbptr) {
dbptr->reset();
FileSystem* fs = db_options.env->GetFileSystem().get();
{
IOStatus io_s;
if (db_options.create_if_missing) {
io_s = fs->CreateDirIfMissing(dbname, IOOptions(), nullptr);
} else {
io_s = fs->FileExists(dbname, IOOptions(), nullptr);
}
if (!io_s.ok()) {
return static_cast<Status>(io_s);
}
}
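// Wrap the env's file system in an on-demand file system so that accesses to
// files under dbname transparently create local links to the leader's files
// under src_path.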
std::unique_ptr<Env> new_env(new CompositeEnvWrapper(
db_options.env, NewOnDemandFileSystem(db_options.env->GetFileSystem(),
src_path, dbname)));
DBOptions tmp_opts(db_options);
Status s;
tmp_opts.env = new_env.get();
if (nullptr == tmp_opts.info_log) {
s = CreateLoggerFromOptions(dbname, tmp_opts, &tmp_opts.info_log);
if (!s.ok()) {
tmp_opts.info_log = nullptr;
return s;
}
}
handles->clear();
DBImplFollower* impl =
new DBImplFollower(tmp_opts, std::move(new_env), dbname, src_path);
impl->versions_.reset(new ReactiveVersionSet(
dbname, &impl->immutable_db_options_, impl->file_options_,
impl->table_cache_.get(), impl->write_buffer_manager_,
&impl->write_controller_, impl->io_tracer_));
impl->column_family_memtables_.reset(
new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
impl->mutex_.Lock();
s = impl->Recover(column_families, /*read_only=*/true,
/*error_if_wal_file_exists=*/false,
/*error_if_data_exists_in_wals=*/false);
if (s.ok()) {
for (const auto& cf : column_families) {
auto cfd =
impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
if (nullptr == cfd) {
s = Status::InvalidArgument("Column family not found", cf.name);
break;
}
handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
}
}
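// Install an initial SuperVersion for each column family so reads can be
// served right after open.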
SuperVersionContext sv_context(false /* create_superversion */);
if (s.ok()) {
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
sv_context.NewSuperVersion();
cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
}
}
impl->mutex_.Unlock();
sv_context.Clean();
if (s.ok()) {
dbptr->reset(impl);
for (auto h : *handles) {
impl->NewThreadStatusCfInfo(
static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
}
} else {
for (auto h : *handles) {
delete h;
}
handles->clear();
delete impl;
}
return s;
}
} // namespace ROCKSDB_NAMESPACE