Summary: Add automatic per-block interpolation search selection (`kAuto` mode) for index blocks. During SST construction, each index block's key distribution is analyzed using the coefficient of variation (CV) of gaps between restart-point keys. Blocks with uniformly distributed keys are flagged via a new bit in the data block footer, and at read time, `kAuto` resolves to interpolation search for uniform blocks and binary search otherwise. ## Key changes - **New `BlockSearchType::kAuto` enum value**: Resolves per-block at read time to either `kInterpolation` or `kBinary` based on the block's uniformity flag. Falls back to `kBinary` on older versions that don't recognize it. - **Write-path uniformity analysis**: `BlockBuilder::ScanForUniformity()` uses Welford's online algorithm to incrementally compute the CV of key gaps at restart points. The result is stored in a new bit (bit 30) of the data block footer's packed restart count. - **New table option `uniform_cv_threshold`** (default: -1 `disabled`): Controls how strict the uniformity check is. Set to negative to disable. Exposed in C++, Java (JNI), and `db_bench`. - **Code reorganization**: Block entry decode helpers (`DecodeEntry`, `DecodeKey`, `DecodeKeyV4`, `ReadBe64FromKey`) moved from `block.cc` to a new shared header `block_util.h` so they can be reused by `BlockBuilder` on the write path. - **New histogram `BLOCK_KEY_DISTRIBUTION_CV`**: Records the CV (scaled by 10000) of each index block's key distribution for observability. - **Java bindings**: `IndexSearchType.kAuto`, `uniformCvThreshold` getter/setter, JNI portal constructor signature updated, and `HistogramType.BLOCK_KEY_DISTRIBUTION_CV` added. Pull Request resolved: https://github.com/facebook/rocksdb/pull/14383 Test Plan: - `IndexBlockTest.IndexValueEncodingTest` parameterized to include `kAuto` search type alongside `kBinary` and `kInterpolation`, verifying correct seek/iteration behavior across all combinations of key distributions, restart intervals, and key lengths. - Uniformity detection validated: blocks with uniform key distribution correctly set `is_uniform = true`, blocks with clustered/non-uniform keys set `is_uniform = false`. - Stress test coverage - Updated check_format_compatible to also include a "uniform" dataset. By default using uniform_cv_threshold=-1 does not result in an incompatibility issues. When manually changing the threshold (e.g. `uniform_cv_threshold=1000`), I see `bad block contents`, which is expected ## Benchmark readrandom with `fillrandom,compact -seed=1 --statistics`: | Benchmark | Branch | Params | avg ops/s | % change vs main | CV P50 | |-----------|--------|--------|-----------|------------------|--------| | readrandom | main | `binary_search, shortening=1` | 335,791 | baseline | N/A | | readrandom | feature | `binary_search, shortening=1` (default) | 335,749 | -0.0% | 1,500 | | readrandom | feature | `auto_search, shortening=1` (kAuto) | 366,832 | **+9.2%** | 1,500 | | readrandom | feature | `interpolation_search, shortening=1` | 366,598 | **+9.2%** | 1,500 | | readrandom | feature | `auto_search, shortening=2` (kAuto) | 344,631 | **+2.6%** | 1,030,000 | | readrandom | feature | `interpolation_search, shortening=2` | 201,178 | **-40.1%** | 1,030,000 | As seen with shortening=2, a non-uniform distribution produces a high CV, which does not use interpolation search. ## Write benchmark There is a write overhead which scans each restart entry for a block upon Finish. In practice this is very low because currently it is only applied to index blocks. See cpu profile (https://fburl.com/strobelight/io5hwj9h) here of `-benchmarks=fillseq,compact -compression_type=none -disable_wal=1`. Only 0.08% attributed to `ScanForUniformity`. Reviewed By: pdillinger Differential Revision: D94738890 Pulled By: joshkang97 fbshipit-source-id: 9661ac593c5fef89d49f3a8a027f1338a0c96766
1617 lines
58 KiB
C++
1617 lines
58 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
//
|
|
// Decodes the blocks generated by block_builder.cc.
|
|
|
|
#include "table/block_based/block.h"
|
|
|
|
#include <algorithm>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
#include "monitoring/perf_context_imp.h"
|
|
#include "port/port.h"
|
|
#include "port/stack_trace.h"
|
|
#include "rocksdb/comparator.h"
|
|
#include "table/block_based/block_prefix_index.h"
|
|
#include "table/block_based/block_util.h"
|
|
#include "table/block_based/data_block_footer.h"
|
|
#include "table/format.h"
|
|
#include "util/coding.h"
|
|
#include "util/math.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
void DataBlockIter::NextImpl() {
|
|
#ifndef NDEBUG
|
|
if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) {
|
|
return;
|
|
}
|
|
#endif
|
|
bool is_shared = false;
|
|
ParseNextDataKey(&is_shared);
|
|
}
|
|
|
|
void MetaBlockIter::NextImpl() {
|
|
bool is_shared = false;
|
|
ParseNextKey<DecodeEntry, true>(&is_shared);
|
|
}
|
|
|
|
void IndexBlockIter::NextImpl() { ParseNextIndexKey(); }
|
|
|
|
void IndexBlockIter::PrevImpl() {
|
|
assert(Valid());
|
|
// Scan backwards to a restart point before current_
|
|
const uint32_t original = current_;
|
|
const auto prev_entry_idx = cur_entry_idx_ - 1;
|
|
while (GetRestartPoint(restart_index_) >= original) {
|
|
if (restart_index_ == 0) {
|
|
// No more entries
|
|
current_ = GetKeysEndOffset();
|
|
restart_index_ = num_restarts_;
|
|
return;
|
|
}
|
|
restart_index_--;
|
|
}
|
|
SeekToRestartPoint(restart_index_);
|
|
// Loop until end of current entry hits the start of original entry
|
|
while (ParseNextIndexKey() && NextEntryOffset() < original) {
|
|
}
|
|
cur_entry_idx_ = prev_entry_idx;
|
|
}
|
|
|
|
void MetaBlockIter::PrevImpl() {
|
|
assert(Valid());
|
|
// Scan backwards to a restart point before current_
|
|
const uint32_t original = current_;
|
|
const auto prev_entry_idx = cur_entry_idx_ - 1;
|
|
while (GetRestartPoint(restart_index_) >= original) {
|
|
if (restart_index_ == 0) {
|
|
// No more entries
|
|
current_ = GetKeysEndOffset();
|
|
restart_index_ = num_restarts_;
|
|
return;
|
|
}
|
|
restart_index_--;
|
|
}
|
|
SeekToRestartPoint(restart_index_);
|
|
bool is_shared = false;
|
|
// Loop until end of current entry hits the start of original entry
|
|
while (ParseNextKey<DecodeEntry, true>(&is_shared) &&
|
|
NextEntryOffset() < original) {
|
|
}
|
|
cur_entry_idx_ = prev_entry_idx;
|
|
}
|
|
|
|
// Similar to IndexBlockIter::PrevImpl but also caches the prev entries
|
|
void DataBlockIter::PrevImpl() {
|
|
assert(Valid());
|
|
|
|
const auto prev_entry_idx = cur_entry_idx_ - 1;
|
|
assert(prev_entries_idx_ == -1 ||
|
|
static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
|
|
// Check if we can use cached prev_entries_
|
|
if (prev_entries_idx_ > 0 &&
|
|
prev_entries_[prev_entries_idx_].offset == current_) {
|
|
// Read cached CachedPrevEntry
|
|
prev_entries_idx_--;
|
|
const CachedPrevEntry& current_prev_entry =
|
|
prev_entries_[prev_entries_idx_];
|
|
|
|
const char* key_ptr = nullptr;
|
|
bool raw_key_cached;
|
|
if (current_prev_entry.key_ptr != nullptr) {
|
|
// The key is not delta encoded and stored in the data block
|
|
key_ptr = current_prev_entry.key_ptr;
|
|
raw_key_cached = false;
|
|
} else {
|
|
// The key is delta encoded and stored in prev_entries_keys_buff_
|
|
key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
|
|
raw_key_cached = true;
|
|
}
|
|
const Slice current_key(key_ptr, current_prev_entry.key_size);
|
|
|
|
current_ = current_prev_entry.offset;
|
|
// TODO(ajkr): the copy when `raw_key_cached` is done here for convenience,
|
|
// not necessity. It is convenient since this class treats keys as pinned
|
|
// when `raw_key_` points to an outside buffer. So we cannot allow
|
|
// `raw_key_` point into Prev cache as it is a transient outside buffer
|
|
// (i.e., keys in it are not actually pinned).
|
|
raw_key_.SetKey(current_key, raw_key_cached /* copy */);
|
|
value_ = current_prev_entry.value;
|
|
// Set entry_ using stored entry_size for NextEntryOffset() to work
|
|
entry_ = Slice(data_ + current_, current_prev_entry.entry_size);
|
|
cur_entry_idx_ = prev_entry_idx;
|
|
return;
|
|
}
|
|
|
|
// Clear prev entries cache
|
|
prev_entries_idx_ = -1;
|
|
prev_entries_.clear();
|
|
prev_entries_keys_buff_.clear();
|
|
|
|
// Scan backwards to a restart point before current_
|
|
const uint32_t original = current_;
|
|
while (GetRestartPoint(restart_index_) >= original) {
|
|
if (restart_index_ == 0) {
|
|
// No more entries
|
|
current_ = GetKeysEndOffset();
|
|
restart_index_ = num_restarts_;
|
|
cur_entry_idx_ = prev_entry_idx;
|
|
return;
|
|
}
|
|
restart_index_--;
|
|
}
|
|
|
|
SeekToRestartPoint(restart_index_);
|
|
do {
|
|
bool is_shared = false;
|
|
if (!ParseNextDataKey(&is_shared)) {
|
|
break;
|
|
}
|
|
Slice current_key = raw_key_.GetKey();
|
|
|
|
if (raw_key_.IsKeyPinned()) {
|
|
// The key is not delta encoded
|
|
prev_entries_.emplace_back(current_, static_cast<uint32_t>(entry_.size()),
|
|
current_key.data(), 0, current_key.size(),
|
|
value());
|
|
} else {
|
|
// The key is delta encoded, cache decoded key in buffer
|
|
size_t new_key_offset = prev_entries_keys_buff_.size();
|
|
prev_entries_keys_buff_.append(current_key.data(), current_key.size());
|
|
|
|
prev_entries_.emplace_back(current_, static_cast<uint32_t>(entry_.size()),
|
|
nullptr, new_key_offset, current_key.size(),
|
|
value());
|
|
}
|
|
// Loop until end of current entry hits the start of original entry
|
|
} while (NextEntryOffset() < original);
|
|
prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
|
|
cur_entry_idx_ = prev_entry_idx;
|
|
}
|
|
|
|
void DataBlockIter::SeekImpl(const Slice& target) {
|
|
Slice seek_key = target;
|
|
PERF_TIMER_GUARD(block_seek_nanos);
|
|
if (data_ == nullptr) { // Not init yet
|
|
return;
|
|
}
|
|
uint32_t index = 0;
|
|
bool skip_linear_scan = false;
|
|
bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
|
|
&skip_linear_scan);
|
|
|
|
if (!ok) {
|
|
return;
|
|
}
|
|
FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
|
|
}
|
|
|
|
void MetaBlockIter::SeekImpl(const Slice& target) {
|
|
Slice seek_key = target;
|
|
PERF_TIMER_GUARD(block_seek_nanos);
|
|
if (data_ == nullptr) { // Not init yet
|
|
return;
|
|
}
|
|
uint32_t index = 0;
|
|
bool skip_linear_scan = false;
|
|
bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
|
|
&skip_linear_scan);
|
|
|
|
if (!ok) {
|
|
return;
|
|
}
|
|
FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
|
|
}
|
|
|
|
// Optimized Seek for point lookup for an internal key `target`
|
|
// target = "seek_user_key @ type | seqno".
|
|
//
|
|
// For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion,
|
|
// kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or
|
|
// kTypeMerge, this function behaves identically to Seek().
|
|
//
|
|
// For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion,
|
|
// kTypeBlobIndex, kTypeWideColumnEntity, kTypeValuePreferredSeqno or
|
|
// kTypeMerge:
|
|
//
|
|
// If the return value is FALSE, iter location is undefined, and it means:
|
|
// 1) there is no key in this block falling into the range:
|
|
// ["seek_user_key @ type | seqno", "seek_user_key @ kTypeDeletion | 0"],
|
|
// inclusive; AND
|
|
// 2) the last key of this block has a greater user_key from seek_user_key
|
|
//
|
|
// If the return value is TRUE, iter location has two possibilities:
|
|
// 1) If iter is valid, it is set to a location as if set by SeekImpl(target).
|
|
// In this case, it points to the first key with a larger user_key or a
|
|
// matching user_key with a seqno no greater than the seeking seqno.
|
|
// 2) If the iter is invalid, it means that either all the user_key is less
|
|
// than the seek_user_key, or the block ends with a matching user_key but
|
|
// with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno
|
|
// but larger type).
|
|
bool DataBlockIter::SeekForGetImpl(const Slice& target) {
|
|
Slice target_user_key = ExtractUserKey(target);
|
|
uint32_t map_offset = restarts_ + num_restarts_ * sizeof(uint32_t);
|
|
uint8_t entry =
|
|
data_block_hash_index_->Lookup(data_, map_offset, target_user_key);
|
|
|
|
if (entry == kCollision) {
|
|
// HashSeek not effective, falling back
|
|
SeekImpl(target);
|
|
return true;
|
|
}
|
|
|
|
if (entry == kNoEntry) {
|
|
// Even if we cannot find the user_key in this block, the result may
|
|
// exist in the next block. Consider this example:
|
|
//
|
|
// Block N: [aab@100, ... , app@120]
|
|
// boundary key: axy@50 (we make minimal assumption about a boundary key)
|
|
// Block N+1: [axy@10, ... ]
|
|
//
|
|
// If seek_key = axy@60, the search will start from Block N.
|
|
// Even if the user_key is not found in the hash map, the caller still
|
|
// have to continue searching the next block.
|
|
//
|
|
// In this case, we pretend the key is in the last restart interval.
|
|
// The while-loop below will search the last restart interval for the
|
|
// key. It will stop at the first key that is larger than the seek_key,
|
|
// or to the end of the block if no one is larger.
|
|
entry = static_cast<uint8_t>(num_restarts_ - 1);
|
|
}
|
|
|
|
uint32_t restart_index = entry;
|
|
|
|
// check if the key is in the restart_interval
|
|
assert(restart_index < num_restarts_);
|
|
SeekToRestartPoint(restart_index);
|
|
current_ = GetRestartPoint(restart_index);
|
|
|
|
uint32_t limit = GetKeysEndOffset();
|
|
if (restart_index + 1 < num_restarts_) {
|
|
limit = GetRestartPoint(restart_index + 1);
|
|
}
|
|
while (current_ < limit) {
|
|
bool shared;
|
|
// Here we only linear seek the target key inside the restart interval.
|
|
// If a key does not exist inside a restart interval, we avoid
|
|
// further searching the block content across restart interval boundary.
|
|
//
|
|
// TODO(fwu): check the left and right boundary of the restart interval
|
|
// to avoid linear seek a target key that is out of range.
|
|
if (!ParseNextDataKey(&shared) || CompareCurrentKey(target) >= 0) {
|
|
// we stop at the first potential matching user key.
|
|
break;
|
|
}
|
|
// If the loop exits due to CompareCurrentKey(target) >= 0, then current key
|
|
// exists, and its checksum verification will be done in UpdateKey() called
|
|
// in SeekForGet().
|
|
// TODO(cbi): If this loop exits with current_ == restart_, per key-value
|
|
// checksum will not be verified in UpdateKey() since Valid()
|
|
// will return false.
|
|
}
|
|
|
|
if (current_ == restarts_) {
|
|
// Search reaches to the end of the block. There are three possibilities:
|
|
// 1) there is only one user_key match in the block (otherwise collision).
|
|
// the matching user_key resides in the last restart interval, and it
|
|
// is the last key of the restart interval and of the block as well.
|
|
// ParseNextKey() skipped it as its [ type | seqno ] is smaller.
|
|
//
|
|
// 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry,
|
|
// AND all existing user_keys in the restart interval are smaller than
|
|
// seek_user_key.
|
|
//
|
|
// 3) The seek_key is a false positive and happens to be hashed to the
|
|
// last restart interval, AND all existing user_keys in the restart
|
|
// interval are smaller than seek_user_key.
|
|
//
|
|
// The result may exist in the next block each case, so we return true.
|
|
return true;
|
|
}
|
|
|
|
if (icmp_.user_comparator()->Compare(raw_key_.GetUserKey(),
|
|
target_user_key) != 0) {
|
|
// the key is not in this block and cannot be at the next block either.
|
|
return false;
|
|
}
|
|
|
|
// Here we are conservative and only support a limited set of cases
|
|
ValueType value_type = ExtractValueType(raw_key_.GetInternalKey());
|
|
if (value_type != ValueType::kTypeValue &&
|
|
value_type != ValueType::kTypeDeletion &&
|
|
value_type != ValueType::kTypeMerge &&
|
|
value_type != ValueType::kTypeSingleDeletion &&
|
|
value_type != ValueType::kTypeBlobIndex &&
|
|
value_type != ValueType::kTypeWideColumnEntity &&
|
|
value_type != ValueType::kTypeValuePreferredSeqno) {
|
|
SeekImpl(target);
|
|
}
|
|
|
|
// Result found, and the iter is correctly set.
|
|
return true;
|
|
}
|
|
|
|
void IndexBlockIter::SeekImpl(const Slice& target) {
|
|
#ifndef NDEBUG
|
|
if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) {
|
|
return;
|
|
}
|
|
#endif
|
|
TEST_SYNC_POINT("IndexBlockIter::Seek:0");
|
|
PERF_TIMER_GUARD(block_seek_nanos);
|
|
if (data_ == nullptr) { // Not init yet
|
|
return;
|
|
}
|
|
Slice seek_key = target;
|
|
if (raw_key_.IsUserKey()) {
|
|
seek_key = ExtractUserKey(target);
|
|
}
|
|
status_ = Status::OK();
|
|
uint32_t index = 0;
|
|
bool skip_linear_scan = false;
|
|
bool ok = false;
|
|
if (prefix_index_) {
|
|
bool prefix_may_exist = true;
|
|
ok = PrefixSeek(target, &index, &prefix_may_exist);
|
|
if (!prefix_may_exist) {
|
|
// This is to let the caller to distinguish between non-existing prefix,
|
|
// and when key is larger than the last key, which both set Valid() to
|
|
// false.
|
|
current_ = GetKeysEndOffset();
|
|
status_ = Status::NotFound();
|
|
}
|
|
// restart interval must be one when hash search is enabled so the binary
|
|
// search simply lands at the right place.
|
|
skip_linear_scan = true;
|
|
} else {
|
|
if (value_delta_encoded_) {
|
|
ok = FindRestartPointForSeek<DecodeKeyV4>(seek_key, &index,
|
|
&skip_linear_scan);
|
|
} else {
|
|
ok = FindRestartPointForSeek<DecodeKey>(seek_key, &index,
|
|
&skip_linear_scan);
|
|
}
|
|
}
|
|
|
|
if (!ok) {
|
|
return;
|
|
}
|
|
FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
|
|
}
|
|
|
|
template <typename DecodeKeyFunc>
|
|
bool IndexBlockIter::FindRestartPointForSeek(const Slice& seek_key,
|
|
uint32_t* index,
|
|
bool* skip_linear_scan) {
|
|
if (index_search_type_ == BlockBasedTableOptions::kBinary) {
|
|
return BinarySeekRestartPointIndex<DecodeKeyFunc>(seek_key, index,
|
|
skip_linear_scan);
|
|
}
|
|
return InterpolationSeekRestartPointIndex<DecodeKeyFunc>(seek_key, index,
|
|
skip_linear_scan);
|
|
}
|
|
|
|
void DataBlockIter::SeekForPrevImpl(const Slice& target) {
|
|
PERF_TIMER_GUARD(block_seek_nanos);
|
|
Slice seek_key = target;
|
|
if (data_ == nullptr) { // Not init yet
|
|
return;
|
|
}
|
|
uint32_t index = 0;
|
|
bool skip_linear_scan = false;
|
|
bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
|
|
&skip_linear_scan);
|
|
|
|
if (!ok) {
|
|
return;
|
|
}
|
|
FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
|
|
if (!Valid()) {
|
|
if (status_.ok()) {
|
|
SeekToLastImpl();
|
|
}
|
|
} else {
|
|
while (Valid() && CompareCurrentKey(seek_key) > 0) {
|
|
PrevImpl();
|
|
}
|
|
}
|
|
}
|
|
|
|
void MetaBlockIter::SeekForPrevImpl(const Slice& target) {
|
|
PERF_TIMER_GUARD(block_seek_nanos);
|
|
Slice seek_key = target;
|
|
if (data_ == nullptr) { // Not init yet
|
|
return;
|
|
}
|
|
uint32_t index = 0;
|
|
bool skip_linear_scan = false;
|
|
bool ok = BinarySeekRestartPointIndex<DecodeKey>(seek_key, &index,
|
|
&skip_linear_scan);
|
|
|
|
if (!ok) {
|
|
return;
|
|
}
|
|
FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan);
|
|
|
|
if (!Valid()) {
|
|
if (status_.ok()) {
|
|
SeekToLastImpl();
|
|
}
|
|
} else {
|
|
while (Valid() && CompareCurrentKey(seek_key) > 0) {
|
|
PrevImpl();
|
|
}
|
|
}
|
|
}
|
|
|
|
void DataBlockIter::SeekToFirstImpl() {
|
|
if (data_ == nullptr) { // Not init yet
|
|
return;
|
|
}
|
|
SeekToRestartPoint(0);
|
|
bool is_shared = false;
|
|
ParseNextDataKey(&is_shared);
|
|
}
|
|
|
|
void MetaBlockIter::SeekToFirstImpl() {
|
|
if (data_ == nullptr) { // Not init yet
|
|
return;
|
|
}
|
|
SeekToRestartPoint(0);
|
|
bool is_shared = false;
|
|
ParseNextKey<DecodeEntry, true>(&is_shared);
|
|
}
|
|
|
|
void IndexBlockIter::SeekToFirstImpl() {
|
|
#ifndef NDEBUG
|
|
if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) {
|
|
return;
|
|
}
|
|
#endif
|
|
if (data_ == nullptr) { // Not init yet
|
|
return;
|
|
}
|
|
status_ = Status::OK();
|
|
SeekToRestartPoint(0);
|
|
ParseNextIndexKey();
|
|
}
|
|
|
|
void DataBlockIter::SeekToLastImpl() {
|
|
if (data_ == nullptr) { // Not init yet
|
|
return;
|
|
}
|
|
SeekToRestartPoint(num_restarts_ - 1);
|
|
bool is_shared = false;
|
|
|
|
while (ParseNextDataKey(&is_shared) &&
|
|
NextEntryOffset() < GetKeysEndOffset()) {
|
|
// Keep skipping
|
|
}
|
|
}
|
|
|
|
void MetaBlockIter::SeekToLastImpl() {
|
|
if (data_ == nullptr) { // Not init yet
|
|
return;
|
|
}
|
|
SeekToRestartPoint(num_restarts_ - 1);
|
|
bool is_shared = false;
|
|
assert(num_restarts_ >= 1);
|
|
while (ParseNextKey<DecodeEntry, true>(&is_shared) &&
|
|
NextEntryOffset() < GetKeysEndOffset()) {
|
|
// Will probably never reach here since restart_interval is always 1
|
|
}
|
|
}
|
|
|
|
void IndexBlockIter::SeekToLastImpl() {
|
|
if (data_ == nullptr) { // Not init yet
|
|
return;
|
|
}
|
|
status_ = Status::OK();
|
|
SeekToRestartPoint(num_restarts_ - 1);
|
|
while (ParseNextIndexKey() && NextEntryOffset() < GetKeysEndOffset()) {
|
|
}
|
|
}
|
|
|
|
template <class TValue>
|
|
template <typename DecodeEntryFunc, bool StrictCheck>
|
|
bool BlockIter<TValue>::ParseNextKey(bool* is_shared) {
|
|
current_ = NextEntryOffset();
|
|
++cur_entry_idx_;
|
|
const char* p = data_ + current_;
|
|
const char* key_limit = data_ + GetKeysEndOffset();
|
|
|
|
if (p >= key_limit) {
|
|
// No more entries to return. Mark as invalid.
|
|
current_ = GetKeysEndOffset();
|
|
restart_index_ = num_restarts_;
|
|
return false;
|
|
}
|
|
|
|
// Decode next entry
|
|
uint32_t shared, non_shared, value_length;
|
|
uint32_t value_offset = 0;
|
|
|
|
assert(cur_entry_idx_ >= 0);
|
|
assert(values_section_ == nullptr || block_restart_interval_ > 0);
|
|
bool value_offset_encoded =
|
|
values_section_ && cur_entry_idx_ % block_restart_interval_ == 0;
|
|
|
|
auto p_old = p;
|
|
p = DecodeEntryFunc()(p, key_limit, &shared, &non_shared, &value_length,
|
|
value_offset_encoded ? &value_offset : nullptr);
|
|
|
|
if (p == nullptr || raw_key_.Size() < shared) {
|
|
CorruptionError();
|
|
return false;
|
|
} else {
|
|
if constexpr (StrictCheck) {
|
|
auto entry_length =
|
|
non_shared + (values_section_ == nullptr ? value_length : 0);
|
|
if (static_cast<uint32_t>(key_limit - p) < entry_length) {
|
|
CorruptionError();
|
|
return false;
|
|
}
|
|
}
|
|
|
|
assert(values_section_ == nullptr ||
|
|
cur_entry_idx_ % block_restart_interval_ != 0 || shared == 0);
|
|
entry_ = Slice(p_old, p - p_old + non_shared);
|
|
if (shared == 0) {
|
|
*is_shared = false;
|
|
// If this key doesn't share any bytes with prev key, and no min timestamp
|
|
// needs to be padded to the key, then we don't need to decode it and
|
|
// can use its address in the block directly (no copy).
|
|
UpdateRawKeyAndMaybePadMinTimestamp(Slice(p, non_shared));
|
|
} else {
|
|
// This key share `shared` bytes with prev key, we need to decode it
|
|
*is_shared = true;
|
|
// If user-defined timestamp is stripped from user key before keys are
|
|
// delta encoded, the decoded key consisting of the shared and non shared
|
|
// bytes do not have user-defined timestamp yet. We need to pad min
|
|
// timestamp to it.
|
|
if (pad_min_timestamp_) {
|
|
raw_key_.TrimAppendWithTimestamp(shared, p, non_shared, ts_sz_);
|
|
} else {
|
|
raw_key_.TrimAppend(shared, p, non_shared);
|
|
}
|
|
}
|
|
|
|
if (shared == 0) {
|
|
while (restart_index_ + 1 < num_restarts_ &&
|
|
GetRestartPoint(restart_index_ + 1) < current_) {
|
|
++restart_index_;
|
|
}
|
|
}
|
|
|
|
if (values_section_) {
|
|
if (value_offset_encoded) {
|
|
// Restart point, derive from offset
|
|
value_ = Slice(values_section_ + value_offset, value_length);
|
|
} else {
|
|
// Non-restart point, derive from previous value
|
|
assert(value_.data() >= values_section_);
|
|
value_ = Slice(value_.data() + value_.size(), value_length);
|
|
}
|
|
|
|
if constexpr (StrictCheck) {
|
|
if ((value_.data() + value_.size()) > data_ + restarts_) {
|
|
CorruptionError();
|
|
return false;
|
|
}
|
|
}
|
|
} else {
|
|
value_ = Slice(entry_.data() + entry_.size(), value_length);
|
|
// extend entry slice to contain value as well
|
|
entry_ = Slice(entry_.data(), entry_.size() + value_.size());
|
|
}
|
|
assert((value_.data() + value_.size()) <= data_ + restarts_);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
bool DataBlockIter::ParseNextDataKey(bool* is_shared) {
|
|
if (ParseNextKey<DecodeEntry>(is_shared)) {
|
|
#ifndef NDEBUG
|
|
if (global_seqno_ != kDisableGlobalSequenceNumber) {
|
|
// If we are reading a file with a global sequence number we should
|
|
// expect that all encoded sequence numbers are zeros and any value
|
|
// type is kTypeValue, kTypeMerge, kTypeDeletion,
|
|
// kTypeDeletionWithTimestamp, kTypeRangeDeletion, or
|
|
// kTypeWideColumnEntity.
|
|
uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey());
|
|
SequenceNumber seqno;
|
|
ValueType value_type;
|
|
UnPackSequenceAndType(packed, &seqno, &value_type);
|
|
assert(value_type == ValueType::kTypeValue ||
|
|
value_type == ValueType::kTypeMerge ||
|
|
value_type == ValueType::kTypeDeletion ||
|
|
value_type == ValueType::kTypeDeletionWithTimestamp ||
|
|
value_type == ValueType::kTypeRangeDeletion ||
|
|
value_type == ValueType::kTypeWideColumnEntity);
|
|
assert(seqno == 0);
|
|
}
|
|
#endif // NDEBUG
|
|
return true;
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool IndexBlockIter::ParseNextIndexKey() {
|
|
bool is_shared = false;
|
|
bool ok = (value_delta_encoded_) ? ParseNextKey<DecodeEntryV4>(&is_shared)
|
|
: ParseNextKey<DecodeEntry>(&is_shared);
|
|
if (ok) {
|
|
if (value_delta_encoded_ || global_seqno_state_ != nullptr ||
|
|
pad_min_timestamp_) {
|
|
DecodeCurrentValue(is_shared);
|
|
}
|
|
}
|
|
return ok;
|
|
}
|
|
|
|
// The format:
|
|
// restart_point 0: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
|
|
// restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
|
|
// ...
|
|
// restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz)
|
|
// where, k is key, v is value, and its encoding is in parentheses.
|
|
// The format of each key is (shared_size, non_shared_size, shared, non_shared)
|
|
// The format of each value, i.e., block handle, is (offset, size) whenever the
|
|
// is_shared is false, which included the first entry in each restart point.
|
|
// Otherwise, the format is delta-size = the size of current block - the size o
|
|
// last block.
|
|
void IndexBlockIter::DecodeCurrentValue(bool is_shared) {
|
|
Slice v(value_.data(), data_ + restarts_ - value_.data());
|
|
// Delta encoding is used if `shared` != 0.
|
|
assert(!value_delta_encoded_ || value_.size() == 0);
|
|
Status decode_s __attribute__((__unused__)) = decoded_value_.DecodeFrom(
|
|
&v, have_first_key_,
|
|
(value_delta_encoded_ && is_shared) ? &decoded_value_.handle : nullptr);
|
|
assert(decode_s.ok());
|
|
value_ = Slice(value_.data(), v.data() - value_.data());
|
|
if (!values_section_ && value_delta_encoded_) {
|
|
assert(entry_.data() + entry_.size() == value_.data());
|
|
// values are inlined in the entry, so need to set next offset accordingly
|
|
entry_ = Slice(entry_.data(), entry_.size() + value_.size());
|
|
}
|
|
|
|
if (global_seqno_state_ != nullptr) {
|
|
// Overwrite sequence number the same way as in DataBlockIter.
|
|
|
|
IterKey& first_internal_key = global_seqno_state_->first_internal_key;
|
|
first_internal_key.SetInternalKey(decoded_value_.first_internal_key,
|
|
/* copy */ true);
|
|
|
|
assert(GetInternalKeySeqno(first_internal_key.GetInternalKey()) == 0);
|
|
|
|
ValueType value_type = ExtractValueType(first_internal_key.GetKey());
|
|
assert(value_type == ValueType::kTypeValue ||
|
|
value_type == ValueType::kTypeMerge ||
|
|
value_type == ValueType::kTypeDeletion ||
|
|
value_type == ValueType::kTypeRangeDeletion ||
|
|
value_type == ValueType::kTypeWideColumnEntity);
|
|
|
|
first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno,
|
|
value_type);
|
|
decoded_value_.first_internal_key = first_internal_key.GetKey();
|
|
}
|
|
if (pad_min_timestamp_ && !decoded_value_.first_internal_key.empty()) {
|
|
first_internal_key_with_ts_.clear();
|
|
PadInternalKeyWithMinTimestamp(&first_internal_key_with_ts_,
|
|
decoded_value_.first_internal_key, ts_sz_);
|
|
decoded_value_.first_internal_key = first_internal_key_with_ts_;
|
|
}
|
|
}
|
|
|
|
template <class TValue>
|
|
void BlockIter<TValue>::FindKeyAfterBinarySeek(const Slice& target,
|
|
uint32_t index,
|
|
bool skip_linear_scan) {
|
|
// SeekToRestartPoint() only does the lookup in the restart block. We need
|
|
// to follow it up with NextImpl() to position the iterator at the restart
|
|
// key.
|
|
SeekToRestartPoint(index);
|
|
NextImpl();
|
|
assert(cur_entry_idx_ >= 0);
|
|
|
|
if (!skip_linear_scan) {
|
|
// Linear search (within restart block) for first key >= target
|
|
uint32_t max_offset;
|
|
if (index + 1 < num_restarts_) {
|
|
// We are in a non-last restart interval. Since `BinarySeek()` guarantees
|
|
// the next restart key is strictly greater than `target`, we can
|
|
// terminate upon reaching it without any additional key comparison.
|
|
max_offset = GetRestartPoint(index + 1);
|
|
} else {
|
|
// We are in the last restart interval. The while-loop will terminate by
|
|
// `Valid()` returning false upon advancing past the block's last key.
|
|
max_offset = std::numeric_limits<uint32_t>::max();
|
|
}
|
|
while (true) {
|
|
NextImpl();
|
|
if (!Valid()) {
|
|
// TODO(cbi): per key-value checksum will not be verified in UpdateKey()
|
|
// since Valid() will returns false.
|
|
break;
|
|
}
|
|
if (current_ == max_offset) {
|
|
assert(CompareCurrentKey(target) > 0);
|
|
break;
|
|
} else if (CompareCurrentKey(target) >= 0) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Get the key slice at a given restart point index.
|
|
template <class TValue>
|
|
template <typename DecodeKeyFunc>
|
|
bool BlockIter<TValue>::GetRestartKey(uint32_t index, Slice* key) {
|
|
uint32_t region_offset = GetRestartPoint(index);
|
|
uint32_t shared, non_shared, value_offset;
|
|
const char* key_ptr =
|
|
DecodeKeyFunc()(data_ + region_offset, data_ + restarts_, &shared,
|
|
&non_shared, values_section_ ? &value_offset : nullptr);
|
|
if (key_ptr == nullptr || (shared != 0)) {
|
|
CorruptionError();
|
|
return false;
|
|
}
|
|
*key = Slice(key_ptr, non_shared);
|
|
return true;
|
|
}
|
|
|
|
// Searches in restart array using binary search to find the starting restart
|
|
// point for the linear scan, and stores it in `*index`. Assumes restart array
|
|
// does not contain duplicate keys.
|
|
//
|
|
// It is guaranteed that the restart key at `*index + 1`
|
|
// is strictly greater than `target` or does not exist (this can be used to
|
|
// elide a comparison when linear scan reaches all the way to the next restart
|
|
// key). Furthermore, `*skip_linear_scan` is set to indicate whether the
|
|
// `*index`th restart key is the final result so that key does not need to be
|
|
// compared again later.
|
|
template <class TValue>
|
|
template <typename DecodeKeyFunc>
|
|
bool BlockIter<TValue>::BinarySeekRestartPointIndex(const Slice& target,
|
|
uint32_t* index,
|
|
bool* skip_linear_scan) {
|
|
if (restarts_ == 0) {
|
|
// SST files dedicated to range tombstones are written with index blocks
|
|
// that have no keys while also having `num_restarts_ == 1`. This would
|
|
// cause a problem as we'd try to access the first key which does not exist.
|
|
// We identify such blocks by the offset at which their restarts are stored,
|
|
// and return false to prevent any attempted key accesses.
|
|
return false;
|
|
}
|
|
|
|
*skip_linear_scan = false;
|
|
// Loop invariants:
|
|
// - Restart key at index `left` is less than or equal to the target key. The
|
|
// sentinel index `-1` is considered to have a key that is less than all
|
|
// keys. Doing this allows us to avoid a bounds check on left.
|
|
// - Any restart keys after index `right` are strictly greater than the target
|
|
// key.
|
|
int64_t left = -1;
|
|
int64_t right = num_restarts_ - 1;
|
|
|
|
while (left != right) {
|
|
// The `mid` is computed by rounding up so it lands in (`left`, `right`].
|
|
int64_t mid = left + (right - left + 1) / 2;
|
|
assert(left < mid && mid <= right);
|
|
|
|
Slice mid_key;
|
|
if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(mid), &mid_key)) {
|
|
return false;
|
|
}
|
|
|
|
UpdateRawKeyAndMaybePadMinTimestamp(mid_key);
|
|
|
|
int cmp = CompareCurrentKey(target);
|
|
if (cmp < 0) {
|
|
// Key at "mid" is smaller than "target". Therefore all
|
|
// blocks before "mid" are uninteresting.
|
|
left = mid;
|
|
} else if (cmp > 0) {
|
|
// Key at "mid" is >= "target". Therefore all blocks at or
|
|
// after "mid" are uninteresting.
|
|
right = mid - 1;
|
|
} else {
|
|
*skip_linear_scan = true;
|
|
left = right = mid;
|
|
}
|
|
}
|
|
|
|
if (left == -1) {
|
|
// All keys in the block were strictly greater than `target`. So the very
|
|
// first key in the block is the final seek result.
|
|
*skip_linear_scan = true;
|
|
*index = 0;
|
|
} else {
|
|
*index = static_cast<uint32_t>(left);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Similar effects to BinarySeekRestartPointIndex, except it uses a different
|
|
// algorithm to search for the restart point index (i.e. interpolation search).
|
|
// Interpolation search is typically more efficient for uniformly distributed
|
|
// datasets.
|
|
//
|
|
// Typically, interpolation search requires an integer "value". But because we
|
|
// are searching through variable length binary slices, we must estimate an
|
|
// integer value for each key. Currently, the value is set to be the first 8
|
|
// bytes (read big-endian) that do not share a prefix with the start and end
|
|
// key. As a side effect, this can really only be used with the
|
|
// BytewiseComparator().
|
|
template <class TValue>
|
|
template <typename DecodeKeyFunc>
|
|
bool BlockIter<TValue>::InterpolationSeekRestartPointIndex(
|
|
const Slice& target, uint32_t* index, bool* skip_linear_scan) {
|
|
static constexpr int64_t kGuardLen = 8;
|
|
static constexpr uint64_t kMaxPoorSearches = 8;
|
|
|
|
if (restarts_ == 0) {
|
|
return false;
|
|
}
|
|
|
|
*skip_linear_scan = false;
|
|
// Currently it is assumed that comparator is always bytewise comparator, but
|
|
// it may also be useful to to generalize to reverse bytewise in the future.
|
|
assert(icmp_.user_comparator() == BytewiseComparator());
|
|
|
|
int64_t left = -1;
|
|
int64_t right = num_restarts_ - 1;
|
|
size_t shared_user_prefix_len = 0;
|
|
|
|
Slice left_key;
|
|
Slice right_key;
|
|
Slice left_key_suffix;
|
|
Slice right_key_suffix;
|
|
Slice target_suffix = target;
|
|
bool seek_failed = false;
|
|
bool first_iter = true;
|
|
uint64_t left_val = 0;
|
|
uint64_t right_val = 0;
|
|
uint64_t target_val = 0;
|
|
|
|
// A poor search is when less than half the search space is reduced, because
|
|
// binary search would do better. When there are kMaxPoorSearches in a row,
|
|
// then fallback to binary search. This helps bound worse cast performance.
|
|
uint64_t continuous_poor_searches = 0;
|
|
|
|
// Loop invariants while not first iteration AND seek has not failed:
|
|
// - arr[usable_left] = left_key, arr[right] = right_key
|
|
// - left < mid <= right, and arr[left] < target < arr[right + 1]
|
|
//
|
|
// The first iteration is used as an early optimization to determine initial
|
|
// bounds, and whether target is within those bounds.
|
|
const bool is_user_key = raw_key_.IsUserKey();
|
|
const Slice target_user_key = is_user_key ? target : ExtractUserKey(target);
|
|
while (left != right) {
|
|
int64_t mid = 0;
|
|
|
|
// If either search window is small or we've bad numerous bad guesses, then
|
|
// fallback to binary search
|
|
seek_failed = (right - left <= kGuardLen) ||
|
|
continuous_poor_searches >= kMaxPoorSearches;
|
|
|
|
if (!seek_failed) {
|
|
// Interpolation seek reads left and right boundaries anyways, so we can
|
|
// set left = 0. The invariant that left <= target is still held because
|
|
// we early exit if left > target for the first iteration.
|
|
const uint32_t usable_left =
|
|
static_cast<uint32_t>(std::max<int64_t>(left, 0));
|
|
|
|
// First iteration: decode both boundary keys and compute shared prefix.
|
|
if (first_iter) {
|
|
if (!GetRestartKey<DecodeKeyFunc>(usable_left, &left_key)) {
|
|
return false;
|
|
}
|
|
|
|
if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(right),
|
|
&right_key)) {
|
|
return false;
|
|
}
|
|
|
|
// Compute the shared prefix length between the user key portions of
|
|
// the boundary keys. This is used to "normalize" the values calculated
|
|
// during interpolation search.
|
|
shared_user_prefix_len = left_key.difference_offset(right_key);
|
|
if (!is_user_key) {
|
|
// Ensure shared_user_prefix_len is only limited to user key. Suppose
|
|
// that the shared prefix of both keys are extended into the internal
|
|
// footer. If they are not the same user keys, then it is guaranteed
|
|
// left is the shorter one due to bytewise comparator. For reverse
|
|
// bytewise, this would be flipped.
|
|
shared_user_prefix_len = std::min<size_t>(
|
|
shared_user_prefix_len, left_key.size() - kNumInternalBytes);
|
|
assert(shared_user_prefix_len <=
|
|
right_key.size() - kNumInternalBytes);
|
|
}
|
|
|
|
left_val =
|
|
ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len);
|
|
right_val =
|
|
ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len);
|
|
target_val =
|
|
ReadBe64FromKey(target, is_user_key, shared_user_prefix_len);
|
|
}
|
|
|
|
assert(shared_user_prefix_len <= left_key.size() &&
|
|
shared_user_prefix_len <= right_key.size());
|
|
|
|
if (first_iter && shared_user_prefix_len > 0) {
|
|
// It is not guaranteed that the shared_prefix of the left and right
|
|
// boundaries is a valid prefix of the target. If it is not, then we can
|
|
// early exit.
|
|
size_t cmp_len =
|
|
std::min(target_user_key.size(), shared_user_prefix_len);
|
|
int cmp = memcmp(target_user_key.data(), left_key.data(), cmp_len);
|
|
if (cmp < 0 || (cmp == 0 && cmp_len < shared_user_prefix_len)) {
|
|
#ifndef NDEBUG
|
|
IterKey tmp_key;
|
|
tmp_key.SetIsUserKey(is_user_key);
|
|
UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key);
|
|
assert(CompareKey(tmp_key, target) >= 0);
|
|
#endif
|
|
// if target size is less than shared_prefix length, and cmp == 0,
|
|
// then it is guaranteed <= left
|
|
*skip_linear_scan = true;
|
|
*index = usable_left;
|
|
return true;
|
|
} else if (cmp > 0) {
|
|
#ifndef NDEBUG
|
|
IterKey tmp_key;
|
|
tmp_key.SetIsUserKey(is_user_key);
|
|
UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key);
|
|
assert(CompareKey(tmp_key, target) < 0);
|
|
#endif
|
|
*index = static_cast<uint32_t>(right);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
assert(shared_user_prefix_len <= target_user_key.size());
|
|
assert(memcmp(left_key.data(), target_user_key.data(),
|
|
shared_user_prefix_len) == 0);
|
|
assert(memcmp(right_key.data(), target_user_key.data(),
|
|
shared_user_prefix_len) == 0);
|
|
|
|
if (first_iter) {
|
|
left_key_suffix = Slice(left_key.data() + shared_user_prefix_len,
|
|
left_key.size() - shared_user_prefix_len);
|
|
right_key_suffix = Slice(right_key.data() + shared_user_prefix_len,
|
|
right_key.size() - shared_user_prefix_len);
|
|
target_suffix = Slice(target.data() + shared_user_prefix_len,
|
|
target.size() - shared_user_prefix_len);
|
|
}
|
|
|
|
if (left_val > right_val) {
|
|
CorruptionError("left key is greater than right key");
|
|
return false;
|
|
}
|
|
|
|
bool lte_left = false;
|
|
bool gt_right = false;
|
|
|
|
if (target_val < left_val) {
|
|
assert(first_iter);
|
|
assert(CompareKey(left_key_suffix, target_suffix) > 0);
|
|
lte_left = true;
|
|
} else if (target_val == left_val) {
|
|
// target_val == left_val doesn't imply target == left_key
|
|
// because ReadBe64FromKey only reads 8 bytes and skips sequence
|
|
// numbers. We need to check actual key order.
|
|
if (CompareKey(left_key_suffix, target_suffix) >= 0) {
|
|
assert(first_iter);
|
|
lte_left = true;
|
|
}
|
|
}
|
|
|
|
if (!lte_left && !seek_failed) {
|
|
if (target_val > right_val) {
|
|
// note that we only ever guarantee arr[target] < arr[right + 1], so
|
|
// it is possible to end up here even on non-first iteration
|
|
assert(CompareKey(right_key_suffix, target_suffix) < 0);
|
|
gt_right = true;
|
|
} else if (right_val == left_val) {
|
|
// cannot divide by 0
|
|
seek_failed = true;
|
|
}
|
|
}
|
|
|
|
// early exit if key is not within bounds
|
|
if (lte_left) {
|
|
#ifndef NDEBUG
|
|
assert(!seek_failed);
|
|
IterKey tmp_key;
|
|
tmp_key.SetIsUserKey(is_user_key);
|
|
UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, left_key);
|
|
assert(CompareKey(tmp_key, target) >= 0);
|
|
#endif
|
|
*skip_linear_scan = true;
|
|
*index = usable_left;
|
|
return true;
|
|
}
|
|
if (gt_right) {
|
|
#ifndef NDEBUG
|
|
assert(!seek_failed);
|
|
IterKey tmp_key;
|
|
tmp_key.SetIsUserKey(is_user_key);
|
|
UpdateRawKeyAndMaybePadMinTimestamp(tmp_key, right_key);
|
|
assert(CompareKey(tmp_key, target) < 0);
|
|
#endif
|
|
*index = static_cast<uint32_t>(right);
|
|
return true;
|
|
}
|
|
|
|
if (!seek_failed) {
|
|
#ifdef HAVE_UINT128_EXTENSION
|
|
__uint128_t range = right - usable_left;
|
|
__uint128_t target_delta = target_val - left_val;
|
|
uint64_t range_delta = right_val - left_val;
|
|
int64_t offset =
|
|
static_cast<int64_t>(range * target_delta / range_delta);
|
|
#else
|
|
double ratio = static_cast<double>(target_val - left_val) /
|
|
static_cast<double>(right_val - left_val);
|
|
assert(0 <= ratio && ratio <= 1);
|
|
int64_t range = right - usable_left;
|
|
int64_t offset = static_cast<int64_t>(range * ratio);
|
|
#endif
|
|
left = usable_left; // can reduce search space by 1
|
|
mid = usable_left + offset;
|
|
assert(mid <= right);
|
|
if (mid == usable_left) {
|
|
// this is to guarantee progress and avoid infinite loop
|
|
++mid;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (seek_failed) {
|
|
// Fallback to binary seek
|
|
mid = left + (right - left + 1) / 2;
|
|
}
|
|
|
|
assert(left < mid && mid <= right);
|
|
|
|
Slice mid_key;
|
|
if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(mid), &mid_key)) {
|
|
return false;
|
|
}
|
|
|
|
Slice mid_key_suffix(mid_key.data() + shared_user_prefix_len,
|
|
mid_key.size() - shared_user_prefix_len);
|
|
|
|
UpdateRawKeyAndMaybePadMinTimestamp(mid_key_suffix);
|
|
int cmp = CompareCurrentKey(target_suffix);
|
|
|
|
int64_t previous_search_space = right - left;
|
|
if (cmp < 0) {
|
|
left = mid;
|
|
left_key = mid_key;
|
|
left_key_suffix = mid_key_suffix;
|
|
left_val = ReadBe64FromKey(left_key, is_user_key, shared_user_prefix_len);
|
|
} else if (cmp > 0) {
|
|
right = mid - 1;
|
|
if (!seek_failed && left != right) {
|
|
if (!GetRestartKey<DecodeKeyFunc>(static_cast<uint32_t>(right),
|
|
&right_key)) {
|
|
return false;
|
|
}
|
|
right_key_suffix = Slice(right_key.data() + shared_user_prefix_len,
|
|
right_key.size() - shared_user_prefix_len);
|
|
right_val =
|
|
ReadBe64FromKey(right_key, is_user_key, shared_user_prefix_len);
|
|
}
|
|
} else {
|
|
*skip_linear_scan = true;
|
|
left = right = mid;
|
|
}
|
|
|
|
// If seach space is not reduced by at least half, good chance this data is
|
|
// not uniform.
|
|
int64_t new_search_space = right - left;
|
|
if (new_search_space > previous_search_space / 2) {
|
|
++continuous_poor_searches;
|
|
} else {
|
|
continuous_poor_searches = 0;
|
|
}
|
|
|
|
first_iter = false;
|
|
}
|
|
|
|
if (left == -1) {
|
|
// All keys in the block were strictly greater than `target`. So the very
|
|
// first key in the block is the final seek result.
|
|
*skip_linear_scan = true;
|
|
*index = 0;
|
|
} else {
|
|
*index = static_cast<uint32_t>(left);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
// Compare target key and the block key of the block of `block_index`.
|
|
// Return -1 if error.
|
|
int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
|
|
Slice block_key;
|
|
bool ok = value_delta_encoded_
|
|
? GetRestartKey<DecodeKeyV4>(block_index, &block_key)
|
|
: GetRestartKey<DecodeKey>(block_index, &block_key);
|
|
if (!ok) {
|
|
return 1; // Return target is smaller
|
|
}
|
|
UpdateRawKeyAndMaybePadMinTimestamp(block_key);
|
|
return CompareCurrentKey(target);
|
|
}
|
|
|
|
// Binary search in block_ids to find the first block
|
|
// with a key >= target
|
|
bool IndexBlockIter::BinaryBlockIndexSeek(const Slice& target,
|
|
uint32_t* block_ids, uint32_t left,
|
|
uint32_t right, uint32_t* index,
|
|
bool* prefix_may_exist) {
|
|
assert(left <= right);
|
|
assert(index);
|
|
assert(prefix_may_exist);
|
|
*prefix_may_exist = true;
|
|
uint32_t left_bound = left;
|
|
|
|
while (left <= right) {
|
|
uint32_t mid = (right + left) / 2;
|
|
|
|
int cmp = CompareBlockKey(block_ids[mid], target);
|
|
if (!status_.ok()) {
|
|
return false;
|
|
}
|
|
if (cmp < 0) {
|
|
// Key at "target" is larger than "mid". Therefore all
|
|
// blocks before or at "mid" are uninteresting.
|
|
left = mid + 1;
|
|
} else {
|
|
// Key at "target" is <= "mid". Therefore all blocks
|
|
// after "mid" are uninteresting.
|
|
// If there is only one block left, we found it.
|
|
if (left == right) {
|
|
break;
|
|
}
|
|
right = mid;
|
|
}
|
|
}
|
|
|
|
if (left == right) {
|
|
// In one of the two following cases:
|
|
// (1) left is the first one of block_ids
|
|
// (2) there is a gap of blocks between block of `left` and `left-1`.
|
|
// we can further distinguish the case of key in the block or key not
|
|
// existing, by comparing the target key and the key of the previous
|
|
// block to the left of the block found.
|
|
if (block_ids[left] > 0 &&
|
|
(left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
|
|
CompareBlockKey(block_ids[left] - 1, target) > 0) {
|
|
current_ = GetKeysEndOffset();
|
|
*prefix_may_exist = false;
|
|
return false;
|
|
}
|
|
|
|
*index = block_ids[left];
|
|
return true;
|
|
} else {
|
|
assert(left > right);
|
|
|
|
// If the next block key is larger than seek key, it is possible that
|
|
// no key shares the prefix with `target`, or all keys with the same
|
|
// prefix as `target` are smaller than prefix. In the latter case,
|
|
// we are mandated to set the position the same as the total order.
|
|
// In the latter case, either:
|
|
// (1) `target` falls into the range of the next block. In this case,
|
|
// we can place the iterator to the next block, or
|
|
// (2) `target` is larger than all block keys. In this case we can
|
|
// keep the iterator invalidate without setting `prefix_may_exist`
|
|
// to false.
|
|
// We might sometimes end up with setting the total order position
|
|
// while there is no key sharing the prefix as `target`, but it
|
|
// still follows the contract.
|
|
uint32_t right_index = block_ids[right];
|
|
assert(right_index + 1 <= num_restarts_);
|
|
if (right_index + 1 < num_restarts_) {
|
|
if (CompareBlockKey(right_index + 1, target) >= 0) {
|
|
*index = right_index + 1;
|
|
return true;
|
|
} else {
|
|
// We have to set the flag here because we are not positioning
|
|
// the iterator to the total order position.
|
|
*prefix_may_exist = false;
|
|
}
|
|
}
|
|
|
|
// Mark iterator invalid
|
|
current_ = GetKeysEndOffset();
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool IndexBlockIter::PrefixSeek(const Slice& target, uint32_t* index,
|
|
bool* prefix_may_exist) {
|
|
assert(index);
|
|
assert(prefix_may_exist);
|
|
assert(prefix_index_);
|
|
*prefix_may_exist = true;
|
|
Slice seek_key = target;
|
|
if (raw_key_.IsUserKey()) {
|
|
seek_key = ExtractUserKey(target);
|
|
}
|
|
uint32_t* block_ids = nullptr;
|
|
uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);
|
|
|
|
if (num_blocks == 0) {
|
|
current_ = GetKeysEndOffset();
|
|
*prefix_may_exist = false;
|
|
return false;
|
|
} else {
|
|
assert(block_ids);
|
|
return BinaryBlockIndexSeek(seek_key, block_ids, 0, num_blocks - 1, index,
|
|
prefix_may_exist);
|
|
}
|
|
}
|
|
|
|
BlockBasedTableOptions::DataBlockIndexType Block::IndexType() const {
|
|
assert(size() >= DataBlockFooter::kMinEncodedLength);
|
|
Slice input(data(), size());
|
|
DataBlockFooter footer;
|
|
footer.DecodeFrom(&input).PermitUncheckedError();
|
|
return footer.index_type;
|
|
}
|
|
|
|
Block::~Block() {
|
|
// This sync point can be re-enabled if RocksDB can control the
|
|
// initialization order of any/all static options created by the user.
|
|
// TEST_SYNC_POINT("Block::~Block");
|
|
delete[] kv_checksum_;
|
|
}
|
|
|
|
Status Block::GetCorruptionStatus() const {
|
|
// Re-process the footer to get a detailed error status.
|
|
// This should only be called when size() == 0 (error marker).
|
|
assert(size() == 0);
|
|
// When size() == 0 and restart_offset_ != 0, restart_offset_ stores the
|
|
// original data size for re-decoding the footer to get detailed error.
|
|
if (restart_offset_ == 0) {
|
|
return Status::Corruption("bad block contents");
|
|
}
|
|
Slice input(contents_.data.data(), restart_offset_);
|
|
DataBlockFooter footer;
|
|
Status s = footer.DecodeFrom(&input);
|
|
if (!s.ok()) {
|
|
return s; // Return the detailed error from DecodeFrom
|
|
}
|
|
// Footer decoded OK, so error was in later processing (shouldn't happen)
|
|
DEBUG_FAIL("ok status on presumed bad block contents");
|
|
return Status::Corruption("presumed bad block contents");
|
|
}
|
|
|
|
Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit,
|
|
Statistics* statistics, uint32_t restart_interval)
|
|
: contents_(std::move(contents)),
|
|
restart_offset_(0),
|
|
num_restarts_(0),
|
|
block_restart_interval_(restart_interval) {
|
|
TEST_SYNC_POINT("Block::Block:0");
|
|
auto& size = contents_.data.size_;
|
|
// `contents` is assumed to be uncompressed in the proper format
|
|
Slice input(contents_.data.data(), size);
|
|
DataBlockFooter footer;
|
|
Status s = footer.DecodeFrom(&input);
|
|
if (!s.ok()) {
|
|
// Save original size for GetCorruptionStatus() to re-decode footer
|
|
restart_offset_ = static_cast<uint32_t>(size);
|
|
size = 0; // Error marker
|
|
} else {
|
|
// After DecodeFrom, input has the footer (and values_section_offset if
|
|
// separated_kv) removed. Each case below may strip additional suffix
|
|
// (e.g., hash index) so that input ends with just the restart array.
|
|
num_restarts_ = footer.num_restarts;
|
|
is_uniform_ = footer.is_uniform;
|
|
switch (footer.index_type) {
|
|
case BlockBasedTableOptions::kDataBlockBinarySearch:
|
|
break;
|
|
case BlockBasedTableOptions::kDataBlockBinaryAndHash:
|
|
if (input.size() < sizeof(uint16_t) /* NUM_BUCK */) {
|
|
size = 0;
|
|
break;
|
|
}
|
|
uint16_t map_offset;
|
|
data_block_hash_index_.Initialize(contents_.data.data(),
|
|
static_cast<uint16_t>(input.size()),
|
|
&map_offset);
|
|
// Strip the hash index, leaving just data + restarts
|
|
input.remove_suffix(input.size() - map_offset);
|
|
break;
|
|
default:
|
|
size = 0; // Error marker
|
|
}
|
|
// After the switch, input should end with restarts[num_restarts_]
|
|
if (size != 0) {
|
|
if (input.size() < num_restarts_ * sizeof(uint32_t)) {
|
|
size = 0; // Block too small for the declared number of restarts
|
|
} else {
|
|
restart_offset_ = static_cast<uint32_t>(input.size()) -
|
|
num_restarts_ * sizeof(uint32_t);
|
|
}
|
|
}
|
|
// Set up values_section_ from footer if separated KV storage is used
|
|
if (size != 0 && footer.separated_kv) {
|
|
if (footer.values_section_offset > restart_offset_) {
|
|
size = 0; // Error marker
|
|
} else {
|
|
values_section_ = data() + footer.values_section_offset;
|
|
}
|
|
}
|
|
}
|
|
if (read_amp_bytes_per_bit != 0 && statistics && size != 0) {
|
|
read_amp_bitmap_.reset(new BlockReadAmpBitmap(
|
|
restart_offset_, read_amp_bytes_per_bit, statistics));
|
|
}
|
|
}
|
|
|
|
void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key,
|
|
const Comparator* raw_ucmp) {
|
|
protection_bytes_per_key_ = 0;
|
|
if (protection_bytes_per_key > 0 && num_restarts_ > 0) {
|
|
// NewDataIterator() is called with protection_bytes_per_key_ = 0.
|
|
// This is intended since checksum is not constructed yet.
|
|
//
|
|
// We do not know global_seqno yet, so checksum computation and
|
|
// verification all assume global_seqno = 0.
|
|
// TODO(yuzhangyu): handle the implication of padding timestamp for kv
|
|
// protection.
|
|
std::unique_ptr<DataBlockIter> iter{NewDataIterator(
|
|
raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */,
|
|
nullptr /* stats */, true /* block_contents_pinned */,
|
|
true /* user_defined_timestamps_persisted */)};
|
|
if (iter->status().ok()) {
|
|
// Only calculate restart interval if not already set via table properties
|
|
if (block_restart_interval_ == 0) {
|
|
block_restart_interval_ = iter->GetRestartInterval();
|
|
}
|
|
}
|
|
uint32_t num_keys = 0;
|
|
if (iter->status().ok()) {
|
|
num_keys = iter->NumberOfKeys(block_restart_interval_);
|
|
}
|
|
if (iter->status().ok()) {
|
|
checksum_size_ = num_keys * protection_bytes_per_key;
|
|
kv_checksum_ = new char[(size_t)checksum_size_];
|
|
size_t i = 0;
|
|
iter->SeekToFirst();
|
|
while (iter->Valid()) {
|
|
GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
|
|
iter->key(), iter->value());
|
|
iter->Next();
|
|
i += protection_bytes_per_key;
|
|
}
|
|
assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
|
|
}
|
|
if (!iter->status().ok()) {
|
|
contents_.data.size_ = 0; // Error marker
|
|
return;
|
|
}
|
|
protection_bytes_per_key_ = protection_bytes_per_key;
|
|
}
|
|
}
|
|
|
|
void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key,
|
|
const Comparator* raw_ucmp,
|
|
bool value_is_full,
|
|
bool index_has_first_key) {
|
|
protection_bytes_per_key_ = 0;
|
|
if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
|
|
// Note that `global_seqno` and `key_includes_seq` are hardcoded here.
|
|
// They do not impact how the index block is parsed. During checksum
|
|
// construction/verification, we use the entire key buffer from
|
|
// raw_key_.GetKey() returned by iter->key() as the `key` part of
|
|
// key-value checksum, and the content of this buffer do not change for
|
|
// different values of `global_seqno` or `key_includes_seq`.
|
|
// TODO(yuzhangyu): handle the implication of padding timestamp for kv
|
|
// protection.
|
|
std::unique_ptr<IndexBlockIter> iter{NewIndexIterator(
|
|
raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr,
|
|
nullptr /* Statistics */, true /* total_order_seek */,
|
|
index_has_first_key /* have_first_key */, false /* key_includes_seq */,
|
|
value_is_full, true /* block_contents_pinned */,
|
|
true /* user_defined_timestamps_persisted*/,
|
|
nullptr /* prefix_index */)};
|
|
if (iter->status().ok()) {
|
|
// Only calculate restart interval if not already set via table properties
|
|
if (block_restart_interval_ == 0) {
|
|
block_restart_interval_ = iter->GetRestartInterval();
|
|
}
|
|
}
|
|
uint32_t num_keys = 0;
|
|
if (iter->status().ok()) {
|
|
num_keys = iter->NumberOfKeys(block_restart_interval_);
|
|
}
|
|
if (iter->status().ok()) {
|
|
checksum_size_ = num_keys * protection_bytes_per_key;
|
|
kv_checksum_ = new char[(size_t)checksum_size_];
|
|
iter->SeekToFirst();
|
|
size_t i = 0;
|
|
while (iter->Valid()) {
|
|
GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
|
|
iter->key(), iter->raw_value());
|
|
iter->Next();
|
|
i += protection_bytes_per_key;
|
|
}
|
|
assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
|
|
}
|
|
if (!iter->status().ok()) {
|
|
contents_.data.size_ = 0; // Error marker
|
|
return;
|
|
}
|
|
protection_bytes_per_key_ = protection_bytes_per_key;
|
|
}
|
|
}
|
|
|
|
void Block::InitializeMetaIndexBlockProtectionInfo(
|
|
uint8_t protection_bytes_per_key) {
|
|
protection_bytes_per_key_ = 0;
|
|
if (num_restarts_ > 0 && protection_bytes_per_key > 0) {
|
|
std::unique_ptr<MetaBlockIter> iter{
|
|
NewMetaIterator(true /* block_contents_pinned */)};
|
|
if (iter->status().ok()) {
|
|
block_restart_interval_ = iter->GetRestartInterval();
|
|
}
|
|
uint32_t num_keys = 0;
|
|
if (iter->status().ok()) {
|
|
num_keys = iter->NumberOfKeys(block_restart_interval_);
|
|
}
|
|
if (iter->status().ok()) {
|
|
checksum_size_ = num_keys * protection_bytes_per_key;
|
|
kv_checksum_ = new char[(size_t)checksum_size_];
|
|
iter->SeekToFirst();
|
|
size_t i = 0;
|
|
while (iter->Valid()) {
|
|
GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key,
|
|
iter->key(), iter->value());
|
|
iter->Next();
|
|
i += protection_bytes_per_key;
|
|
}
|
|
assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key);
|
|
}
|
|
if (!iter->status().ok()) {
|
|
contents_.data.size_ = 0; // Error marker
|
|
return;
|
|
}
|
|
protection_bytes_per_key_ = protection_bytes_per_key;
|
|
}
|
|
}
|
|
|
|
MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) {
|
|
MetaBlockIter* iter = new MetaBlockIter();
|
|
if (size() < 2 * sizeof(uint32_t)) {
|
|
iter->Invalidate(GetCorruptionStatus());
|
|
return iter;
|
|
} else if (num_restarts_ == 0) {
|
|
// Empty block.
|
|
iter->Invalidate(Status::OK());
|
|
} else {
|
|
iter->Initialize(data(), restart_offset_, num_restarts_,
|
|
block_contents_pinned, protection_bytes_per_key_,
|
|
kv_checksum_, block_restart_interval_, values_section_);
|
|
}
|
|
return iter;
|
|
}
|
|
|
|
DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp,
|
|
SequenceNumber global_seqno,
|
|
DataBlockIter* iter, Statistics* stats,
|
|
bool block_contents_pinned,
|
|
bool user_defined_timestamps_persisted) {
|
|
DataBlockIter* ret_iter;
|
|
if (iter != nullptr) {
|
|
ret_iter = iter;
|
|
} else {
|
|
ret_iter = new DataBlockIter;
|
|
}
|
|
if (size() < 2 * sizeof(uint32_t)) {
|
|
ret_iter->Invalidate(GetCorruptionStatus());
|
|
return ret_iter;
|
|
}
|
|
if (num_restarts_ == 0) {
|
|
// Empty block.
|
|
ret_iter->Invalidate(Status::OK());
|
|
return ret_iter;
|
|
} else {
|
|
ret_iter->Initialize(
|
|
raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
|
|
read_amp_bitmap_.get(), block_contents_pinned,
|
|
user_defined_timestamps_persisted,
|
|
data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr,
|
|
protection_bytes_per_key_, kv_checksum_, block_restart_interval_,
|
|
values_section_);
|
|
if (read_amp_bitmap_) {
|
|
if (read_amp_bitmap_->GetStatistics() != stats) {
|
|
// DB changed the Statistics pointer, we need to notify
|
|
// read_amp_bitmap_
|
|
read_amp_bitmap_->SetStatistics(stats);
|
|
}
|
|
}
|
|
}
|
|
|
|
return ret_iter;
|
|
}
|
|
|
|
IndexBlockIter* Block::NewIndexIterator(
|
|
const Comparator* raw_ucmp, SequenceNumber global_seqno,
|
|
IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek,
|
|
bool have_first_key, bool key_includes_seq, bool value_is_full,
|
|
bool block_contents_pinned, bool user_defined_timestamps_persisted,
|
|
BlockPrefixIndex* prefix_index,
|
|
BlockBasedTableOptions::BlockSearchType index_block_search_type) {
|
|
IndexBlockIter* ret_iter;
|
|
if (iter != nullptr) {
|
|
ret_iter = iter;
|
|
} else {
|
|
ret_iter = new IndexBlockIter;
|
|
}
|
|
if (size() < 2 * sizeof(uint32_t)) {
|
|
ret_iter->Invalidate(GetCorruptionStatus());
|
|
return ret_iter;
|
|
}
|
|
if (num_restarts_ == 0) {
|
|
// Empty block.
|
|
ret_iter->Invalidate(Status::OK());
|
|
return ret_iter;
|
|
} else {
|
|
BlockPrefixIndex* prefix_index_ptr =
|
|
total_order_seek ? nullptr : prefix_index;
|
|
|
|
// Resolve kAuto to a concrete search type based on the block's
|
|
// uniformity flag. Interpolation search requires bytewise comparator;
|
|
// fall back to binary search otherwise.
|
|
auto resolved_search_type = index_block_search_type;
|
|
if (resolved_search_type == BlockBasedTableOptions::kAuto) {
|
|
resolved_search_type = (is_uniform_ && raw_ucmp == BytewiseComparator())
|
|
? BlockBasedTableOptions::kInterpolation
|
|
: BlockBasedTableOptions::kBinary;
|
|
}
|
|
|
|
ret_iter->Initialize(
|
|
raw_ucmp, data(), restart_offset_, num_restarts_, global_seqno,
|
|
prefix_index_ptr, have_first_key, key_includes_seq, value_is_full,
|
|
block_contents_pinned, user_defined_timestamps_persisted,
|
|
protection_bytes_per_key_, kv_checksum_, block_restart_interval_,
|
|
values_section_, resolved_search_type);
|
|
}
|
|
|
|
return ret_iter;
|
|
}
|
|
|
|
size_t Block::ApproximateMemoryUsage() const {
|
|
size_t usage = usable_size();
|
|
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
|
|
usage += malloc_usable_size((void*)this);
|
|
#else
|
|
usage += sizeof(*this);
|
|
#endif // ROCKSDB_MALLOC_USABLE_SIZE
|
|
if (read_amp_bitmap_) {
|
|
usage += read_amp_bitmap_->ApproximateMemoryUsage();
|
|
}
|
|
usage += checksum_size_;
|
|
return usage;
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|