Summary: Add automatic per-block interpolation search selection (`kAuto` mode) for index blocks. During SST construction, each index block's key distribution is analyzed using the coefficient of variation (CV) of gaps between restart-point keys. Blocks with uniformly distributed keys are flagged via a new bit in the data block footer, and at read time, `kAuto` resolves to interpolation search for uniform blocks and binary search otherwise. ## Key changes - **New `BlockSearchType::kAuto` enum value**: Resolves per-block at read time to either `kInterpolation` or `kBinary` based on the block's uniformity flag. Falls back to `kBinary` on older versions that don't recognize it. - **Write-path uniformity analysis**: `BlockBuilder::ScanForUniformity()` uses Welford's online algorithm to incrementally compute the CV of key gaps at restart points. The result is stored in a new bit (bit 30) of the data block footer's packed restart count. - **New table option `uniform_cv_threshold`** (default: -1 `disabled`): Controls how strict the uniformity check is. Set to negative to disable. Exposed in C++, Java (JNI), and `db_bench`. - **Code reorganization**: Block entry decode helpers (`DecodeEntry`, `DecodeKey`, `DecodeKeyV4`, `ReadBe64FromKey`) moved from `block.cc` to a new shared header `block_util.h` so they can be reused by `BlockBuilder` on the write path. - **New histogram `BLOCK_KEY_DISTRIBUTION_CV`**: Records the CV (scaled by 10000) of each index block's key distribution for observability. - **Java bindings**: `IndexSearchType.kAuto`, `uniformCvThreshold` getter/setter, JNI portal constructor signature updated, and `HistogramType.BLOCK_KEY_DISTRIBUTION_CV` added. Pull Request resolved: https://github.com/facebook/rocksdb/pull/14383 Test Plan: - `IndexBlockTest.IndexValueEncodingTest` parameterized to include `kAuto` search type alongside `kBinary` and `kInterpolation`, verifying correct seek/iteration behavior across all combinations of key distributions, restart intervals, and key lengths. - Uniformity detection validated: blocks with uniform key distribution correctly set `is_uniform = true`, blocks with clustered/non-uniform keys set `is_uniform = false`. - Stress test coverage - Updated check_format_compatible to also include a "uniform" dataset. By default using uniform_cv_threshold=-1 does not result in an incompatibility issues. When manually changing the threshold (e.g. `uniform_cv_threshold=1000`), I see `bad block contents`, which is expected ## Benchmark readrandom with `fillrandom,compact -seed=1 --statistics`: | Benchmark | Branch | Params | avg ops/s | % change vs main | CV P50 | |-----------|--------|--------|-----------|------------------|--------| | readrandom | main | `binary_search, shortening=1` | 335,791 | baseline | N/A | | readrandom | feature | `binary_search, shortening=1` (default) | 335,749 | -0.0% | 1,500 | | readrandom | feature | `auto_search, shortening=1` (kAuto) | 366,832 | **+9.2%** | 1,500 | | readrandom | feature | `interpolation_search, shortening=1` | 366,598 | **+9.2%** | 1,500 | | readrandom | feature | `auto_search, shortening=2` (kAuto) | 344,631 | **+2.6%** | 1,030,000 | | readrandom | feature | `interpolation_search, shortening=2` | 201,178 | **-40.1%** | 1,030,000 | As seen with shortening=2, a non-uniform distribution produces a high CV, which does not use interpolation search. ## Write benchmark There is a write overhead which scans each restart entry for a block upon Finish. In practice this is very low because currently it is only applied to index blocks. See cpu profile (https://fburl.com/strobelight/io5hwj9h) here of `-benchmarks=fillseq,compact -compression_type=none -disable_wal=1`. Only 0.08% attributed to `ScanForUniformity`. Reviewed By: pdillinger Differential Revision: D94738890 Pulled By: joshkang97 fbshipit-source-id: 9661ac593c5fef89d49f3a8a027f1338a0c96766
156 lines
5 KiB
C++
156 lines
5 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#pragma once
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cstdint>
|
|
#include <cstring>
|
|
|
|
#include "db/dbformat.h"
|
|
#include "port/port.h"
|
|
#include "rocksdb/slice.h"
|
|
#include "util/coding.h"
|
|
#include "util/math.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Decode the next block entry starting at "p", storing the number of shared key
|
|
// bytes, non_shared key bytes, and the length of the value in "*shared",
|
|
// "*non_shared", and "*value_length", respectively. Will not dereference past
|
|
// "limit".
|
|
//
|
|
// If any errors are detected, returns nullptr. Otherwise, returns a
|
|
// pointer to the key delta (just past the three decoded values).
|
|
struct DecodeEntry {
|
|
inline const char* operator()(const char* p, const char* limit,
|
|
uint32_t* shared, uint32_t* non_shared,
|
|
uint32_t* value_length,
|
|
uint32_t* value_offset) {
|
|
// We need 2 bytes for shared and non_shared size. We also need one more
|
|
// byte either for value size or the actual value in case of value delta
|
|
// encoding.
|
|
assert(limit - p >= 3);
|
|
*shared = reinterpret_cast<const unsigned char*>(p)[0];
|
|
*non_shared = reinterpret_cast<const unsigned char*>(p)[1];
|
|
*value_length = reinterpret_cast<const unsigned char*>(p)[2];
|
|
if ((*shared | *non_shared | *value_length) < 128) {
|
|
// Fast path: all three values are encoded in one byte each
|
|
p += 3;
|
|
} else {
|
|
if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
|
|
return nullptr;
|
|
}
|
|
if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
|
|
return nullptr;
|
|
}
|
|
if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
if (value_offset) {
|
|
if ((p = GetVarint32Ptr(p, limit, value_offset)) == nullptr) {
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
return p;
|
|
}
|
|
};
|
|
|
|
struct DecodeKey {
|
|
inline const char* operator()(const char* p, const char* limit,
|
|
uint32_t* shared, uint32_t* non_shared,
|
|
uint32_t* value_offset) {
|
|
uint32_t value_length;
|
|
return DecodeEntry()(p, limit, shared, non_shared, &value_length,
|
|
value_offset);
|
|
}
|
|
};
|
|
|
|
// In format_version 4, which is used by index blocks, the value size is not
|
|
// encoded before the entry, as the value is known to be the handle with the
|
|
// known size.
|
|
struct DecodeKeyV4 {
|
|
inline const char* operator()(const char* p, const char* limit,
|
|
uint32_t* shared, uint32_t* non_shared,
|
|
uint32_t* value_offset) {
|
|
// We need 2 bytes for shared and non_shared size. We also need one more
|
|
// byte either for value size or the actual value in case of value delta
|
|
// encoding.
|
|
if (limit - p < 3) {
|
|
return nullptr;
|
|
}
|
|
*shared = reinterpret_cast<const unsigned char*>(p)[0];
|
|
*non_shared = reinterpret_cast<const unsigned char*>(p)[1];
|
|
if ((*shared | *non_shared) < 128) {
|
|
// Fast path: all three values are encoded in one byte each
|
|
p += 2;
|
|
} else {
|
|
if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) {
|
|
return nullptr;
|
|
}
|
|
if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) {
|
|
return nullptr;
|
|
}
|
|
}
|
|
|
|
if (value_offset) {
|
|
if ((p = GetVarint32Ptr(p, limit, value_offset)) == nullptr) {
|
|
return nullptr;
|
|
}
|
|
}
|
|
return p;
|
|
}
|
|
};
|
|
|
|
struct DecodeEntryV4 {
|
|
inline const char* operator()(const char* p, const char* limit,
|
|
uint32_t* shared, uint32_t* non_shared,
|
|
uint32_t* value_length,
|
|
uint32_t* value_offset) {
|
|
assert(value_length);
|
|
|
|
*value_length = 0;
|
|
return DecodeKeyV4()(p, limit, shared, non_shared, value_offset);
|
|
}
|
|
};
|
|
|
|
// Read first 8 bytes (starting at offset) as big-endian uint64_t, padding
|
|
// with zeros on the right if the key is shorter. This preserves
|
|
// lexicographic ordering.
|
|
//
|
|
// If s.size() <= offset, then returns 0.
|
|
inline uint64_t ReadBe64FromKey(Slice s, bool is_user_key, size_t offset) {
|
|
if (!is_user_key) {
|
|
assert(s.size() >= kNumInternalBytes);
|
|
s = Slice(s.data(), s.size() - kNumInternalBytes);
|
|
}
|
|
offset = std::min(offset, s.size());
|
|
size_t remaining = s.size() - offset;
|
|
|
|
// fast path
|
|
if (remaining >= 8) {
|
|
uint64_t val;
|
|
memcpy(&val, s.data() + offset, sizeof(val));
|
|
if (port::kLittleEndian) {
|
|
return EndianSwapValue(val);
|
|
}
|
|
return val;
|
|
}
|
|
|
|
uint64_t val = 0;
|
|
for (size_t i = 0; i < remaining; i++) {
|
|
val = (val << 8) | static_cast<uint8_t>(s.data()[offset + i]);
|
|
}
|
|
if (remaining > 0) {
|
|
val <<= (8 - remaining) * 8; // Pad zeros on the right
|
|
}
|
|
return val;
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|