Summary: Add automatic per-block interpolation search selection (`kAuto` mode) for index blocks. During SST construction, each index block's key distribution is analyzed using the coefficient of variation (CV) of gaps between restart-point keys. Blocks with uniformly distributed keys are flagged via a new bit in the data block footer, and at read time, `kAuto` resolves to interpolation search for uniform blocks and binary search otherwise. ## Key changes - **New `BlockSearchType::kAuto` enum value**: Resolves per-block at read time to either `kInterpolation` or `kBinary` based on the block's uniformity flag. Falls back to `kBinary` on older versions that don't recognize it. - **Write-path uniformity analysis**: `BlockBuilder::ScanForUniformity()` uses Welford's online algorithm to incrementally compute the CV of key gaps at restart points. The result is stored in a new bit (bit 30) of the data block footer's packed restart count. - **New table option `uniform_cv_threshold`** (default: -1 `disabled`): Controls how strict the uniformity check is. Set to negative to disable. Exposed in C++, Java (JNI), and `db_bench`. - **Code reorganization**: Block entry decode helpers (`DecodeEntry`, `DecodeKey`, `DecodeKeyV4`, `ReadBe64FromKey`) moved from `block.cc` to a new shared header `block_util.h` so they can be reused by `BlockBuilder` on the write path. - **New histogram `BLOCK_KEY_DISTRIBUTION_CV`**: Records the CV (scaled by 10000) of each index block's key distribution for observability. - **Java bindings**: `IndexSearchType.kAuto`, `uniformCvThreshold` getter/setter, JNI portal constructor signature updated, and `HistogramType.BLOCK_KEY_DISTRIBUTION_CV` added. Pull Request resolved: https://github.com/facebook/rocksdb/pull/14383 Test Plan: - `IndexBlockTest.IndexValueEncodingTest` parameterized to include `kAuto` search type alongside `kBinary` and `kInterpolation`, verifying correct seek/iteration behavior across all combinations of key distributions, restart intervals, and key lengths. - Uniformity detection validated: blocks with uniform key distribution correctly set `is_uniform = true`, blocks with clustered/non-uniform keys set `is_uniform = false`. - Stress test coverage - Updated check_format_compatible to also include a "uniform" dataset. By default using uniform_cv_threshold=-1 does not result in an incompatibility issues. When manually changing the threshold (e.g. `uniform_cv_threshold=1000`), I see `bad block contents`, which is expected ## Benchmark readrandom with `fillrandom,compact -seed=1 --statistics`: | Benchmark | Branch | Params | avg ops/s | % change vs main | CV P50 | |-----------|--------|--------|-----------|------------------|--------| | readrandom | main | `binary_search, shortening=1` | 335,791 | baseline | N/A | | readrandom | feature | `binary_search, shortening=1` (default) | 335,749 | -0.0% | 1,500 | | readrandom | feature | `auto_search, shortening=1` (kAuto) | 366,832 | **+9.2%** | 1,500 | | readrandom | feature | `interpolation_search, shortening=1` | 366,598 | **+9.2%** | 1,500 | | readrandom | feature | `auto_search, shortening=2` (kAuto) | 344,631 | **+2.6%** | 1,030,000 | | readrandom | feature | `interpolation_search, shortening=2` | 201,178 | **-40.1%** | 1,030,000 | As seen with shortening=2, a non-uniform distribution produces a high CV, which does not use interpolation search. ## Write benchmark There is a write overhead which scans each restart entry for a block upon Finish. In practice this is very low because currently it is only applied to index blocks. See cpu profile (https://fburl.com/strobelight/io5hwj9h) here of `-benchmarks=fillseq,compact -compression_type=none -disable_wal=1`. Only 0.08% attributed to `ScanForUniformity`. Reviewed By: pdillinger Differential Revision: D94738890 Pulled By: joshkang97 fbshipit-source-id: 9661ac593c5fef89d49f3a8a027f1338a0c96766
158 lines
7.1 KiB
C++
158 lines
7.1 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// This file implements the "bridge" between Java and C++ for
|
|
// ROCKSDB_NAMESPACE::Options.
|
|
|
|
#include "rocksdb/table.h"
|
|
|
|
#include <jni.h>
|
|
|
|
#include "include/org_rocksdb_BlockBasedTableConfig.h"
|
|
#include "include/org_rocksdb_PlainTableConfig.h"
|
|
#include "portal.h"
|
|
#include "rocksdb/cache.h"
|
|
#include "rocksdb/filter_policy.h"
|
|
#include "rocksjni/cplusplus_to_java_convert.h"
|
|
|
|
/*
|
|
* Class: org_rocksdb_PlainTableConfig
|
|
* Method: newTableFactoryHandle
|
|
* Signature: (IIDIIBZZ)J
|
|
*/
|
|
jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
|
|
JNIEnv* /*env*/, jclass /*jcls*/, jint jkey_size, jint jbloom_bits_per_key,
|
|
jdouble jhash_table_ratio, jint jindex_sparseness, jint jhuge_page_tlb_size,
|
|
jbyte jencoding_type, jboolean jfull_scan_mode,
|
|
jboolean jstore_index_in_file) {
|
|
ROCKSDB_NAMESPACE::PlainTableOptions options =
|
|
ROCKSDB_NAMESPACE::PlainTableOptions();
|
|
options.user_key_len = jkey_size;
|
|
options.bloom_bits_per_key = jbloom_bits_per_key;
|
|
options.hash_table_ratio = jhash_table_ratio;
|
|
options.index_sparseness = jindex_sparseness;
|
|
options.huge_page_tlb_size = jhuge_page_tlb_size;
|
|
options.encoding_type =
|
|
static_cast<ROCKSDB_NAMESPACE::EncodingType>(jencoding_type);
|
|
options.full_scan_mode = jfull_scan_mode;
|
|
options.store_index_in_file = jstore_index_in_file;
|
|
return GET_CPLUSPLUS_POINTER(
|
|
ROCKSDB_NAMESPACE::NewPlainTableFactory(options));
|
|
}
|
|
|
|
/*
|
|
* Class: org_rocksdb_BlockBasedTableConfig
|
|
* Method: newTableFactoryHandle
|
|
* Signature: (ZZZZBBDBZJJJIIIJZZZJZZIIZZJJBBJI)J
|
|
*/
|
|
jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
|
|
JNIEnv*, jclass, jboolean jcache_index_and_filter_blocks,
|
|
jboolean jcache_index_and_filter_blocks_with_high_priority,
|
|
jboolean jpin_l0_filter_and_index_blocks_in_cache,
|
|
jboolean jpin_top_level_index_and_filter, jbyte jindex_type_value,
|
|
jbyte jdata_block_index_type_value,
|
|
jdouble jdata_block_hash_table_util_ratio, jbyte jchecksum_type_value,
|
|
jboolean jno_block_cache, jlong jblock_cache_handle,
|
|
jlong jpersistent_cache_handle, jlong jblock_size,
|
|
jint jblock_size_deviation, jint jblock_restart_interval,
|
|
jint jindex_block_restart_interval, jlong jmetadata_block_size,
|
|
jboolean jpartition_filters, jboolean joptimize_filters_for_memory,
|
|
jboolean juse_delta_encoding, jlong jfilter_policy_handle,
|
|
jboolean jwhole_key_filtering, jboolean jverify_compression,
|
|
jint jread_amp_bytes_per_bit, jint jformat_version,
|
|
jboolean jseparate_key_value_in_data_block, jdouble juniform_cv_threshold,
|
|
jboolean jenable_index_compression, jboolean jblock_align,
|
|
jlong jsuper_block_alignment_size,
|
|
jlong jsuper_block_alignment_space_overhead_ratio, jbyte jindex_shortening,
|
|
jbyte jindex_search_type, jlong jblock_cache_size,
|
|
jint jblock_cache_num_shard_bits) {
|
|
ROCKSDB_NAMESPACE::BlockBasedTableOptions options;
|
|
options.cache_index_and_filter_blocks =
|
|
static_cast<bool>(jcache_index_and_filter_blocks);
|
|
options.cache_index_and_filter_blocks_with_high_priority =
|
|
static_cast<bool>(jcache_index_and_filter_blocks_with_high_priority);
|
|
options.pin_l0_filter_and_index_blocks_in_cache =
|
|
static_cast<bool>(jpin_l0_filter_and_index_blocks_in_cache);
|
|
options.pin_top_level_index_and_filter =
|
|
static_cast<bool>(jpin_top_level_index_and_filter);
|
|
options.index_type =
|
|
ROCKSDB_NAMESPACE::IndexTypeJni::toCppIndexType(jindex_type_value);
|
|
options.data_block_index_type =
|
|
ROCKSDB_NAMESPACE::DataBlockIndexTypeJni::toCppDataBlockIndexType(
|
|
jdata_block_index_type_value);
|
|
options.data_block_hash_table_util_ratio =
|
|
static_cast<double>(jdata_block_hash_table_util_ratio);
|
|
options.checksum = ROCKSDB_NAMESPACE::ChecksumTypeJni::toCppChecksumType(
|
|
jchecksum_type_value);
|
|
options.no_block_cache = static_cast<bool>(jno_block_cache);
|
|
if (options.no_block_cache) {
|
|
options.block_cache = nullptr;
|
|
} else {
|
|
if (jblock_cache_handle > 0) {
|
|
std::shared_ptr<ROCKSDB_NAMESPACE::Cache>* pCache =
|
|
reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::Cache>*>(
|
|
jblock_cache_handle);
|
|
options.block_cache = *pCache;
|
|
} else if (jblock_cache_size >= 0) {
|
|
if (jblock_cache_num_shard_bits > 0) {
|
|
options.block_cache = ROCKSDB_NAMESPACE::NewLRUCache(
|
|
static_cast<size_t>(jblock_cache_size),
|
|
static_cast<int>(jblock_cache_num_shard_bits));
|
|
} else {
|
|
options.block_cache = ROCKSDB_NAMESPACE::NewLRUCache(
|
|
static_cast<size_t>(jblock_cache_size));
|
|
}
|
|
} else {
|
|
options.no_block_cache = true;
|
|
options.block_cache = nullptr;
|
|
}
|
|
}
|
|
if (jpersistent_cache_handle > 0) {
|
|
std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache>* pCache =
|
|
reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::PersistentCache>*>(
|
|
jpersistent_cache_handle);
|
|
options.persistent_cache = *pCache;
|
|
}
|
|
options.block_size = static_cast<size_t>(jblock_size);
|
|
options.block_size_deviation = static_cast<int>(jblock_size_deviation);
|
|
options.block_restart_interval = static_cast<int>(jblock_restart_interval);
|
|
options.index_block_restart_interval =
|
|
static_cast<int>(jindex_block_restart_interval);
|
|
options.metadata_block_size = static_cast<uint64_t>(jmetadata_block_size);
|
|
options.partition_filters = static_cast<bool>(jpartition_filters);
|
|
options.optimize_filters_for_memory =
|
|
static_cast<bool>(joptimize_filters_for_memory);
|
|
options.use_delta_encoding = static_cast<bool>(juse_delta_encoding);
|
|
if (jfilter_policy_handle > 0) {
|
|
std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy>* pFilterPolicy =
|
|
reinterpret_cast<std::shared_ptr<ROCKSDB_NAMESPACE::FilterPolicy>*>(
|
|
jfilter_policy_handle);
|
|
options.filter_policy = *pFilterPolicy;
|
|
}
|
|
options.whole_key_filtering = static_cast<bool>(jwhole_key_filtering);
|
|
options.verify_compression = static_cast<bool>(jverify_compression);
|
|
options.read_amp_bytes_per_bit =
|
|
static_cast<uint32_t>(jread_amp_bytes_per_bit);
|
|
options.format_version = static_cast<uint32_t>(jformat_version);
|
|
options.separate_key_value_in_data_block =
|
|
static_cast<bool>(jseparate_key_value_in_data_block);
|
|
options.uniform_cv_threshold = static_cast<double>(juniform_cv_threshold);
|
|
options.enable_index_compression =
|
|
static_cast<bool>(jenable_index_compression);
|
|
options.block_align = static_cast<bool>(jblock_align);
|
|
options.super_block_alignment_size =
|
|
static_cast<size_t>(jsuper_block_alignment_size);
|
|
options.super_block_alignment_space_overhead_ratio =
|
|
static_cast<size_t>(jsuper_block_alignment_space_overhead_ratio);
|
|
options.index_shortening =
|
|
ROCKSDB_NAMESPACE::IndexShorteningModeJni::toCppIndexShorteningMode(
|
|
jindex_shortening);
|
|
options.index_block_search_type =
|
|
ROCKSDB_NAMESPACE::IndexSearchTypeJni::toCppIndexSearchType(
|
|
jindex_search_type);
|
|
|
|
return GET_CPLUSPLUS_POINTER(
|
|
ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(options));
|
|
}
|