Summary: Introduce new table option with separated key-value storage in data blocks. This PR implements a new SST block format where keys and values are stored in separate sections within data blocks, rather than interleaved. Keys are stored first, followed by all values in a contiguous section. The motivation is better cpu cache hit rate during seeks and potentially better compression. The additional storage cost is a varint per restart point, and 4 bytes additional block footer. For a data block with a restart interval of 16, it is approximately 1 bit of overhead per entry. But compression actually performs better, resulting in ~3% storage savings from benchmark. For now I've opted to not separate kvs in non-data blocks since restart interval for those blocks is typically 1, and values are typically small and probably better inlined. ### New block layout ``` +------------------+ | Keys Section | <- Key entries with delta encoding +------------------+ | Values Section | <- (new) Values stored contiguously +------------------+ | Restart Array | <- Fixed32 offsets to restart points +------------------+ | Values Offset | <- (new) 4 bytes: offset to values section | Footer | <- 4 bytes: packed index_type + num_restarts +------------------+ ``` ### Entry Format **At restart points** ``` +--------------+------------------+----------------+-----------------+-----------+ | shared (v32) | non_shared (v32) | value_sz (v32) | value_off (v32) | key_delta | +--------------+------------------+----------------+-----------------+-----------+ ``` **At non-restart points** ``` +--------------+------------------+----------------+-----------+ | shared (v32) | non_shared (v32) | value_sz (v32) | key_delta | +--------------+------------------+----------------+-----------+ ``` - `value_offset` is only stored at restart points to save space - For non-restart entries, value offset is computed as: `prev_value_offset + prev_value_size` ### Forward Compatibility - We make use of reserved block footer bits to mark if a block has separated kv format. Should an older version read this, it will assume a very large block restart interval and result in a corruption error. ### Key Changes - **BlockBuilder**: Accumulates values in a separate buffer; value offsets are stored only at restart points (other entries derive offset from previous value's position). There is an additional memcpy cost to place the value data after the key data. - **Block iteration**: Iteration now needs to know if we are at a restart point. This will rely on `cur_entry_idx_`, which was previously only used for per-kv checksum purposes. In this new format, we also need to know the block_restart_interval, which was previously also only calculated for per-kv checksums. - **Table properties**: Store `data_block_restart_interval`, `index_block_restart_interval`, and `separate_key_value_in_data_block` in table properties Pull Request resolved: https://github.com/facebook/rocksdb/pull/14287 Test Plan: - Extended block_test, table_test, compaction_test to contain new separated_kv param - Added new parameter to crash test --- ## Benchmark ### Varying Value Size **Write:** `db_bench --num=1000000 --separate_key_value_in_data_block=<bool> --value_size=<X> --benchmarks=fillrandom,compact` **Read:** `db_bench --db=$DB --use_existing_db --benchmarks=readrandom,readseq` | Value Size | FillRandom (ops/s) | | Compact (s) | | ReadRandom (ops/s) | | ReadSeq (ops/s) | | SST Size (MB) | | |----------:|-------------------:|-----:|----------:|-----:|-------------------:|-----:|----------------:|-----:|-------------:|-----:| | | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | | 1 | 253,280 | 264,977 | 0.516 | 0.497 (**-3.7%**) | 596,328 | 586,625 (**-1.6%**) | 5,904,681 | 6,069,581 (**+2.8%**) | 5.1 | 4.2 (**-18.6%**) | | 16 | 235,757 | 242,367 | 0.572 | 0.533 (**-6.8%**) | 570,751 | 572,815 (**+0.4%**) | 5,371,138 | 5,604,908 (**+4.4%**) | 14.7 | 13.4 (**-8.5%**) | | 100 | 264,299 | 265,427 | 0.461 | 0.454 (**-1.5%**) | 323,696 | 332,790 (**+2.8%**) | 4,239,725 | 4,232,416 (**-0.2%**) | 38.8 | 37.6 (**-3.2%**) | | 1,000 | 238,992 | 242,764 | 2.349 | 2.329 (**-0.9%**) | 244,608 | 261,403 (**+6.9%**) | 1,285,394 | 1,265,868 (**-1.5%**) | 342.1 | 342.0 (**-0.0%**) | ### Varying Block Restart Interval **Write:** `db_bench --num=1000000 --separate_key_value_in_data_block=<bool> --block_restart_interval=<X> --benchmarks=fillrandom,compact` **Read:** `db_bench --db=$DB --use_existing_db --benchmarks=readrandom,readseq` | BRI | FillRandom (ops/s) | | Compact (s) | | ReadRandom (ops/s) | | ReadSeq (ops/s) | | SST Size (MB) | | |----:|-------------------:|-----:|----------:|-----:|-------------------:|-----:|----------------:|-----:|-------------:|-----:| | | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | | 1 | 251,654 | 263,707 | 0.453 | 0.485 (**+7.1%**) | 334,653 | 328,708 (**-1.8%**) | 4,194,342 | 3,954,291 (**-5.7%**) | 40.6 | 42.4 (**+4.5%**) | | 4 | 253,797 | 252,394 | 0.476 | 0.476 (**+0.0%**) | 332,719 | 341,676 (**+2.7%**) | 4,135,691 | 4,051,151 (**-2.0%**) | 39.2 | 39.3 (**+0.3%**) | | 8 | 260,143 | 262,273 | 0.496 | 0.460 (**-7.3%**) | 330,859 | 337,567 (**+2.0%**) | 4,144,081 | 4,187,389 (**+1.0%**) | 38.9 | 38.1 (**-2.1%**) | | 16 | 252,875 | 263,176 | 0.464 | 0.455 (**-1.9%**) | 323,783 | 335,418 (**+3.6%**) | 4,127,310 | 4,217,028 (**+2.2%**) | 38.8 | 37.6 (**-3.2%**) | | 32 | 260,224 | 269,422 | 0.464 | 0.451 (**-2.8%**) | 304,001 | 314,989 (**+3.6%**) | 4,310,162 | 4,247,248 (**-1.5%**) | 38.8 | 37.3 (**-3.8%**) | ### Varying Compression **Write:** `db_bench --num=1000000 --separate_key_value_in_data_block=<bool> --compression_type=<X> [--compression_level=<N>] --benchmarks=fillrandom,compact` **Read:** `db_bench --db=$DB --use_existing_db --benchmarks=readrandom,readseq` | Compression | FillRandom (ops/s) | | Compact (s) | | ReadRandom (ops/s) | | ReadSeq (ops/s) | | SST Size (MB) | | |------------|-------------------:|-----:|----------:|-----:|-------------------:|-----:|----------------:|-----:|-------------:|-----:| | | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | | None | 252,494 | 260,552 | 0.413 | 0.419 (**+1.5%**) | 356,290 | 371,535 (**+4.3%**) | 4,479,261 | 4,507,133 (**+0.6%**) | 73.5 | 73.6 (**+0.2%**) | | LZ4 | 246,010 | 256,360 | 0.477 | 0.455 (**-4.6%**) | 342,497 | 345,882 (**+1.0%**) | 4,400,570 | 4,268,102 (**-3.0%**) | 38.3 | 37.6 (**-2.0%**) | | ZSTD (L3) | 254,748 | 258,556 | 1.067 | 1.055 (**-1.1%**) | 176,724 | 177,566 (**+0.5%**) | 2,736,841 | 2,717,739 (**-0.7%**) | 32.9 | 31.3 (**-4.7%**) | | ZSTD (L6) | 256,459 | 259,388 | 1.556 | 1.462 (**-6.0%**) | 177,390 | 176,691 (**-0.4%**) | 2,754,336 | 2,688,682 (**-2.4%**) | 32.8 | 31.1 (**-5.1%**) | ### Varying Block Size **Write:** `db_bench --num=1000000 --separate_key_value_in_data_block=<bool> --block_size=<X> --benchmarks=fillrandom,compact` **Read:** `db_bench --db=$DB --use_existing_db --benchmarks=readrandom,readseq` | Block Size | FillRandom (ops/s) | | Compact (s) | | ReadRandom (ops/s) | | ReadSeq (ops/s) | | SST Size (MB) | | |-----------:|-------------------:|-----:|----------:|-----:|-------------------:|-----:|----------------:|-----:|-------------:|-----:| | | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | | 4 KB | 263,203 | 260,362 | 0.469 | 0.461 (**-1.7%**) | 324,178 | 332,249 (**+2.5%**) | 4,231,537 | 4,217,763 (**-0.3%**) | 38.8 | 37.6 (**-3.1%**) | | 16 KB | 252,742 | 263,161 | 0.426 | 0.428 (**+0.5%**) | 227,805 | 222,873 (**-2.2%**) | 5,146,997 | 5,081,080 (**-1.3%**) | 38.1 | 36.7 (**-3.6%**) | | 64 KB | 257,490 | 260,225 | 0.423 | 0.414 (**-2.1%**) | 86,807 | 91,586 (**+5.5%**) | 5,380,403 | 5,372,372 (**-0.1%**) | 36.3 | 35.0 (**-3.5%**) | ### Varying Key Size **Write:** `db_bench --num=1000000 --separate_key_value_in_data_block=<bool> --min_key_size=10 --max_key_size=100 --benchmarks=fillrandom,compact` **Read:** `db_bench --db=$DB --use_existing_db --benchmarks=readrandom,readseq` | Key Size | FillRandom (ops/s) | | Compact (s) | | ReadRandom (ops/s) | | ReadSeq (ops/s) | | SST Size (MB) | | |---------:|-------------------:|-----:|----------:|-----:|-------------------:|-----:|----------------:|-----:|-------------:|-----:| | | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | baseline | sep_kv | | 10–100 | 243,740 | 255,183 | 0.618 | 0.622 (**+0.6%**) | 284,304 | 307,569 (**+8.2%**) | 3,738,921 | 3,686,676 (**-1.4%**) | 41.5 | 41.2 (**-0.8%**) | ### CPU Profile Notes - No compression: DataBlock::SeekForGet uses less cpu (13.2% vs 13.9%) - https://fburl.com/strobelight/6mwwebft with separated KV - https://fburl.com/strobelight/m9m798ka without - ZSTD compression: rocksdb::DecompressSerializedBlock uses more CPU (45.8% vs 44.9%), while DataBlock::SeekForGet uses less cpu (5.09% vs 6.52%) - https://fburl.com/strobelight/3x5nw1k4 with separated KV - https://fburl.com/strobelight/e7809046 without --- Reviewed By: xingbowang, pdillinger Differential Revision: D92103024 Pulled By: joshkang97 fbshipit-source-id: 47cfeb656ff3c20d34975f0b6c4c0462935a83dc
670 lines
27 KiB
C++
670 lines
27 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
#include "table/meta_blocks.h"
|
|
|
|
#include <map>
|
|
#include <string>
|
|
|
|
#include "block_fetcher.h"
|
|
#include "db/table_properties_collector.h"
|
|
#include "file/random_access_file_reader.h"
|
|
#include "logging/logging.h"
|
|
#include "rocksdb/options.h"
|
|
#include "rocksdb/table.h"
|
|
#include "rocksdb/table_properties.h"
|
|
#include "table/block_based/block.h"
|
|
#include "table/block_based/reader_common.h"
|
|
#include "table/format.h"
|
|
#include "table/internal_iterator.h"
|
|
#include "table/persistent_cache_helper.h"
|
|
#include "table/sst_file_writer_collectors.h"
|
|
#include "table/table_properties_internal.h"
|
|
#include "test_util/sync_point.h"
|
|
#include "util/coding.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
const std::string kPropertiesBlockName = "rocksdb.properties";
|
|
// NB: only used with format_version >= 6
|
|
const std::string kIndexBlockName = "rocksdb.index";
|
|
const std::string kCompressionDictBlockName = "rocksdb.compression_dict";
|
|
const std::string kRangeDelBlockName = "rocksdb.range_del";
|
|
|
|
MetaIndexBuilder::MetaIndexBuilder()
|
|
: meta_index_block_(new BlockBuilder(1 /* restart interval */)) {}
|
|
|
|
void MetaIndexBuilder::Add(const std::string& key, const BlockHandle& handle) {
|
|
std::string handle_encoding;
|
|
handle.EncodeTo(&handle_encoding);
|
|
meta_block_handles_.insert({key, handle_encoding});
|
|
}
|
|
|
|
Slice MetaIndexBuilder::Finish() {
|
|
for (const auto& metablock : meta_block_handles_) {
|
|
meta_index_block_->Add(metablock.first, metablock.second);
|
|
}
|
|
return meta_index_block_->Finish();
|
|
}
|
|
|
|
// Property block will be read sequentially and cached in a heap located
|
|
// object, so there's no need for restart points. Thus we set the restart
|
|
// interval to infinity to save space.
|
|
PropertyBlockBuilder::PropertyBlockBuilder()
|
|
: properties_block_(new BlockBuilder(
|
|
std::numeric_limits<int32_t>::max() /* restart interval */)) {}
|
|
|
|
void PropertyBlockBuilder::Add(const std::string& name,
|
|
const std::string& val) {
|
|
assert(props_.find(name) == props_.end());
|
|
props_.insert({name, val});
|
|
}
|
|
|
|
void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) {
|
|
std::string dst;
|
|
PutVarint64(&dst, val);
|
|
|
|
Add(name, dst);
|
|
}
|
|
|
|
void PropertyBlockBuilder::Add(
|
|
const UserCollectedProperties& user_collected_properties) {
|
|
for (const auto& prop : user_collected_properties) {
|
|
Add(prop.first, prop.second);
|
|
}
|
|
}
|
|
|
|
void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
|
|
TEST_SYNC_POINT_CALLBACK("PropertyBlockBuilder::AddTableProperty:Start",
|
|
const_cast<TableProperties*>(&props));
|
|
|
|
Add(TablePropertiesNames::kOriginalFileNumber, props.orig_file_number);
|
|
Add(TablePropertiesNames::kRawKeySize, props.raw_key_size);
|
|
Add(TablePropertiesNames::kRawValueSize, props.raw_value_size);
|
|
Add(TablePropertiesNames::kDataSize, props.data_size);
|
|
Add(TablePropertiesNames::kIndexSize, props.index_size);
|
|
if (props.index_partitions != 0) {
|
|
Add(TablePropertiesNames::kIndexPartitions, props.index_partitions);
|
|
Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size);
|
|
}
|
|
Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key);
|
|
Add(TablePropertiesNames::kIndexValueIsDeltaEncoded,
|
|
props.index_value_is_delta_encoded);
|
|
Add(TablePropertiesNames::kNumEntries, props.num_entries);
|
|
Add(TablePropertiesNames::kNumFilterEntries, props.num_filter_entries);
|
|
Add(TablePropertiesNames::kDeletedKeys, props.num_deletions);
|
|
Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands);
|
|
Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions);
|
|
Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
|
|
Add(TablePropertiesNames::kFilterSize, props.filter_size);
|
|
Add(TablePropertiesNames::kFormatVersion, props.format_version);
|
|
Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);
|
|
Add(TablePropertiesNames::kColumnFamilyId, props.column_family_id);
|
|
Add(TablePropertiesNames::kCreationTime, props.creation_time);
|
|
Add(TablePropertiesNames::kOldestKeyTime, props.oldest_key_time);
|
|
Add(TablePropertiesNames::kNewestKeyTime, props.newest_key_time);
|
|
if (props.file_creation_time > 0) {
|
|
Add(TablePropertiesNames::kFileCreationTime, props.file_creation_time);
|
|
}
|
|
if (props.slow_compression_estimated_data_size > 0) {
|
|
Add(TablePropertiesNames::kSlowCompressionEstimatedDataSize,
|
|
props.slow_compression_estimated_data_size);
|
|
}
|
|
if (props.fast_compression_estimated_data_size > 0) {
|
|
Add(TablePropertiesNames::kFastCompressionEstimatedDataSize,
|
|
props.fast_compression_estimated_data_size);
|
|
}
|
|
Add(TablePropertiesNames::kTailStartOffset, props.tail_start_offset);
|
|
if (props.user_defined_timestamps_persisted == 0) {
|
|
Add(TablePropertiesNames::kUserDefinedTimestampsPersisted,
|
|
props.user_defined_timestamps_persisted);
|
|
}
|
|
if (!props.db_id.empty()) {
|
|
Add(TablePropertiesNames::kDbId, props.db_id);
|
|
}
|
|
if (!props.db_session_id.empty()) {
|
|
Add(TablePropertiesNames::kDbSessionId, props.db_session_id);
|
|
}
|
|
if (!props.db_host_id.empty()) {
|
|
Add(TablePropertiesNames::kDbHostId, props.db_host_id);
|
|
}
|
|
|
|
if (!props.filter_policy_name.empty()) {
|
|
Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name);
|
|
}
|
|
if (!props.comparator_name.empty()) {
|
|
Add(TablePropertiesNames::kComparator, props.comparator_name);
|
|
}
|
|
|
|
if (!props.merge_operator_name.empty()) {
|
|
Add(TablePropertiesNames::kMergeOperator, props.merge_operator_name);
|
|
}
|
|
if (!props.prefix_extractor_name.empty()) {
|
|
Add(TablePropertiesNames::kPrefixExtractorName,
|
|
props.prefix_extractor_name);
|
|
}
|
|
if (!props.property_collectors_names.empty()) {
|
|
Add(TablePropertiesNames::kPropertyCollectors,
|
|
props.property_collectors_names);
|
|
}
|
|
if (!props.column_family_name.empty()) {
|
|
Add(TablePropertiesNames::kColumnFamilyName, props.column_family_name);
|
|
}
|
|
|
|
if (!props.compression_name.empty()) {
|
|
Add(TablePropertiesNames::kCompression, props.compression_name);
|
|
}
|
|
if (!props.compression_options.empty()) {
|
|
Add(TablePropertiesNames::kCompressionOptions, props.compression_options);
|
|
}
|
|
if (!props.seqno_to_time_mapping.empty()) {
|
|
Add(TablePropertiesNames::kSequenceNumberTimeMapping,
|
|
props.seqno_to_time_mapping);
|
|
}
|
|
if (props.key_largest_seqno != UINT64_MAX) {
|
|
Add(TablePropertiesNames::kKeyLargestSeqno, props.key_largest_seqno);
|
|
}
|
|
if (props.key_smallest_seqno != UINT64_MAX) {
|
|
Add(TablePropertiesNames::kKeySmallestSeqno, props.key_smallest_seqno);
|
|
}
|
|
if (props.data_block_restart_interval > 0) {
|
|
Add(TablePropertiesNames::kDataBlockRestartInterval,
|
|
props.data_block_restart_interval);
|
|
}
|
|
if (props.index_block_restart_interval > 0) {
|
|
Add(TablePropertiesNames::kIndexBlockRestartInterval,
|
|
props.index_block_restart_interval);
|
|
}
|
|
if (props.separate_key_value_in_data_block > 0) {
|
|
Add(TablePropertiesNames::kSeparateKeyValueInDataBlock,
|
|
props.separate_key_value_in_data_block);
|
|
}
|
|
}
|
|
|
|
Slice PropertyBlockBuilder::Finish() {
|
|
for (const auto& prop : props_) {
|
|
assert(last_prop_added_to_block_.empty() ||
|
|
comparator_->Compare(prop.first, last_prop_added_to_block_) > 0);
|
|
properties_block_->Add(prop.first, prop.second);
|
|
#ifndef NDEBUG
|
|
last_prop_added_to_block_ = prop.first;
|
|
#endif /* !NDEBUG */
|
|
}
|
|
|
|
return properties_block_->Finish();
|
|
}
|
|
|
|
void LogPropertiesCollectionError(Logger* info_log, const std::string& method,
|
|
const std::string& name) {
|
|
assert(method == "Add" || method == "Finish");
|
|
|
|
std::string msg =
|
|
"Encountered error when calling TablePropertiesCollector::" + method +
|
|
"() with collector name: " + name;
|
|
ROCKS_LOG_ERROR(info_log, "%s", msg.c_str());
|
|
}
|
|
|
|
bool NotifyCollectTableCollectorsOnAdd(
|
|
const Slice& key, const Slice& value, uint64_t file_size,
|
|
const std::vector<std::unique_ptr<InternalTblPropColl>>& collectors,
|
|
Logger* info_log) {
|
|
bool all_succeeded = true;
|
|
for (auto& collector : collectors) {
|
|
Status s = collector->InternalAdd(key, value, file_size);
|
|
all_succeeded = all_succeeded && s.ok();
|
|
if (!s.ok()) {
|
|
LogPropertiesCollectionError(info_log, "Add" /* method */,
|
|
collector->Name());
|
|
}
|
|
}
|
|
return all_succeeded;
|
|
}
|
|
|
|
void NotifyCollectTableCollectorsOnBlockAdd(
|
|
const std::vector<std::unique_ptr<InternalTblPropColl>>& collectors,
|
|
const uint64_t block_uncomp_bytes,
|
|
const uint64_t block_compressed_bytes_fast,
|
|
const uint64_t block_compressed_bytes_slow) {
|
|
for (auto& collector : collectors) {
|
|
collector->BlockAdd(block_uncomp_bytes, block_compressed_bytes_fast,
|
|
block_compressed_bytes_slow);
|
|
}
|
|
}
|
|
|
|
bool NotifyCollectTableCollectorsOnFinish(
|
|
const std::vector<std::unique_ptr<InternalTblPropColl>>& collectors,
|
|
Logger* info_log, PropertyBlockBuilder* builder,
|
|
UserCollectedProperties& user_collected_properties,
|
|
UserCollectedProperties& readable_properties) {
|
|
bool all_succeeded = true;
|
|
for (auto& collector : collectors) {
|
|
UserCollectedProperties user_properties;
|
|
Status s = collector->Finish(&user_properties);
|
|
if (s.ok()) {
|
|
for (const auto& prop : collector->GetReadableProperties()) {
|
|
readable_properties.insert(prop);
|
|
}
|
|
#ifndef NDEBUG
|
|
// Check different user properties collectors are not adding properties of
|
|
// the same name.
|
|
for (const auto& pair : user_properties) {
|
|
assert(user_collected_properties.find(pair.first) ==
|
|
user_collected_properties.end());
|
|
}
|
|
#endif /* !NDEBUG */
|
|
user_collected_properties.merge(user_properties);
|
|
} else {
|
|
LogPropertiesCollectionError(info_log, "Finish" /* method */,
|
|
collector->Name());
|
|
if (all_succeeded) {
|
|
all_succeeded = false;
|
|
}
|
|
}
|
|
}
|
|
builder->Add(user_collected_properties);
|
|
return all_succeeded;
|
|
}
|
|
|
|
Status ParsePropertiesBlock(
|
|
const ImmutableOptions& ioptions, uint64_t offset, Block& properties_block,
|
|
std::unique_ptr<TableProperties>& new_table_properties) {
|
|
std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
|
|
|
|
// All pre-defined properties of type uint64_t
|
|
std::unordered_map<std::string, uint64_t*> predefined_uint64_properties = {
|
|
{TablePropertiesNames::kOriginalFileNumber,
|
|
&new_table_properties->orig_file_number},
|
|
{TablePropertiesNames::kDataSize, &new_table_properties->data_size},
|
|
{TablePropertiesNames::kIndexSize, &new_table_properties->index_size},
|
|
{TablePropertiesNames::kIndexPartitions,
|
|
&new_table_properties->index_partitions},
|
|
{TablePropertiesNames::kTopLevelIndexSize,
|
|
&new_table_properties->top_level_index_size},
|
|
{TablePropertiesNames::kIndexKeyIsUserKey,
|
|
&new_table_properties->index_key_is_user_key},
|
|
{TablePropertiesNames::kIndexValueIsDeltaEncoded,
|
|
&new_table_properties->index_value_is_delta_encoded},
|
|
{TablePropertiesNames::kFilterSize, &new_table_properties->filter_size},
|
|
{TablePropertiesNames::kRawKeySize, &new_table_properties->raw_key_size},
|
|
{TablePropertiesNames::kRawValueSize,
|
|
&new_table_properties->raw_value_size},
|
|
{TablePropertiesNames::kNumDataBlocks,
|
|
&new_table_properties->num_data_blocks},
|
|
{TablePropertiesNames::kNumEntries, &new_table_properties->num_entries},
|
|
{TablePropertiesNames::kNumFilterEntries,
|
|
&new_table_properties->num_filter_entries},
|
|
{TablePropertiesNames::kDeletedKeys,
|
|
&new_table_properties->num_deletions},
|
|
{TablePropertiesNames::kMergeOperands,
|
|
&new_table_properties->num_merge_operands},
|
|
{TablePropertiesNames::kNumRangeDeletions,
|
|
&new_table_properties->num_range_deletions},
|
|
{TablePropertiesNames::kFormatVersion,
|
|
&new_table_properties->format_version},
|
|
{TablePropertiesNames::kFixedKeyLen,
|
|
&new_table_properties->fixed_key_len},
|
|
{TablePropertiesNames::kColumnFamilyId,
|
|
&new_table_properties->column_family_id},
|
|
{TablePropertiesNames::kCreationTime,
|
|
&new_table_properties->creation_time},
|
|
{TablePropertiesNames::kOldestKeyTime,
|
|
&new_table_properties->oldest_key_time},
|
|
{TablePropertiesNames::kNewestKeyTime,
|
|
&new_table_properties->newest_key_time},
|
|
{TablePropertiesNames::kFileCreationTime,
|
|
&new_table_properties->file_creation_time},
|
|
{TablePropertiesNames::kSlowCompressionEstimatedDataSize,
|
|
&new_table_properties->slow_compression_estimated_data_size},
|
|
{TablePropertiesNames::kFastCompressionEstimatedDataSize,
|
|
&new_table_properties->fast_compression_estimated_data_size},
|
|
{TablePropertiesNames::kTailStartOffset,
|
|
&new_table_properties->tail_start_offset},
|
|
{TablePropertiesNames::kUserDefinedTimestampsPersisted,
|
|
&new_table_properties->user_defined_timestamps_persisted},
|
|
{TablePropertiesNames::kKeyLargestSeqno,
|
|
&new_table_properties->key_largest_seqno},
|
|
{TablePropertiesNames::kKeySmallestSeqno,
|
|
&new_table_properties->key_smallest_seqno},
|
|
{TablePropertiesNames::kDataBlockRestartInterval,
|
|
&new_table_properties->data_block_restart_interval},
|
|
{TablePropertiesNames::kIndexBlockRestartInterval,
|
|
&new_table_properties->index_block_restart_interval},
|
|
{TablePropertiesNames::kSeparateKeyValueInDataBlock,
|
|
&new_table_properties->separate_key_value_in_data_block},
|
|
};
|
|
|
|
Status s;
|
|
std::string last_key;
|
|
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
|
|
s = iter->status();
|
|
if (!s.ok()) {
|
|
break;
|
|
}
|
|
|
|
auto key = iter->key().ToString();
|
|
// properties block should be strictly sorted with no duplicate key.
|
|
if (!last_key.empty() &&
|
|
BytewiseComparator()->Compare(key, last_key) <= 0) {
|
|
s = Status::Corruption("properties unsorted");
|
|
break;
|
|
}
|
|
last_key = key;
|
|
|
|
auto raw_val = iter->value();
|
|
auto pos = predefined_uint64_properties.find(key);
|
|
|
|
if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
|
|
new_table_properties->external_sst_file_global_seqno_offset =
|
|
offset + iter->ValueOffset();
|
|
}
|
|
|
|
if (pos != predefined_uint64_properties.end()) {
|
|
if (key == TablePropertiesNames::kDeletedKeys ||
|
|
key == TablePropertiesNames::kMergeOperands) {
|
|
// Insert in user-collected properties for API backwards compatibility
|
|
new_table_properties->user_collected_properties.insert(
|
|
{key, raw_val.ToString()});
|
|
}
|
|
// handle predefined rocksdb properties
|
|
uint64_t val;
|
|
if (!GetVarint64(&raw_val, &val)) {
|
|
// skip malformed value
|
|
auto error_msg =
|
|
"Detect malformed value in properties meta-block:"
|
|
"\tkey: " +
|
|
key + "\tval: " + raw_val.ToString();
|
|
ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
|
|
continue;
|
|
}
|
|
*(pos->second) = val;
|
|
} else if (key == TablePropertiesNames::kDbId) {
|
|
new_table_properties->db_id = raw_val.ToString();
|
|
} else if (key == TablePropertiesNames::kDbSessionId) {
|
|
new_table_properties->db_session_id = raw_val.ToString();
|
|
} else if (key == TablePropertiesNames::kDbHostId) {
|
|
new_table_properties->db_host_id = raw_val.ToString();
|
|
} else if (key == TablePropertiesNames::kFilterPolicy) {
|
|
new_table_properties->filter_policy_name = raw_val.ToString();
|
|
} else if (key == TablePropertiesNames::kColumnFamilyName) {
|
|
new_table_properties->column_family_name = raw_val.ToString();
|
|
} else if (key == TablePropertiesNames::kComparator) {
|
|
new_table_properties->comparator_name = raw_val.ToString();
|
|
} else if (key == TablePropertiesNames::kMergeOperator) {
|
|
new_table_properties->merge_operator_name = raw_val.ToString();
|
|
} else if (key == TablePropertiesNames::kPrefixExtractorName) {
|
|
new_table_properties->prefix_extractor_name = raw_val.ToString();
|
|
} else if (key == TablePropertiesNames::kPropertyCollectors) {
|
|
new_table_properties->property_collectors_names = raw_val.ToString();
|
|
} else if (key == TablePropertiesNames::kCompression) {
|
|
new_table_properties->compression_name = raw_val.ToString();
|
|
} else if (key == TablePropertiesNames::kCompressionOptions) {
|
|
new_table_properties->compression_options = raw_val.ToString();
|
|
} else if (key == TablePropertiesNames::kSequenceNumberTimeMapping) {
|
|
new_table_properties->seqno_to_time_mapping = raw_val.ToString();
|
|
} else {
|
|
// handle user-collected properties
|
|
new_table_properties->user_collected_properties.insert(
|
|
{key, raw_val.ToString()});
|
|
}
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
// FIXME: should be a parameter for reading table properties to use persistent
|
|
// cache?
|
|
Status ReadTablePropertiesHelper(
|
|
const ReadOptions& ro, const BlockHandle& handle,
|
|
RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
|
|
const Footer& footer, const ImmutableOptions& ioptions,
|
|
std::unique_ptr<TableProperties>* table_properties,
|
|
MemoryAllocator* memory_allocator) {
|
|
assert(table_properties);
|
|
|
|
Status s;
|
|
bool retry = false;
|
|
while (true) {
|
|
BlockContents block_contents;
|
|
size_t len = handle.size() + footer.GetBlockTrailerSize();
|
|
// If this is an external SST file ingested with write_global_seqno set to
|
|
// true, then we expect the checksum mismatch because checksum was written
|
|
// by SstFileWriter, but its global seqno in the properties block may have
|
|
// been changed during ingestion. For this reason, we initially read
|
|
// and process without checksum verification, then later try checksum
|
|
// verification so that if it fails, we can copy to a temporary buffer with
|
|
// global seqno set to its original value, i.e. 0, and attempt checksum
|
|
// verification again.
|
|
if (!retry) {
|
|
ReadOptions modified_ro = ro;
|
|
modified_ro.verify_checksums = false;
|
|
BlockFetcher block_fetcher(
|
|
file, prefetch_buffer, footer, modified_ro, handle, &block_contents,
|
|
ioptions, false /* decompress */, false /*maybe_compressed*/,
|
|
BlockType::kProperties, nullptr /*decompressor*/,
|
|
PersistentCacheOptions::kEmpty, memory_allocator);
|
|
s = block_fetcher.ReadBlockContents();
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
assert(block_fetcher.GetBlockSizeWithTrailer() == len);
|
|
TEST_SYNC_POINT_CALLBACK("ReadTablePropertiesHelper:0",
|
|
&block_contents.data);
|
|
} else {
|
|
assert(s.IsCorruption());
|
|
// If retrying, use a stronger file system read to check and correct
|
|
// data corruption
|
|
IOOptions opts;
|
|
IODebugContext dbg;
|
|
if (PrepareIOFromReadOptions(ro, ioptions.clock, opts, &dbg) !=
|
|
IOStatus::OK()) {
|
|
return s;
|
|
}
|
|
opts.verify_and_reconstruct_read = true;
|
|
std::unique_ptr<char[]> data(new char[len]);
|
|
Slice result;
|
|
IOStatus io_s = file->Read(opts, handle.offset(), len, &result,
|
|
data.get(), nullptr, &dbg);
|
|
RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
|
|
if (!io_s.ok()) {
|
|
ROCKS_LOG_INFO(ioptions.info_log,
|
|
"Reading properties block failed - %s",
|
|
io_s.ToString().c_str());
|
|
// Return the original corruption error as that's more serious
|
|
return s;
|
|
}
|
|
if (result.size() < len) {
|
|
return Status::Corruption("Reading properties block failed - " +
|
|
std::to_string(result.size()) +
|
|
" bytes read");
|
|
}
|
|
RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
|
|
block_contents = BlockContents(std::move(data), handle.size());
|
|
}
|
|
|
|
uint64_t block_size = block_contents.data.size();
|
|
Block properties_block(std::move(block_contents));
|
|
std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
|
|
s = ParsePropertiesBlock(ioptions, handle.offset(), properties_block,
|
|
new_table_properties);
|
|
|
|
// Modified version of BlockFetcher checksum verification
|
|
// (See write_global_seqno comment above)
|
|
if (s.ok() && footer.GetBlockTrailerSize() > 0) {
|
|
s = VerifyBlockChecksum(footer, properties_block.data(), block_size,
|
|
file->file_name(), handle.offset(),
|
|
BlockType::kProperties);
|
|
if (s.IsCorruption()) {
|
|
if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
|
|
std::string tmp_buf(properties_block.data(), len);
|
|
uint64_t global_seqno_offset =
|
|
new_table_properties->external_sst_file_global_seqno_offset -
|
|
handle.offset();
|
|
EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
|
|
s = VerifyBlockChecksum(footer, tmp_buf.data(), block_size,
|
|
file->file_name(), handle.offset(),
|
|
BlockType::kProperties);
|
|
}
|
|
}
|
|
}
|
|
|
|
// If we detected a corruption and the file system supports verification
|
|
// and reconstruction, retry the read
|
|
if (s.IsCorruption() && !retry &&
|
|
CheckFSFeatureSupport(ioptions.fs.get(),
|
|
FSSupportedOps::kVerifyAndReconstructRead)) {
|
|
retry = true;
|
|
} else {
|
|
if (s.ok()) {
|
|
*table_properties = std::move(new_table_properties);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
|
|
uint64_t table_magic_number,
|
|
const ImmutableOptions& ioptions,
|
|
const ReadOptions& read_options,
|
|
std::unique_ptr<TableProperties>* properties,
|
|
MemoryAllocator* memory_allocator,
|
|
FilePrefetchBuffer* prefetch_buffer) {
|
|
BlockHandle block_handle;
|
|
Footer footer;
|
|
Status s =
|
|
FindMetaBlockInFile(file, file_size, table_magic_number, ioptions,
|
|
read_options, kPropertiesBlockName, &block_handle,
|
|
memory_allocator, prefetch_buffer, &footer);
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
|
|
if (!block_handle.IsNull()) {
|
|
s = ReadTablePropertiesHelper(read_options, block_handle, file,
|
|
prefetch_buffer, footer, ioptions, properties,
|
|
memory_allocator);
|
|
} else {
|
|
s = Status::NotFound();
|
|
}
|
|
return s;
|
|
}
|
|
|
|
Status FindOptionalMetaBlock(InternalIterator* meta_index_iter,
|
|
const std::string& meta_block_name,
|
|
BlockHandle* block_handle) {
|
|
assert(block_handle != nullptr);
|
|
meta_index_iter->Seek(meta_block_name);
|
|
if (meta_index_iter->status().ok()) {
|
|
if (meta_index_iter->Valid() && meta_index_iter->key() == meta_block_name) {
|
|
Slice v = meta_index_iter->value();
|
|
return block_handle->DecodeFrom(&v);
|
|
}
|
|
}
|
|
// else
|
|
*block_handle = BlockHandle::NullBlockHandle();
|
|
return meta_index_iter->status();
|
|
}
|
|
|
|
Status FindMetaBlock(InternalIterator* meta_index_iter,
|
|
const std::string& meta_block_name,
|
|
BlockHandle* block_handle) {
|
|
Status s =
|
|
FindOptionalMetaBlock(meta_index_iter, meta_block_name, block_handle);
|
|
if (s.ok() && block_handle->IsNull()) {
|
|
return Status::Corruption("Cannot find the meta block", meta_block_name);
|
|
} else {
|
|
return s;
|
|
}
|
|
}
|
|
|
|
Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file,
|
|
uint64_t file_size, uint64_t table_magic_number,
|
|
const ImmutableOptions& ioptions,
|
|
const ReadOptions& read_options,
|
|
BlockContents* metaindex_contents,
|
|
MemoryAllocator* memory_allocator,
|
|
FilePrefetchBuffer* prefetch_buffer,
|
|
Footer* footer_out) {
|
|
Footer footer;
|
|
IOOptions opts;
|
|
IODebugContext dbg;
|
|
Status s;
|
|
s = file->PrepareIOOptions(read_options, opts, &dbg);
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
s = ReadFooterFromFile(opts, file, *ioptions.fs, prefetch_buffer, file_size,
|
|
&footer, table_magic_number, ioptions.stats);
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
if (footer_out) {
|
|
*footer_out = footer;
|
|
}
|
|
|
|
auto metaindex_handle = footer.metaindex_handle();
|
|
return BlockFetcher(file, prefetch_buffer, footer, read_options,
|
|
metaindex_handle, metaindex_contents, ioptions,
|
|
false /* do decompression */, false /*maybe_compressed*/,
|
|
BlockType::kMetaIndex, nullptr /*decompressor*/,
|
|
PersistentCacheOptions::kEmpty, memory_allocator)
|
|
.ReadBlockContents();
|
|
}
|
|
|
|
Status FindMetaBlockInFile(
|
|
RandomAccessFileReader* file, uint64_t file_size,
|
|
uint64_t table_magic_number, const ImmutableOptions& ioptions,
|
|
const ReadOptions& read_options, const std::string& meta_block_name,
|
|
BlockHandle* block_handle, MemoryAllocator* memory_allocator,
|
|
FilePrefetchBuffer* prefetch_buffer, Footer* footer_out) {
|
|
BlockContents metaindex_contents;
|
|
auto s = ReadMetaIndexBlockInFile(
|
|
file, file_size, table_magic_number, ioptions, read_options,
|
|
&metaindex_contents, memory_allocator, prefetch_buffer, footer_out);
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
// meta blocks are never compressed. Need to add uncompress logic if we are to
|
|
// compress it.
|
|
Block metaindex_block(std::move(metaindex_contents));
|
|
|
|
std::unique_ptr<InternalIterator> meta_iter;
|
|
meta_iter.reset(metaindex_block.NewMetaIterator());
|
|
|
|
return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
|
|
}
|
|
|
|
Status ReadMetaBlock(RandomAccessFileReader* file,
|
|
FilePrefetchBuffer* prefetch_buffer, uint64_t file_size,
|
|
uint64_t table_magic_number,
|
|
const ImmutableOptions& ioptions,
|
|
const ReadOptions& read_options,
|
|
const std::string& meta_block_name, BlockType block_type,
|
|
BlockContents* contents,
|
|
MemoryAllocator* memory_allocator) {
|
|
// TableProperties requires special handling because of checksum issues.
|
|
// Call ReadTableProperties instead for that case.
|
|
assert(block_type != BlockType::kProperties);
|
|
|
|
BlockHandle block_handle;
|
|
Footer footer;
|
|
Status status =
|
|
FindMetaBlockInFile(file, file_size, table_magic_number, ioptions,
|
|
read_options, meta_block_name, &block_handle,
|
|
memory_allocator, prefetch_buffer, &footer);
|
|
if (!status.ok()) {
|
|
return status;
|
|
}
|
|
|
|
return BlockFetcher(file, prefetch_buffer, footer, read_options, block_handle,
|
|
contents, ioptions, false /* decompress */,
|
|
false /*maybe_compressed*/, block_type,
|
|
nullptr /*decompressor*/, PersistentCacheOptions::kEmpty,
|
|
memory_allocator)
|
|
.ReadBlockContents();
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|