forked from continuwuation/rocksdb
Summary: When ingesting a DB-generated file with non-zero sequence numbers, we need the smallest seqno of each file for the file metadata. To avoid a full table scan, we record this information in a table property and use it during file ingestion. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13942 Test Plan: new unit test and updated existing unit test. Reviewed By: hx235 Differential Revision: D82331802 Pulled By: cbi42 fbshipit-source-id: 3009a6801ca7092cd0fde33692db1a13567068a9
660 lines
26 KiB
C++
660 lines
26 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
#include "table/meta_blocks.h"
|
|
|
|
#include <map>
|
|
#include <string>
|
|
|
|
#include "block_fetcher.h"
|
|
#include "db/table_properties_collector.h"
|
|
#include "file/random_access_file_reader.h"
|
|
#include "logging/logging.h"
|
|
#include "rocksdb/options.h"
|
|
#include "rocksdb/table.h"
|
|
#include "rocksdb/table_properties.h"
|
|
#include "table/block_based/block.h"
|
|
#include "table/block_based/reader_common.h"
|
|
#include "table/format.h"
|
|
#include "table/internal_iterator.h"
|
|
#include "table/persistent_cache_helper.h"
|
|
#include "table/sst_file_writer_collectors.h"
|
|
#include "table/table_properties_internal.h"
|
|
#include "test_util/sync_point.h"
|
|
#include "util/coding.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Key under which the properties block is looked up in the meta-index.
const std::string kPropertiesBlockName{"rocksdb.properties"};
// NB: only used with format_version >= 6
const std::string kIndexBlockName{"rocksdb.index"};
// Legacy name of the properties block, kept for backward compatibility.
const std::string kPropertiesBlockOldName{"rocksdb.stats"};
const std::string kCompressionDictBlockName{"rocksdb.compression_dict"};
const std::string kRangeDelBlockName{"rocksdb.range_del"};
|
|
|
|
MetaIndexBuilder::MetaIndexBuilder()
|
|
: meta_index_block_(new BlockBuilder(1 /* restart interval */)) {}
|
|
|
|
// Records `handle` (in its encoded form) under `key`. Entries are buffered in
// meta_block_handles_ and only written to the block in Finish(). An entry
// that already exists for `key` is left untouched.
void MetaIndexBuilder::Add(const std::string& key, const BlockHandle& handle) {
  std::string encoded;
  handle.EncodeTo(&encoded);
  meta_block_handles_.emplace(key, encoded);
}
|
|
|
|
// Feeds every buffered (name, encoded handle) pair into the meta-index block
// builder, in the iteration order of meta_block_handles_, and returns the
// finished block.
Slice MetaIndexBuilder::Finish() {
  for (const auto& [block_name, encoded_handle] : meta_block_handles_) {
    meta_index_block_->Add(block_name, encoded_handle);
  }
  return meta_index_block_->Finish();
}
|
|
|
|
// Property block will be read sequentially and cached in a heap located
|
|
// object, so there's no need for restart points. Thus we set the restart
|
|
// interval to infinity to save space.
|
|
PropertyBlockBuilder::PropertyBlockBuilder()
|
|
: properties_block_(new BlockBuilder(
|
|
std::numeric_limits<int32_t>::max() /* restart interval */)) {}
|
|
|
|
// Buffers a raw string property. Each name is expected to be added at most
// once; debug builds assert on a duplicate, release builds keep the first
// value.
void PropertyBlockBuilder::Add(const std::string& name,
                               const std::string& val) {
  assert(props_.count(name) == 0);
  props_.emplace(name, val);
}
|
|
|
|
void PropertyBlockBuilder::Add(const std::string& name, uint64_t val) {
|
|
std::string dst;
|
|
PutVarint64(&dst, val);
|
|
|
|
Add(name, dst);
|
|
}
|
|
|
|
void PropertyBlockBuilder::Add(
|
|
const UserCollectedProperties& user_collected_properties) {
|
|
for (const auto& prop : user_collected_properties) {
|
|
Add(prop.first, prop.second);
|
|
}
|
|
}
|
|
|
|
void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) {
|
|
TEST_SYNC_POINT_CALLBACK("PropertyBlockBuilder::AddTableProperty:Start",
|
|
const_cast<TableProperties*>(&props));
|
|
|
|
Add(TablePropertiesNames::kOriginalFileNumber, props.orig_file_number);
|
|
Add(TablePropertiesNames::kRawKeySize, props.raw_key_size);
|
|
Add(TablePropertiesNames::kRawValueSize, props.raw_value_size);
|
|
Add(TablePropertiesNames::kDataSize, props.data_size);
|
|
Add(TablePropertiesNames::kIndexSize, props.index_size);
|
|
if (props.index_partitions != 0) {
|
|
Add(TablePropertiesNames::kIndexPartitions, props.index_partitions);
|
|
Add(TablePropertiesNames::kTopLevelIndexSize, props.top_level_index_size);
|
|
}
|
|
Add(TablePropertiesNames::kIndexKeyIsUserKey, props.index_key_is_user_key);
|
|
Add(TablePropertiesNames::kIndexValueIsDeltaEncoded,
|
|
props.index_value_is_delta_encoded);
|
|
Add(TablePropertiesNames::kNumEntries, props.num_entries);
|
|
Add(TablePropertiesNames::kNumFilterEntries, props.num_filter_entries);
|
|
Add(TablePropertiesNames::kDeletedKeys, props.num_deletions);
|
|
Add(TablePropertiesNames::kMergeOperands, props.num_merge_operands);
|
|
Add(TablePropertiesNames::kNumRangeDeletions, props.num_range_deletions);
|
|
Add(TablePropertiesNames::kNumDataBlocks, props.num_data_blocks);
|
|
Add(TablePropertiesNames::kFilterSize, props.filter_size);
|
|
Add(TablePropertiesNames::kFormatVersion, props.format_version);
|
|
Add(TablePropertiesNames::kFixedKeyLen, props.fixed_key_len);
|
|
Add(TablePropertiesNames::kColumnFamilyId, props.column_family_id);
|
|
Add(TablePropertiesNames::kCreationTime, props.creation_time);
|
|
Add(TablePropertiesNames::kOldestKeyTime, props.oldest_key_time);
|
|
Add(TablePropertiesNames::kNewestKeyTime, props.newest_key_time);
|
|
if (props.file_creation_time > 0) {
|
|
Add(TablePropertiesNames::kFileCreationTime, props.file_creation_time);
|
|
}
|
|
if (props.slow_compression_estimated_data_size > 0) {
|
|
Add(TablePropertiesNames::kSlowCompressionEstimatedDataSize,
|
|
props.slow_compression_estimated_data_size);
|
|
}
|
|
if (props.fast_compression_estimated_data_size > 0) {
|
|
Add(TablePropertiesNames::kFastCompressionEstimatedDataSize,
|
|
props.fast_compression_estimated_data_size);
|
|
}
|
|
Add(TablePropertiesNames::kTailStartOffset, props.tail_start_offset);
|
|
if (props.user_defined_timestamps_persisted == 0) {
|
|
Add(TablePropertiesNames::kUserDefinedTimestampsPersisted,
|
|
props.user_defined_timestamps_persisted);
|
|
}
|
|
if (!props.db_id.empty()) {
|
|
Add(TablePropertiesNames::kDbId, props.db_id);
|
|
}
|
|
if (!props.db_session_id.empty()) {
|
|
Add(TablePropertiesNames::kDbSessionId, props.db_session_id);
|
|
}
|
|
if (!props.db_host_id.empty()) {
|
|
Add(TablePropertiesNames::kDbHostId, props.db_host_id);
|
|
}
|
|
|
|
if (!props.filter_policy_name.empty()) {
|
|
Add(TablePropertiesNames::kFilterPolicy, props.filter_policy_name);
|
|
}
|
|
if (!props.comparator_name.empty()) {
|
|
Add(TablePropertiesNames::kComparator, props.comparator_name);
|
|
}
|
|
|
|
if (!props.merge_operator_name.empty()) {
|
|
Add(TablePropertiesNames::kMergeOperator, props.merge_operator_name);
|
|
}
|
|
if (!props.prefix_extractor_name.empty()) {
|
|
Add(TablePropertiesNames::kPrefixExtractorName,
|
|
props.prefix_extractor_name);
|
|
}
|
|
if (!props.property_collectors_names.empty()) {
|
|
Add(TablePropertiesNames::kPropertyCollectors,
|
|
props.property_collectors_names);
|
|
}
|
|
if (!props.column_family_name.empty()) {
|
|
Add(TablePropertiesNames::kColumnFamilyName, props.column_family_name);
|
|
}
|
|
|
|
if (!props.compression_name.empty()) {
|
|
Add(TablePropertiesNames::kCompression, props.compression_name);
|
|
}
|
|
if (!props.compression_options.empty()) {
|
|
Add(TablePropertiesNames::kCompressionOptions, props.compression_options);
|
|
}
|
|
if (!props.seqno_to_time_mapping.empty()) {
|
|
Add(TablePropertiesNames::kSequenceNumberTimeMapping,
|
|
props.seqno_to_time_mapping);
|
|
}
|
|
if (props.key_largest_seqno != UINT64_MAX) {
|
|
Add(TablePropertiesNames::kKeyLargestSeqno, props.key_largest_seqno);
|
|
}
|
|
if (props.key_smallest_seqno != UINT64_MAX) {
|
|
Add(TablePropertiesNames::kKeySmallestSeqno, props.key_smallest_seqno);
|
|
}
|
|
}
|
|
|
|
// Writes all buffered properties into the block builder and returns the
// finished properties block. Debug builds additionally assert that the names
// are added in strictly increasing order according to comparator_.
Slice PropertyBlockBuilder::Finish() {
  for (const auto& [prop_name, prop_value] : props_) {
    assert(last_prop_added_to_block_.empty() ||
           comparator_->Compare(prop_name, last_prop_added_to_block_) > 0);
    properties_block_->Add(prop_name, prop_value);
#ifndef NDEBUG
    // Remembered only for the ordering assertion above.
    last_prop_added_to_block_ = prop_name;
#endif /* !NDEBUG */
  }

  return properties_block_->Finish();
}
|
|
|
|
// Logs, at error level, that TablePropertiesCollector::<method>() failed for
// the collector called `name`. `method` must be "Add" or "Finish".
void LogPropertiesCollectionError(Logger* info_log, const std::string& method,
                                  const std::string& name) {
  assert(method == "Add" || method == "Finish");

  std::string message(
      "Encountered error when calling TablePropertiesCollector::");
  message.append(method);
  message.append("() with collector name: ");
  message.append(name);
  ROCKS_LOG_ERROR(info_log, "%s", message.c_str());
}
|
|
|
|
bool NotifyCollectTableCollectorsOnAdd(
|
|
const Slice& key, const Slice& value, uint64_t file_size,
|
|
const std::vector<std::unique_ptr<InternalTblPropColl>>& collectors,
|
|
Logger* info_log) {
|
|
bool all_succeeded = true;
|
|
for (auto& collector : collectors) {
|
|
Status s = collector->InternalAdd(key, value, file_size);
|
|
all_succeeded = all_succeeded && s.ok();
|
|
if (!s.ok()) {
|
|
LogPropertiesCollectionError(info_log, "Add" /* method */,
|
|
collector->Name());
|
|
}
|
|
}
|
|
return all_succeeded;
|
|
}
|
|
|
|
void NotifyCollectTableCollectorsOnBlockAdd(
|
|
const std::vector<std::unique_ptr<InternalTblPropColl>>& collectors,
|
|
const uint64_t block_uncomp_bytes,
|
|
const uint64_t block_compressed_bytes_fast,
|
|
const uint64_t block_compressed_bytes_slow) {
|
|
for (auto& collector : collectors) {
|
|
collector->BlockAdd(block_uncomp_bytes, block_compressed_bytes_fast,
|
|
block_compressed_bytes_slow);
|
|
}
|
|
}
|
|
|
|
bool NotifyCollectTableCollectorsOnFinish(
|
|
const std::vector<std::unique_ptr<InternalTblPropColl>>& collectors,
|
|
Logger* info_log, PropertyBlockBuilder* builder,
|
|
UserCollectedProperties& user_collected_properties,
|
|
UserCollectedProperties& readable_properties) {
|
|
bool all_succeeded = true;
|
|
for (auto& collector : collectors) {
|
|
UserCollectedProperties user_properties;
|
|
Status s = collector->Finish(&user_properties);
|
|
if (s.ok()) {
|
|
for (const auto& prop : collector->GetReadableProperties()) {
|
|
readable_properties.insert(prop);
|
|
}
|
|
#ifndef NDEBUG
|
|
// Check different user properties collectors are not adding properties of
|
|
// the same name.
|
|
for (const auto& pair : user_properties) {
|
|
assert(user_collected_properties.find(pair.first) ==
|
|
user_collected_properties.end());
|
|
}
|
|
#endif /* !NDEBUG */
|
|
user_collected_properties.merge(user_properties);
|
|
} else {
|
|
LogPropertiesCollectionError(info_log, "Finish" /* method */,
|
|
collector->Name());
|
|
if (all_succeeded) {
|
|
all_succeeded = false;
|
|
}
|
|
}
|
|
}
|
|
builder->Add(user_collected_properties);
|
|
return all_succeeded;
|
|
}
|
|
|
|
// Walks the entries of `properties_block` and fills `*new_table_properties`.
//
// - Names listed in the uint64 table are varint64-decoded into the matching
//   field; a malformed value is logged and skipped.
// - Names listed in the string table are copied verbatim into the matching
//   field.
// - Everything else is stored in user_collected_properties.
//
// `offset` is added to the in-block value offset of the global seqno property
// to produce external_sst_file_global_seqno_offset.
//
// Returns Corruption if the keys are not strictly increasing; otherwise the
// last status observed while the iterator was valid.
Status ParsePropertiesBlock(
    const ImmutableOptions& ioptions, uint64_t offset, Block& properties_block,
    std::unique_ptr<TableProperties>& new_table_properties) {
  using Names = TablePropertiesNames;

  std::unique_ptr<MetaBlockIter> iter(properties_block.NewMetaIterator());
  TableProperties& parsed = *new_table_properties;

  // All pre-defined properties of type uint64_t
  const std::unordered_map<std::string, uint64_t*> uint64_fields = {
      {Names::kOriginalFileNumber, &parsed.orig_file_number},
      {Names::kDataSize, &parsed.data_size},
      {Names::kIndexSize, &parsed.index_size},
      {Names::kIndexPartitions, &parsed.index_partitions},
      {Names::kTopLevelIndexSize, &parsed.top_level_index_size},
      {Names::kIndexKeyIsUserKey, &parsed.index_key_is_user_key},
      {Names::kIndexValueIsDeltaEncoded,
       &parsed.index_value_is_delta_encoded},
      {Names::kFilterSize, &parsed.filter_size},
      {Names::kRawKeySize, &parsed.raw_key_size},
      {Names::kRawValueSize, &parsed.raw_value_size},
      {Names::kNumDataBlocks, &parsed.num_data_blocks},
      {Names::kNumEntries, &parsed.num_entries},
      {Names::kNumFilterEntries, &parsed.num_filter_entries},
      {Names::kDeletedKeys, &parsed.num_deletions},
      {Names::kMergeOperands, &parsed.num_merge_operands},
      {Names::kNumRangeDeletions, &parsed.num_range_deletions},
      {Names::kFormatVersion, &parsed.format_version},
      {Names::kFixedKeyLen, &parsed.fixed_key_len},
      {Names::kColumnFamilyId, &parsed.column_family_id},
      {Names::kCreationTime, &parsed.creation_time},
      {Names::kOldestKeyTime, &parsed.oldest_key_time},
      {Names::kNewestKeyTime, &parsed.newest_key_time},
      {Names::kFileCreationTime, &parsed.file_creation_time},
      {Names::kSlowCompressionEstimatedDataSize,
       &parsed.slow_compression_estimated_data_size},
      {Names::kFastCompressionEstimatedDataSize,
       &parsed.fast_compression_estimated_data_size},
      {Names::kTailStartOffset, &parsed.tail_start_offset},
      {Names::kUserDefinedTimestampsPersisted,
       &parsed.user_defined_timestamps_persisted},
      {Names::kKeyLargestSeqno, &parsed.key_largest_seqno},
      {Names::kKeySmallestSeqno, &parsed.key_smallest_seqno},
  };

  // All pre-defined properties stored as raw strings
  const std::unordered_map<std::string, std::string*> string_fields = {
      {Names::kDbId, &parsed.db_id},
      {Names::kDbSessionId, &parsed.db_session_id},
      {Names::kDbHostId, &parsed.db_host_id},
      {Names::kFilterPolicy, &parsed.filter_policy_name},
      {Names::kColumnFamilyName, &parsed.column_family_name},
      {Names::kComparator, &parsed.comparator_name},
      {Names::kMergeOperator, &parsed.merge_operator_name},
      {Names::kPrefixExtractorName, &parsed.prefix_extractor_name},
      {Names::kPropertyCollectors, &parsed.property_collectors_names},
      {Names::kCompression, &parsed.compression_name},
      {Names::kCompressionOptions, &parsed.compression_options},
      {Names::kSequenceNumberTimeMapping, &parsed.seqno_to_time_mapping},
  };

  Status s;
  std::string prev_key;
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    s = iter->status();
    if (!s.ok()) {
      break;
    }

    const std::string key = iter->key().ToString();
    // properties block should be strictly sorted with no duplicate key.
    if (!prev_key.empty() &&
        BytewiseComparator()->Compare(key, prev_key) <= 0) {
      s = Status::Corruption("properties unsorted");
      break;
    }
    prev_key = key;

    Slice raw_val = iter->value();

    // The global seqno property is additionally stored as a user-collected
    // property by the fall-through at the bottom of the loop.
    if (key == ExternalSstFilePropertyNames::kGlobalSeqno) {
      parsed.external_sst_file_global_seqno_offset =
          offset + iter->ValueOffset();
    }

    const auto uint64_it = uint64_fields.find(key);
    if (uint64_it != uint64_fields.end()) {
      if (key == Names::kDeletedKeys || key == Names::kMergeOperands) {
        // Insert in user-collected properties for API backwards compatibility
        parsed.user_collected_properties.insert({key, raw_val.ToString()});
      }
      // handle predefined rocksdb properties
      uint64_t val = 0;
      if (GetVarint64(&raw_val, &val)) {
        *(uint64_it->second) = val;
      } else {
        // skip malformed value
        std::string error_msg =
            "Detect malformed value in properties meta-block:"
            "\tkey: ";
        error_msg += key;
        error_msg += "\tval: ";
        error_msg += raw_val.ToString();
        ROCKS_LOG_ERROR(ioptions.logger, "%s", error_msg.c_str());
      }
      continue;
    }

    const auto string_it = string_fields.find(key);
    if (string_it != string_fields.end()) {
      *(string_it->second) = raw_val.ToString();
      continue;
    }

    // handle user-collected properties
    parsed.user_collected_properties.insert({key, raw_val.ToString()});
  }

  return s;
}
|
|
|
|
// FIXME: should be a parameter for reading table properties to use persistent
|
|
// cache?
|
|
Status ReadTablePropertiesHelper(
|
|
const ReadOptions& ro, const BlockHandle& handle,
|
|
RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer,
|
|
const Footer& footer, const ImmutableOptions& ioptions,
|
|
std::unique_ptr<TableProperties>* table_properties,
|
|
MemoryAllocator* memory_allocator) {
|
|
assert(table_properties);
|
|
|
|
Status s;
|
|
bool retry = false;
|
|
while (true) {
|
|
BlockContents block_contents;
|
|
size_t len = handle.size() + footer.GetBlockTrailerSize();
|
|
// If this is an external SST file ingested with write_global_seqno set to
|
|
// true, then we expect the checksum mismatch because checksum was written
|
|
// by SstFileWriter, but its global seqno in the properties block may have
|
|
// been changed during ingestion. For this reason, we initially read
|
|
// and process without checksum verification, then later try checksum
|
|
// verification so that if it fails, we can copy to a temporary buffer with
|
|
// global seqno set to its original value, i.e. 0, and attempt checksum
|
|
// verification again.
|
|
if (!retry) {
|
|
ReadOptions modified_ro = ro;
|
|
modified_ro.verify_checksums = false;
|
|
BlockFetcher block_fetcher(
|
|
file, prefetch_buffer, footer, modified_ro, handle, &block_contents,
|
|
ioptions, false /* decompress */, false /*maybe_compressed*/,
|
|
BlockType::kProperties, nullptr /*decompressor*/,
|
|
PersistentCacheOptions::kEmpty, memory_allocator);
|
|
s = block_fetcher.ReadBlockContents();
|
|
if (!s.ok()) {
|
|
return s;
|
|
}
|
|
assert(block_fetcher.GetBlockSizeWithTrailer() == len);
|
|
TEST_SYNC_POINT_CALLBACK("ReadTablePropertiesHelper:0",
|
|
&block_contents.data);
|
|
} else {
|
|
assert(s.IsCorruption());
|
|
// If retrying, use a stronger file system read to check and correct
|
|
// data corruption
|
|
IOOptions opts;
|
|
IODebugContext dbg;
|
|
if (PrepareIOFromReadOptions(ro, ioptions.clock, opts, &dbg) !=
|
|
IOStatus::OK()) {
|
|
return s;
|
|
}
|
|
opts.verify_and_reconstruct_read = true;
|
|
std::unique_ptr<char[]> data(new char[len]);
|
|
Slice result;
|
|
IOStatus io_s = file->Read(opts, handle.offset(), len, &result,
|
|
data.get(), nullptr, &dbg);
|
|
RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_COUNT);
|
|
if (!io_s.ok()) {
|
|
ROCKS_LOG_INFO(ioptions.info_log,
|
|
"Reading properties block failed - %s",
|
|
io_s.ToString().c_str());
|
|
// Return the original corruption error as that's more serious
|
|
return s;
|
|
}
|
|
if (result.size() < len) {
|
|
return Status::Corruption("Reading properties block failed - " +
|
|
std::to_string(result.size()) +
|
|
" bytes read");
|
|
}
|
|
RecordTick(ioptions.stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
|
|
block_contents = BlockContents(std::move(data), handle.size());
|
|
}
|
|
|
|
uint64_t block_size = block_contents.data.size();
|
|
Block properties_block(std::move(block_contents));
|
|
std::unique_ptr<TableProperties> new_table_properties{new TableProperties};
|
|
s = ParsePropertiesBlock(ioptions, handle.offset(), properties_block,
|
|
new_table_properties);
|
|
|
|
// Modified version of BlockFetcher checksum verification
|
|
// (See write_global_seqno comment above)
|
|
if (s.ok() && footer.GetBlockTrailerSize() > 0) {
|
|
s = VerifyBlockChecksum(footer, properties_block.data(), block_size,
|
|
file->file_name(), handle.offset());
|
|
if (s.IsCorruption()) {
|
|
if (new_table_properties->external_sst_file_global_seqno_offset != 0) {
|
|
std::string tmp_buf(properties_block.data(), len);
|
|
uint64_t global_seqno_offset =
|
|
new_table_properties->external_sst_file_global_seqno_offset -
|
|
handle.offset();
|
|
EncodeFixed64(&tmp_buf[static_cast<size_t>(global_seqno_offset)], 0);
|
|
s = VerifyBlockChecksum(footer, tmp_buf.data(), block_size,
|
|
file->file_name(), handle.offset());
|
|
}
|
|
}
|
|
}
|
|
|
|
// If we detected a corruption and the file system supports verification
|
|
// and reconstruction, retry the read
|
|
if (s.IsCorruption() && !retry &&
|
|
CheckFSFeatureSupport(ioptions.fs.get(),
|
|
FSSupportedOps::kVerifyAndReconstructRead)) {
|
|
retry = true;
|
|
} else {
|
|
if (s.ok()) {
|
|
*table_properties = std::move(new_table_properties);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
return s;
|
|
}
|
|
|
|
// Locates the properties meta block of `file` and parses it into
// `*properties`. Any error from the lookup is returned as is; NotFound is
// returned if the lookup succeeds but yields a null handle.
Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size,
                           uint64_t table_magic_number,
                           const ImmutableOptions& ioptions,
                           const ReadOptions& read_options,
                           std::unique_ptr<TableProperties>* properties,
                           MemoryAllocator* memory_allocator,
                           FilePrefetchBuffer* prefetch_buffer) {
  BlockHandle props_handle;
  Footer footer;
  Status lookup_status = FindMetaBlockInFile(
      file, file_size, table_magic_number, ioptions, read_options,
      kPropertiesBlockName, &props_handle, memory_allocator, prefetch_buffer,
      &footer);
  if (!lookup_status.ok()) {
    return lookup_status;
  }

  if (props_handle.IsNull()) {
    return Status::NotFound();
  }
  return ReadTablePropertiesHelper(read_options, props_handle, file,
                                   prefetch_buffer, footer, ioptions,
                                   properties, memory_allocator);
}
|
|
|
|
// Looks up `meta_block_name` in the meta-index and decodes its handle into
// `*block_handle`. When the properties block is requested and not found under
// its current name, the old name is tried as well.
//
// If no matching entry exists, `*block_handle` is set to the null handle and
// the iterator's status is returned, so a missing block by itself is not an
// error.
Status FindOptionalMetaBlock(InternalIterator* meta_index_iter,
                             const std::string& meta_block_name,
                             BlockHandle* block_handle) {
  assert(block_handle != nullptr);

  // True if the iterator currently sits exactly on `name`.
  const auto positioned_at = [meta_index_iter](const std::string& name) {
    return meta_index_iter->Valid() && meta_index_iter->key() == name;
  };

  bool found = false;
  meta_index_iter->Seek(meta_block_name);
  if (meta_index_iter->status().ok()) {
    found = positioned_at(meta_block_name);
    if (!found && meta_block_name == kPropertiesBlockName) {
      // Have to try old name for compatibility
      meta_index_iter->Seek(kPropertiesBlockOldName);
      found = meta_index_iter->status().ok() &&
              positioned_at(kPropertiesBlockOldName);
    }
  }

  if (found) {
    Slice encoded_handle = meta_index_iter->value();
    return block_handle->DecodeFrom(&encoded_handle);
  }

  *block_handle = BlockHandle::NullBlockHandle();
  return meta_index_iter->status();
}
|
|
|
|
// Same lookup as FindOptionalMetaBlock(), except that a missing block is
// reported as Corruption instead of an OK status with a null handle.
Status FindMetaBlock(InternalIterator* meta_index_iter,
                     const std::string& meta_block_name,
                     BlockHandle* block_handle) {
  Status lookup_status =
      FindOptionalMetaBlock(meta_index_iter, meta_block_name, block_handle);
  if (!lookup_status.ok() || !block_handle->IsNull()) {
    return lookup_status;
  }
  return Status::Corruption("Cannot find the meta block", meta_block_name);
}
|
|
|
|
// Reads the footer of `file` and then the meta-index block it points to,
// storing the raw block in `*metaindex_contents`. If `footer_out` is non-null
// it receives a copy of the footer once the footer has been read
// successfully.
Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file,
                                uint64_t file_size, uint64_t table_magic_number,
                                const ImmutableOptions& ioptions,
                                const ReadOptions& read_options,
                                BlockContents* metaindex_contents,
                                MemoryAllocator* memory_allocator,
                                FilePrefetchBuffer* prefetch_buffer,
                                Footer* footer_out) {
  IOOptions io_opts;
  IODebugContext dbg;
  Status s = file->PrepareIOOptions(read_options, io_opts, &dbg);
  if (!s.ok()) {
    return s;
  }

  Footer footer;
  s = ReadFooterFromFile(io_opts, file, *ioptions.fs, prefetch_buffer,
                         file_size, &footer, table_magic_number,
                         ioptions.stats);
  if (!s.ok()) {
    return s;
  }
  if (footer_out != nullptr) {
    *footer_out = footer;
  }

  const auto metaindex_handle = footer.metaindex_handle();
  BlockFetcher metaindex_fetcher(
      file, prefetch_buffer, footer, read_options, metaindex_handle,
      metaindex_contents, ioptions, false /* do decompression */,
      false /*maybe_compressed*/, BlockType::kMetaIndex,
      nullptr /*decompressor*/, PersistentCacheOptions::kEmpty,
      memory_allocator);
  return metaindex_fetcher.ReadBlockContents();
}
|
|
|
|
// Reads the meta-index block of `file` and looks up `meta_block_name` in it,
// writing the resulting handle to `*block_handle`. A missing block is
// reported by FindMetaBlock() as Corruption. `footer_out`, if non-null,
// receives the file footer.
Status FindMetaBlockInFile(
    RandomAccessFileReader* file, uint64_t file_size,
    uint64_t table_magic_number, const ImmutableOptions& ioptions,
    const ReadOptions& read_options, const std::string& meta_block_name,
    BlockHandle* block_handle, MemoryAllocator* memory_allocator,
    FilePrefetchBuffer* prefetch_buffer, Footer* footer_out) {
  BlockContents raw_metaindex;
  Status read_status = ReadMetaIndexBlockInFile(
      file, file_size, table_magic_number, ioptions, read_options,
      &raw_metaindex, memory_allocator, prefetch_buffer, footer_out);
  if (!read_status.ok()) {
    return read_status;
  }

  // meta blocks are never compressed. Need to add uncompress logic if we are
  // to compress it.
  Block metaindex_block(std::move(raw_metaindex));
  const std::unique_ptr<InternalIterator> meta_iter(
      metaindex_block.NewMetaIterator());

  return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle);
}
|
|
|
|
// Finds the meta block called `meta_block_name` in `file` and reads its raw,
// undecompressed contents into `*contents`. Must not be used for the
// properties block (asserted below).
Status ReadMetaBlock(RandomAccessFileReader* file,
                     FilePrefetchBuffer* prefetch_buffer, uint64_t file_size,
                     uint64_t table_magic_number,
                     const ImmutableOptions& ioptions,
                     const ReadOptions& read_options,
                     const std::string& meta_block_name, BlockType block_type,
                     BlockContents* contents,
                     MemoryAllocator* memory_allocator) {
  // TableProperties requires special handling because of checksum issues.
  // Call ReadTableProperties instead for that case.
  assert(block_type != BlockType::kProperties);

  BlockHandle meta_handle;
  Footer footer;
  Status lookup_status = FindMetaBlockInFile(
      file, file_size, table_magic_number, ioptions, read_options,
      meta_block_name, &meta_handle, memory_allocator, prefetch_buffer,
      &footer);
  if (!lookup_status.ok()) {
    return lookup_status;
  }

  BlockFetcher meta_fetcher(file, prefetch_buffer, footer, read_options,
                            meta_handle, contents, ioptions,
                            false /* decompress */, false /*maybe_compressed*/,
                            block_type, nullptr /*decompressor*/,
                            PersistentCacheOptions::kEmpty, memory_allocator);
  return meta_fetcher.ReadBlockContents();
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|