rocksdb/utilities/trie_index/trie_index_factory.cc
zaidoon ec22903914 Support all operation types in User Defined Index (UDI) interface (#14399)
Summary:
Remove the restriction that limited UDI to ingest-only, Puts-only use cases. This enables UDI plugins (including the trie index from https://github.com/facebook/rocksdb/issues/14310) to work with all operation types: Put, Delete, Merge, SingleDelete, PutEntity, etc.

Part of https://github.com/facebook/rocksdb/issues/12396

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14399

Reviewed By: pdillinger

Differential Revision: D96392636

Pulled By: xingbowang

fbshipit-source-id: 0f1e6c38531fa72539a0e2c6a3dffff333392b4c
2026-03-16 15:25:05 -07:00

567 lines
23 KiB
C++

// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "utilities/trie_index/trie_index_factory.h"
#include <algorithm>
#include <cassert>
#include <cstring>
#include "db/dbformat.h"
#include "rocksdb/comparator.h"
#include "util/coding.h"
namespace ROCKSDB_NAMESPACE {
namespace trie_index {
// ============================================================================
// TrieIndexBuilder
// ============================================================================
TrieIndexBuilder::TrieIndexBuilder(const Comparator* comparator)
: comparator_(comparator),
finished_(false),
must_use_separator_with_seq_(false) {}
Slice TrieIndexBuilder::AddIndexEntry(const Slice& last_key_in_current_block,
const Slice* first_key_in_next_block,
const BlockHandle& block_handle,
std::string* separator_scratch,
const IndexEntryContext& context) {
SequenceNumber last_key_seq = context.last_key_seq;
// Compute a short separator between the two user keys using the
// comparator. FindShortestSeparator takes `*start` as both input and output:
// input: *start == last_key_in_current_block
// output: *start modified to shortest string in [start, limit)
// If first_key_in_next_block is nullptr, this is the last block — use a
// short successor of the last key.
Slice separator;
// True when last_key and first_key_in_next_block are the same user key
// (same-user-key block boundary). Computed once and reused below for
// both the sticky flag and the per-entry seqno decision.
bool same_user_key = false;
if (first_key_in_next_block != nullptr) {
same_user_key = comparator_->Compare(last_key_in_current_block,
*first_key_in_next_block) == 0;
*separator_scratch = last_key_in_current_block.ToString();
comparator_->FindShortestSeparator(separator_scratch,
*first_key_in_next_block);
separator = Slice(*separator_scratch);
// Detect same-user-key block boundary: if the two user keys are identical,
// FindShortestSeparator returns the same key for both sides, making it
// impossible to distinguish the two blocks. Set the sticky flag so that
// at Finish() time, ALL separators will include encoded seqnos.
// This mirrors ShortenedIndexBuilder::must_use_separator_with_seq_.
if (!must_use_separator_with_seq_ && same_user_key) {
must_use_separator_with_seq_ = true;
}
// Edge case: FindShortestSeparator may fail to shorten the key even when
// the user keys are different. Example: FindShortestSeparator("abc","abd")
// returns "abc" unchanged because incrementing 'c' would yield "abd" which
// is not < limit. When the resulting separator matches the previous entry's
// separator, the blocks will be grouped into the same run in Finish().
// We must mark this as a same-user-key boundary so it gets a real seqno
// rather than kMaxSequenceNumber (which would trigger the overflow block
// assertion in Finish()).
if (!same_user_key && !buffered_entries_.empty() &&
buffered_entries_.back().separator_key == *separator_scratch) {
same_user_key = true;
if (!must_use_separator_with_seq_) {
must_use_separator_with_seq_ = true;
}
}
} else {
// Last block: use the last key itself as the separator, NOT a shortened
// successor. This matches the standard ShortenedIndexBuilder behavior
// (see index_builder.h GetSeparatorWithSeq lines 278-286): it only calls
// FindShortInternalKeySuccessor when shortening_mode is
// kShortenSeparatorsAndSuccessor, which is not the default. With the
// default kShortenSeparators, the last block's separator is simply
// last_key_in_current_block.
//
// Why this matters: FindShortSuccessor can widen the key range. For
// example, if the actual last key is "9\xff\xff", FindShortSuccessor
// produces ":" (0x3A). The trie would then claim to cover keys up to
// ":", but the data block only contains keys up to "9\xff\xff". A seek
// targeting a key in that gap (e.g., "9\xff\xff\x01") would find a
// block via the trie that contains no matching data, causing iterator
// desynchronization — the trie index returns a valid block while the
// standard index correctly reports no match.
separator = last_key_in_current_block;
// Edge case: if this last block's separator matches the previous entry's
// separator, they share the same user key (same-user-key run boundary).
if (!buffered_entries_.empty() &&
comparator_->Compare(buffered_entries_.back().separator_key,
separator) == 0) {
same_user_key = true;
if (!must_use_separator_with_seq_) {
must_use_separator_with_seq_ = true;
}
}
}
// Buffer the entry for deferred trie construction in Finish().
// We buffer rather than adding to the trie immediately because the
// all-or-nothing seqno encoding decision is made at Finish() time.
TrieBlockHandle handle;
handle.offset = block_handle.offset;
handle.size = block_handle.size;
BufferedEntry entry;
entry.separator_key = separator.ToString();
// For same-user-key boundaries, use the actual seqno of the last key.
// For different-user-key boundaries, use kMaxSequenceNumber (sentinel
// meaning "this is not a same-key boundary, never advance past it").
if (same_user_key) {
entry.seqno = last_key_seq;
} else {
entry.seqno = kMaxSequenceNumber;
}
entry.handle = handle;
buffered_entries_.push_back(std::move(entry));
return separator;
}
void TrieIndexBuilder::OnKeyAdded(const Slice& /*key*/, ValueType /*type*/,
const Slice& /*value*/) {
// No-op: the trie is built from separator keys in AddIndexEntry(), not
// from individual key-value pairs.
}
Status TrieIndexBuilder::Finish(Slice* index_contents) {
if (finished_) {
return Status::InvalidArgument("TrieIndexBuilder::Finish called twice");
}
finished_ = true;
// Use seqno side-table when any same-user-key block boundary was detected.
// The must_use_separator_with_seq_ flag is set in AddIndexEntry() whenever
// the comparator finds two identical user keys at a block boundary. This
// always implies duplicate separators exist (since
// FindShortestSeparator("foo", "foo") = "foo"), so no separate scan is
// needed.
bool use_seqno = must_use_separator_with_seq_;
trie_builder_.SetHasSeqnoEncoding(use_seqno);
if (use_seqno) {
// Feed de-duplicated separators to the trie with seqno side-table metadata.
// Consecutive identical separators form a "run" — only the first occurrence
// goes into the trie (as the primary block). The remaining blocks in the
// run are stored as overflow blocks in the side-table.
//
// For non-boundary separators (different user keys), seqno is set to 0
// (sentinel = "never advance past this leaf"). kMaxSequenceNumber from
// AddIndexEntry is mapped to 0 here.
size_t i = 0;
while (i < buffered_entries_.size()) {
const auto& entry = buffered_entries_[i];
// Count how many consecutive entries share this separator key.
size_t run_start = i;
size_t run_end = i + 1;
while (run_end < buffered_entries_.size() &&
buffered_entries_[run_end].separator_key == entry.separator_key) {
run_end++;
}
uint32_t block_count = static_cast<uint32_t>(run_end - run_start);
// Map kMaxSequenceNumber (non-same-key boundary) to 0 (sentinel).
uint64_t seqno = (entry.seqno == kMaxSequenceNumber) ? 0 : entry.seqno;
// Add the primary (first) block for this separator.
trie_builder_.AddKeyWithSeqno(Slice(entry.separator_key), entry.handle,
seqno, block_count);
// Add overflow blocks (2nd, 3rd, ... in the run).
// Overflow blocks only exist within same-key runs, so their seqnos
// come from last_key_seq in AddIndexEntry (never kMaxSequenceNumber).
// The seqno may be 0 when bottommost compaction zeroes all sequence
// numbers — this is valid; see AddOverflowBlock comment.
for (size_t j = run_start + 1; j < run_end; j++) {
assert(buffered_entries_[j].seqno != kMaxSequenceNumber);
trie_builder_.AddOverflowBlock(buffered_entries_[j].handle,
buffered_entries_[j].seqno);
}
i = run_end;
}
} else {
// Common case: no same-user-key boundaries, add separators directly.
// Zero overhead — no seqno data stored.
for (const auto& entry : buffered_entries_) {
trie_builder_.AddKey(Slice(entry.separator_key), entry.handle);
}
}
// Release buffered entries — no longer needed after feeding to the trie.
buffered_entries_.clear();
buffered_entries_.shrink_to_fit();
// Always finish the trie builder, even with 0 keys — this produces a valid
// serialized trie that can be parsed by NewReader. Without this, an empty
// Slice would be returned, causing InitFromData to fail with "data too short
// for header".
trie_builder_.Finish();
*index_contents = trie_builder_.GetSerializedData();
return Status::OK();
}
// ============================================================================
// TrieIndexIterator
// ============================================================================
TrieIndexIterator::TrieIndexIterator(const LoudsTrie* trie,
const Comparator* comparator,
bool has_seqno_encoding)
: comparator_(comparator),
iter_(trie),
trie_(trie),
current_scan_idx_(0),
prepared_(false),
has_seqno_encoding_(has_seqno_encoding),
overflow_run_index_(0),
overflow_run_size_(1),
overflow_base_idx_(0) {}
void TrieIndexIterator::Prepare(const ScanOptions scan_opts[],
size_t num_opts) {
scan_opts_.clear();
scan_opts_.reserve(num_opts);
for (size_t i = 0; i < num_opts; i++) {
scan_opts_.push_back(scan_opts[i]);
}
current_scan_idx_ = 0;
prepared_ = true;
}
Status TrieIndexIterator::SeekToFirstAndGetResult(IterateResult* result) {
// Reset overflow state — SeekToFirst always lands on the primary block
// of the first trie leaf.
overflow_run_index_ = 0;
overflow_run_size_ = 1;
overflow_base_idx_ = 0;
if (!iter_.SeekToFirst()) {
result->bound_check_result = IterBoundCheck::kUnknown;
result->key = Slice();
return Status::OK();
}
result->key = iter_.Key();
current_key_scratch_ = result->key.ToString();
result->key = Slice(current_key_scratch_);
// Set up overflow state for the first leaf if seqno encoding is active.
if (has_seqno_encoding_) {
uint64_t leaf_idx = iter_.LeafIndex();
uint32_t block_count = trie_->GetLeafBlockCount(leaf_idx);
overflow_run_size_ = block_count;
overflow_base_idx_ = trie_->GetOverflowBase(leaf_idx);
}
// The very first entry is always in bounds (no target to compare against
// the limit, and the first block cannot precede any scan range).
result->bound_check_result = IterBoundCheck::kInbound;
return Status::OK();
}
Status TrieIndexIterator::SeekAndGetResult(const Slice& target,
IterateResult* result,
const SeekContext& context) {
SequenceNumber target_seq = context.target_seq;
// Advance current_scan_idx_ past any scans whose limit <= target.
// This handles the multi-scan case where the caller seeks into a later
// scan range after the previous scan returned kOutOfBound.
if (prepared_) {
while (current_scan_idx_ < scan_opts_.size()) {
const auto& opts = scan_opts_[current_scan_idx_];
if (opts.range.limit.has_value() &&
comparator_->Compare(target, opts.range.limit.value()) >= 0) {
current_scan_idx_++;
} else {
break;
}
}
}
// Reset overflow state.
overflow_run_index_ = 0;
overflow_run_size_ = 1;
overflow_base_idx_ = 0;
// Always seek with user key only — the trie stores user-key separators.
// When seqno encoding is active, post-seek correction handles the seqno.
if (!iter_.Seek(target)) {
// No leaf has a key >= target: the target is past all blocks in this SST.
// Return kUnknown (not kOutOfBound) because exhausting this SST's trie
// says nothing about the upper bound — the next SST on the level may
// still contain in-bound keys. kOutOfBound would cause LevelIterator to
// stop scanning the level prematurely.
result->bound_check_result = IterBoundCheck::kUnknown;
result->key = Slice();
return Status::OK();
}
// Set the result key (always a user key, no suffix stripping needed).
result->key = iter_.Key();
current_key_scratch_ = result->key.ToString();
result->key = Slice(current_key_scratch_);
// ---- Post-seek correction for seqno side-table ----
//
// When has_seqno_encoding_ is true, the leaf we landed on may have a
// seqno side-table entry. We use it to determine if this is the right
// block for the given (target, target_seq).
//
// The trie stores separators that are upper bounds on block contents:
// separator_key >= all keys in the block
// separator_seqno = seqno of the last key written to the block
//
// For same-user-key boundaries, the separator IS the user key. The seqno
// determines which block within a run of same-key blocks is correct:
// - If target_seq >= leaf_seqno: this is the right block (target's
// internal key <= separator's internal key, because higher seqno means
// "smaller" internal key for the same user key)
// - If target_seq < leaf_seqno: target's internal key > separator,
// so we need to advance to the next block in the run
//
// For non-boundary leaves (leaf_seqno == 0), the `leaf_seqno != 0` guard
// short-circuits before the comparison, so we never advance. This is the
// zero-overhead common path.
if (has_seqno_encoding_ && iter_.Valid()) {
uint64_t leaf_idx = iter_.LeafIndex();
uint64_t leaf_seqno = trie_->GetLeafSeqno(leaf_idx);
if (leaf_seqno != 0 && target_seq < leaf_seqno) {
// Target's internal key is AFTER the separator (lower seqno = later
// in internal key order for same user key). Advance through overflow
// blocks.
uint32_t block_count = trie_->GetLeafBlockCount(leaf_idx);
uint32_t base = trie_->GetOverflowBase(leaf_idx);
bool found = false;
for (uint32_t oi = 0; oi < block_count - 1; oi++) {
uint64_t ov_seqno = trie_->GetOverflowSeqno(base + oi);
if (ov_seqno == 0 || target_seq >= ov_seqno) {
// This overflow block is the right one.
overflow_run_index_ = oi + 1; // 1-based (0 = primary)
overflow_run_size_ = block_count;
overflow_base_idx_ = base;
found = true;
break;
}
}
if (!found) {
// target_seq is below all seqnos in this run. Advance to the next
// trie leaf (the block after the run).
if (!iter_.Next()) {
// Exhausted all blocks: target is past the end of this SST.
// Return kUnknown — see comment in Seek path above.
result->bound_check_result = IterBoundCheck::kUnknown;
result->key = Slice();
return Status::OK();
}
// Update key and overflow state for the new leaf.
result->key = iter_.Key();
current_key_scratch_ = result->key.ToString();
result->key = Slice(current_key_scratch_);
overflow_run_index_ = 0;
overflow_run_size_ = 1;
overflow_base_idx_ = 0;
// Check if the new leaf also has overflow (unlikely but possible
// with adjacent same-key runs for different user keys).
// iter_.Valid() is guaranteed here — Next() returned true above.
if (has_seqno_encoding_) {
uint64_t new_leaf = iter_.LeafIndex();
overflow_run_size_ = trie_->GetLeafBlockCount(new_leaf);
overflow_base_idx_ = trie_->GetOverflowBase(new_leaf);
}
}
} else {
// Right block (common path). Set overflow state in case this leaf
// has a run (for subsequent Next() calls).
uint32_t block_count = trie_->GetLeafBlockCount(leaf_idx);
overflow_run_index_ = 0;
overflow_run_size_ = block_count;
overflow_base_idx_ = trie_->GetOverflowBase(leaf_idx);
}
}
result->bound_check_result = CheckBounds(target);
return Status::OK();
}
Status TrieIndexIterator::NextAndGetResult(IterateResult* result) {
// Save the current separator (user key) as "previous" before advancing.
prev_key_scratch_ = current_key_scratch_;
// If we're in an overflow run and haven't exhausted it, advance within
// the run (no trie traversal needed — just increment the overflow index).
if (overflow_run_index_ + 1 < overflow_run_size_) {
overflow_run_index_++;
// The key doesn't change (same separator for all blocks in the run).
result->key = Slice(current_key_scratch_);
result->bound_check_result = CheckBounds(Slice(prev_key_scratch_));
return Status::OK();
}
// Advance to the next trie leaf.
overflow_run_index_ = 0;
overflow_run_size_ = 1;
overflow_base_idx_ = 0;
if (!iter_.Next()) {
// No more blocks: past the end of this SST.
// Return kUnknown — see comment in Seek path above.
result->bound_check_result = IterBoundCheck::kUnknown;
result->key = Slice();
return Status::OK();
}
result->key = iter_.Key();
current_key_scratch_ = result->key.ToString();
result->key = Slice(current_key_scratch_);
// Set overflow state for the new leaf.
if (has_seqno_encoding_ && iter_.Valid()) {
uint64_t leaf_idx = iter_.LeafIndex();
overflow_run_size_ = trie_->GetLeafBlockCount(leaf_idx);
overflow_base_idx_ = trie_->GetOverflowBase(leaf_idx);
}
result->bound_check_result = CheckBounds(Slice(prev_key_scratch_));
return Status::OK();
}
UserDefinedIndexBuilder::BlockHandle TrieIndexIterator::value() {
if (overflow_run_index_ == 0) {
// Primary block — use the trie leaf's handle.
auto handle = iter_.Value();
return UserDefinedIndexBuilder::BlockHandle{handle.offset, handle.size};
}
// Overflow block — use the side-table handle.
// overflow_run_index_ is 1-based, overflow array is 0-based.
uint32_t overflow_idx = overflow_base_idx_ + overflow_run_index_ - 1;
auto handle = trie_->GetOverflowHandle(overflow_idx);
return UserDefinedIndexBuilder::BlockHandle{handle.offset, handle.size};
}
IterBoundCheck TrieIndexIterator::CheckBounds(
const Slice& reference_key) const {
if (!prepared_ || scan_opts_.empty()) {
// No bounds to check — always in-bound.
return IterBoundCheck::kInbound;
}
if (current_scan_idx_ >= scan_opts_.size()) {
return IterBoundCheck::kOutOfBound;
}
const auto& opts = scan_opts_[current_scan_idx_];
// Check upper bound (limit) against the reference key, NOT the current
// separator. The trie stores separator keys (upper bounds on block
// contents), so comparing the separator against the limit would
// prematurely reject blocks that contain keys < limit.
//
// For Seek: reference_key = seek target. If target < limit, the found
// block may contain keys within bounds.
// For Next: reference_key = previous separator. If prev_sep < limit,
// the current block may contain keys within bounds.
//
// This is conservative: it may return kInbound for a block that is fully
// out of bounds. The data-level iterator handles per-key filtering.
if (opts.range.limit.has_value()) {
const Slice& limit = opts.range.limit.value();
if (comparator_->Compare(reference_key, limit) >= 0) {
return IterBoundCheck::kOutOfBound;
}
}
return IterBoundCheck::kInbound;
}
// ============================================================================
// TrieIndexReader
// ============================================================================
TrieIndexReader::TrieIndexReader(const Comparator* comparator)
: comparator_(comparator), data_size_(0) {}
Status TrieIndexReader::InitFromSlice(const Slice& data) {
data_size_ = data.size();
return trie_.InitFromData(data);
}
std::unique_ptr<UserDefinedIndexIterator> TrieIndexReader::NewIterator(
const ReadOptions& /*read_options*/) {
return std::make_unique<TrieIndexIterator>(&trie_, comparator_,
trie_.HasSeqnoEncoding());
}
size_t TrieIndexReader::ApproximateMemoryUsage() const {
// The trie uses zero-copy pointers into the serialized data for bitvectors
// and handle arrays, so the base cost is the serialized data size. On top
// of that, InitFromData() heap-allocates child position lookup tables
// (s_child_start_pos_ and s_child_end_pos_) for Select-free sparse
// traversal — 8 bytes per sparse internal node.
return data_size_ + trie_.ApproximateAuxMemoryUsage();
}
// ============================================================================
// TrieIndexFactory
// ============================================================================
Status TrieIndexFactory::NewBuilder(
const UserDefinedIndexOption& option,
std::unique_ptr<UserDefinedIndexBuilder>& builder) const {
// The trie traverses keys byte-by-byte in lexicographic order, so it
// requires a bytewise comparator. Non-bytewise comparators (e.g.,
// ReverseBytewiseComparator or custom comparators) would produce separator
// keys in a different order than the trie's byte-level traversal, causing
// incorrect Seek results.
if (option.comparator != nullptr &&
option.comparator != BytewiseComparator()) {
return Status::NotSupported(
"TrieIndexFactory requires BytewiseComparator; got: ",
option.comparator->Name());
}
// Default to BytewiseComparator when null. The trie requires a bytewise
// comparator for separator key ordering; null would cause a dereference
// crash in AddIndexEntry when comparing keys.
const Comparator* cmp =
option.comparator ? option.comparator : BytewiseComparator();
builder = std::make_unique<TrieIndexBuilder>(cmp);
return Status::OK();
}
Status TrieIndexFactory::NewReader(
const UserDefinedIndexOption& option, Slice& index_block,
std::unique_ptr<UserDefinedIndexReader>& reader) const {
const Comparator* cmp =
option.comparator ? option.comparator : BytewiseComparator();
if (cmp != BytewiseComparator()) {
return Status::NotSupported(
"TrieIndexFactory requires BytewiseComparator; got: ", cmp->Name());
}
auto trie_reader = std::make_unique<TrieIndexReader>(cmp);
Status s = trie_reader->InitFromSlice(index_block);
if (!s.ok()) {
return s;
}
reader = std::move(trie_reader);
return Status::OK();
}
} // namespace trie_index
} // namespace ROCKSDB_NAMESPACE