// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <cstdint>
#include <memory>
#include <variant>

#include "rocksdb/data_structure.h"
#include "rocksdb/db.h"
#include "rocksdb/status.h"

namespace ROCKSDB_NAMESPACE {
namespace experimental {

// Supported only for Leveled compaction
Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
                           const Slice* begin, const Slice* end);
Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end);

// DEPRECATED: this API may be removed in a future release.
// This operation can be done through CompactRange() by setting
// CompactRangeOptions::bottommost_level_compaction to
// BottommostLevelCompaction::kSkip and setting the target level.
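//
// For example, the rough equivalent of PromoteL0(db, cf, 1) is the
// following sketch (illustrative only, per the deprecation note above):
//
//   CompactRangeOptions cro;
//   cro.change_level = true;
//   cro.target_level = 1;
//   cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
//   Status s = db->CompactRange(cro, cf, /*begin=*/nullptr, /*end=*/nullptr);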
//
// Move all L0 files to target_level, skipping compaction.
// This operation succeeds only if the files in L0 have disjoint ranges; this
// is guaranteed to happen, for instance, if keys are inserted in sorted
// order. Furthermore, all levels between 1 and target_level must be empty.
// If any of the above conditions is violated, InvalidArgument will be
// returned.
Status PromoteL0(DB* db, ColumnFamilyHandle* column_family,
                 int target_level = 1);

// This function is intended to be called with the DB closed and does not write
// to the DB path. It will usually work on an open DB but may fail spuriously
// in that case, and results may be out of date on return.
Status GetFileChecksumsFromCurrentManifest(FileSystem* fs,
                                           const std::string& dbname,
                                           FileChecksumList* checksum_list);
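//
// For example (an illustrative sketch; the DB path is hypothetical):
//
//   std::unique_ptr<FileChecksumList> checksums(NewFileChecksumList());
//   Status s = experimental::GetFileChecksumsFromCurrentManifest(
//       FileSystem::Default().get(), "/path/to/db", checksums.get());
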
struct UpdateManifestForFilesStateOptions {
  // When true, read current file temperatures from FileSystem and update in
  // DB manifest when a temperature other than Unknown is reported and
  // inconsistent with manifest.
  bool update_temperatures = true;

  // TODO: new_checksums: to update files to latest file checksum algorithm
};

// Utility for updating manifest of DB directory (not open) for current state
// of files on filesystem. See UpdateManifestForFilesStateOptions.
//
// To minimize interference with ongoing DB operations, only the following
// guarantee is provided, assuming no IO error encountered:
// * Only files live in DB at start AND end of call to
//   UpdateManifestForFilesState() are guaranteed to be updated (as needed) in
//   manifest.
//   * For example, new files after start of call to
//     UpdateManifestForFilesState() might not be updated, but that is not
//     typically required to achieve goal of manifest consistency/completeness
//     (because current DB configuration would ensure new files get the desired
//     consistent metadata).
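//
// Example usage (an illustrative sketch; the DB path is hypothetical):
//
//   DBOptions db_opts;
//   std::vector<ColumnFamilyDescriptor> cfs = {
//       {kDefaultColumnFamilyName, ColumnFamilyOptions()}};
//   Status s = experimental::UpdateManifestForFilesState(
//       db_opts, "/path/to/db", cfs);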
Status UpdateManifestForFilesState(
    const DBOptions& db_opts, const std::string& db_name,
    const std::vector<ColumnFamilyDescriptor>& column_families,
    const UpdateManifestForFilesStateOptions& opts = {});

// ****************************************************************************
// EXPERIMENTAL new filtering features
// ****************************************************************************

// KeySegmentsExtractor - A class for splitting a key into meaningful pieces, or
// "segments" for filtering purposes. We say the first key segment has segment
// ordinal 0, the second has segment ordinal 1, etc. To simplify satisfying some
// filtering requirements, the segments must encompass a complete key prefix (or
// the whole key). There cannot be gaps between segments (though segments are
// allowed to be essentially unused), and segments cannot overlap.
//
// Keys can also be put in "categories" to simplify some configuration and
// handling. A "legal" key or bound is one that does not return an error (as a
// special, unused category) from the extractor. It is also allowed for all
// keys in a category to return an empty sequence of segments.
//
// To eliminate a confusing distinction between a segment that is empty vs.
// "not present" for a particular key, each key is logically associated with
// an infinite sequence of segments, including some infinite tail of 0-length
// segments. In practice, we only represent a finite sequence that (at least)
// covers the non-trivial segments.
//
// Once in production, the behavior associated with a particular GetId()
// cannot change. Introduce a new GetId() when introducing new behaviors.
// See also SstQueryFilterConfigsManager below.
//
// This feature hasn't yet been validated with user timestamp.
//
// = A SIMPLIFIED MODEL =
// Let us start with the easiest set of constraints to satisfy with a key
// segments extractor that generally allows for correct point and range
// filtering, and add complexity from there. Here we first assume
// * The column family is using the byte-wise comparator, or reverse byte-wise
// * A single category is assigned to all keys (by the extractor)
// * Using simplified criteria for legal segment extraction, the "segment
//   maximal prefix property"
//
// SEGMENT MAXIMAL PREFIX PROPERTY: The segment that a byte is assigned to can
// only depend on the bytes that come before it, not on the byte itself nor
// anything later including the full length of the key or bound.
//
// Equivalently, two keys or bounds must agree on the segment assignment of
// position i if the two keys share a common byte-wise prefix up to at least
// position i - 1 (and i is within bounds of both keys).
//
// This specifically excludes "all or nothing" segments, where a segment is
// only included if it reaches a particular width or delimiter. A segment
// resembling the FixedPrefixTransform would be illegal (without other
// assumptions); it must be like CappedPrefixTransform.
//
// This basically matches the notion of parsing prefix codes (see
// https://en.wikipedia.org/wiki/Prefix_code) except we have to include any
// partial segment (code word) at the end whenever an extension to that key
// might produce a full segment. An example would be parsing UTF-8 into
// segments corresponding to encoded code points, where any incomplete code
// at the end must be part of a trailing segment. Note a three-way
// correspondence between
// (a) byte-wise ordering of encoded code points, e.g.
//   { D0 98 E2 82 AC }
//   { E2 82 AC D0 98 }
// (b) lexicographic-then-byte-wise ordering of segments that are each an
//   encoded code point, e.g.
//   {{ D0 98 } { E2 82 AC }}
//   {{ E2 82 AC } { D0 98 }}
// and (c) lexicographic ordering of the decoded code points, e.g.
//   { U+0418 U+20AC }
//   { U+20AC U+0418 }
// The correspondence between (a) and (b) is a result of the segment maximal
// prefix property and is critical for correct application of filters to
// range queries. The correspondence with (c) is a handy attribute of UTF-8
// (with no over-long encodings) and might be useful to the application.
//
// Example types of key segments that can be freely mixed in any order:
// * Capped number of bytes or codewords. The number cap for the segment
//   could be the same for all keys or encoded earlier in the key.
// * Up to *and including* a delimiter byte or codeword.
// * Any/all remaining bytes to the end of the key, though this implies all
//   subsequent segments will be empty.
// As part of the segment maximal prefix property, if the segments do not
// extend to the end of the key, that must be implied by the bytes that are
// in segments, NOT because the potential contents of a segment were considered
// incomplete.
//
// For example, keys might consist of
// * Segment 0: Any sequence of bytes up to and including the first ':'
//   character, or the whole key if no ':' is present.
// * Segment 1: The next four bytes, or less if we reach end of key.
// * Segment 2: An unsigned byte indicating the number of additional bytes in
//   the segment, and then that many bytes (or less up to the end of the key).
// * Segment 3: Any/all remaining bytes in the key
//
// For an example of what can go wrong, consider using '4' as a delimiter
// but not including it with the segment leading up to it. Suppose we have
// these keys and corresponding first segments:
//   "123456" -> "123" (in file 1)
//   "124536" -> "12"  (in file 2)
//   "125436" -> "125" (in file 1)
// Notice how byte-wise comparator ordering of the segments does not follow
// the ordering of the keys. This means we cannot safely use a filter with
// a range of segment values for filtering key range queries. For example,
// we might get a range query for ["123", "125Z") and miss that key "124536"
// in file 2 is in range because its first segment "12" is out of the range
// of the first segments on the bounds, "123" and "125". We cannot even safely
// use this for prefix-like range querying with a Bloom filter on the segments.
// For a query ["12", "124Z"), segment "12" would likely not match the Bloom
// filter in file 1 and miss "123456".
//
// CATEGORIES: The KeySegmentsExtractor is allowed to place keys in categories
// so that different parts of the key space can use different filtering
// strategies. The following property is generally recommended for safe filter
// applicability:
// * CATEGORY CONTIGUOUSNESS PROPERTY: each category is contiguous in
//   comparator order. In other words, any key between two keys of category c
//   must also be in category c.
// An alternative to categories when distinct kinds of keys are interspersed
// is to leave some segments empty when they do not apply to that key.
// Filters are generally set up to handle an empty segment specially so that
// it doesn't interfere with tracking accurate ranges on non-empty occurrences
// of the segment.
//
// = BEYOND THE SIMPLIFIED MODEL =
//
// DETAILED GENERAL REQUIREMENTS (incl OTHER COMPARATORS): The exact
// requirements on a key segments extractor depend on whether and how we use
// filters to answer queries that they cannot answer directly. To understand
// this, we describe
// (A) the types of filters in terms of data they represent and can directly
//     answer queries about,
// (B) the types of read queries that we want to use filters for, and
// (C) the assumptions that need to be satisfied to connect those two.
//
// TYPES OF FILTERS: Although not exhaustive, here are some useful categories
// of filter data:
// * Equivalence class filtering - Represents or over-approximates a set of
//   equivalence classes on keys. The size of the representation is roughly
//   proportional to the number of equivalence classes added. Bloom and ribbon
//   filters are examples.
// * Order-based filtering - Represents one or more subranges of a key space or
//   key segment space. A filter query only requires application of the CF
//   comparator. The size of the representation is roughly proportional to the
//   number of subranges and to the key or segment size. For example, we call a
//   simple filter representing a minimum and a maximum value for a segment a
//   min-max filter.
//
// TYPES OF READ QUERIES and their DIRECT FILTERS:
// * Point query - Whether there {definitely isn't, might be} an entry for a
//   particular key in an SST file (or partition, etc.).
//   The DIRECT FILTER for a point query is an equivalence class filter on the
//   whole key.
// * Range query - Whether there {definitely isn't, might be} any entries
//   within a lower and upper key bound, in an SST file (or partition, etc.).
//   NOTE: For this discussion, we ignore the detail of inclusive vs.
//   exclusive bounds by assuming a generalized notion of "bound" (vs. key)
//   that conveniently represents spaces between keys. For details, see
//   https://github.com/facebook/rocksdb/pull/11434
//   The DIRECT FILTER for a range query is an order-based filter on the whole
//   key (non-empty intersection of bounds/keys). Simple minimum and maximum
//   keys for each SST file are automatically provided by metadata and used in
//   the read path for filtering (as well as binary search indexing).
//   PARTITIONING NOTE: SST metadata partitions do not have recorded minimum
//   and maximum keys, so require some special handling for range query
//   filtering. See https://github.com/facebook/rocksdb/pull/12872 etc.
// * Where clauses - Additional constraints that can be put on range queries.
//   Specifically, a where clause is a tuple <i,j,c,b1,b2> representing that the
//   concatenated sequence of segments from i to j (inclusive) compares between
//   b1 and b2 according to comparator c.
//   EXAMPLE: To represent that the segment of ordinal i is equal to s, that
//   would be <i,i,bytewise_comparator,before(s),after(s)>.
//   NOTE: To represent something like a segment having a particular prefix,
//   you would need to split the key into more segments appropriately. There is
//   little loss of generality because we can combine adjacent segments for
//   specifying where clauses and implementing filters.
//   The DIRECT FILTER for a where clause is an order-based filter on the same
//   sequence of segments and comparator (non-empty intersection of
//   bounds/keys), or in the special case of an equality clause (see example),
//   an equivalence class filter on the sequence of segments.
//
// GENERALIZING FILTERS (INDIRECT):
// * Point queries can utilize essentially any kind of filter by extracting
//   applicable segments of the query key (if not using whole key) and querying
//   the corresponding equivalence class or trivial range.
//   NOTE: There is NO requirement e.g. that the comparator used by the filter
//   match the CF key comparator or similar. The extractor simply needs to be
//   a pure function that does not return "out of bounds" segments.
//   FOR EXAMPLE, a min-max filter on the 4th segment of keys can also be
//   used for filtering point queries (Get/MultiGet) and could be as
//   effective and much more space efficient than a Bloom filter, depending
//   on the workload.
//
// Beyond point queries, we generally expect the key comparator to be a
// lexicographic / big endian ordering at a high level (or the reverse of that
// ordering), while each segment can use an arbitrary comparator.
// FOR EXAMPLE, with a custom key comparator and segments extractor,
// segment 0 could be a 4-byte unsigned little-endian integer, and
// segment 1 could be an 8-byte signed big-endian integer. This framework
// requires segment 0 to come before segment 1 in the key and to take
// precedence in key ordering (i.e. segment 1 order is only consulted when
// keys are equal in segment 0).
//
// * Equivalence class filters can apply to range queries under conditions
//   resembling legacy prefix filtering (prefix_extractor). An equivalence
//   class filter on segments i through j and category set s is applicable to
//   a range query from lb to ub if
//   * All segments through j extracted from lb and ub are equal.
//     NOTE: being in the same filtering equivalence class is insufficient, as
//     that could be unrelated inputs with a hash collision. Here we are
//     omitting details that would formally accommodate comparators in which
//     different bytes can be considered equal.
//   * The categories of lb and ub are in the category set s.
//   * COMMON SEGMENT PREFIX PROPERTY (for all x, y, z; params j, s): if
//     * Keys x and z have equal segments up through ordinal j, and
//     * Keys x and z are in categories in category set s, and
//     * Key y is ordered x < y < z according to the CF comparator,
//     then both
//     * Key y has equal segments up through ordinal j (compared to x and z)
//     * Key y is in a category in category set s
//     (This is implied by the SEGMENT MAXIMAL PREFIX PROPERTY in the
//     simplified model.)
//
// * Order-based filters on segments (rather than whole key) can apply to range
//   queries (with "whole key" bounds). Specifically, an order-based filter on
//   segments i through j and category set s is applicable to a range query
//   from lb to ub if
//   * All segments through i-1 extracted from lb and ub are equal
//   * The categories of lb and ub are in the category set s.
//   * SEGMENT ORDERING PROPERTY for ordinal i through j, segments
//     comparator c, category set s, for all x, y, and z: if
//     * Keys x and z have equal segments up through ordinal i-1, and
//     * Keys x and z are in categories in category set s, and
//     * Key y is ordered x < y < z according to the CF comparator,
//     then both
//     * The common segment prefix property is satisfied through ordinal i-1
//       and with category set s
//     * x_i..j <= y_i..j <= z_i..j according to segment comparator c, where
//       x_i..j is the concatenation of segments i through j of key x (etc.).
//     (This is implied by the SEGMENT MAXIMAL PREFIX PROPERTY in the
//     simplified model.)
//
// INTERESTING EXAMPLES:
// Consider a segment encoding called BadVarInt1 in which a byte with
// highest-order bit 1 means "start a new segment". Also consider BadVarInt0
// which starts a new segment on highest-order bit 0.
//
// Configuration: bytewise comp, BadVarInt1 format for segments 0-3 with
// segment 3 also continuing to the end of the key
//   x = 0x 20 21|82 23|||
//   y = 0x 20 21|82 23 24|85||
//   z = 0x 20 21|82 23|84 25||
//
// For i=j=1, this set of keys violates the common segment prefix property and
// the segment ordering property, so can lead to incorrect equivalence class
// filtering or order-based filtering.
//
// Suppose we modify the configuration so that "short" keys (empty in segment
// 2) are placed in an unfiltered category. In that case, x above doesn't meet
// the precondition for being limited by segment properties. Consider these
// keys instead:
//   x = 0x 20 21|82 23 24|85||
//   y = 0x 20 21|82 23 24|85 26|87|
//   z = 0x 20 21|82 23 24|85|86|
//   m = 0x 20 21|82 23 25|85|86|
//   n = 0x 20 21|82 23|84 25||
//
// Although segment 1 values might be out of order with key order,
// re-categorizing the short keys has allowed satisfying the common segment
// prefix property with j=1 (and with j=0), so we can use equivalence class
// filters on segment 1, or 0, or 0 to 1. However, violation of the segment
// ordering property on i=j=1 (see z, m, n) means we can't use order-based
// filtering.
//
//   p = 0x 20 21|82 23|84 25 26||
//   q = 0x 20 21|82 23|84 25|86|
//
// But keys can still be short from segment 2 to 3, and thus we are violating
// the common segment prefix property for segment 2 (see n, p, q).
//
// Configuration: bytewise comp, BadVarInt0 format for segments 0-3 with
// segment 3 also continuing to the end of the key. No short key category.
//   x = 0x 80 81|22 83|||
//   y = 0x 80 81|22 83|24 85||
//   z = 0x 80 81|22 83 84|25||
//   m = 0x 80 82|22 83|||
//   n = 0x 80 83|22 84|24 85||
//
// Even though this violates the segment maximal prefix property of the
// simplified model, the common segment prefix property and segment ordering
// property are satisfied for the various segment ordinals. In broader terms,
// the usual rule of the delimiter going with the segment before it can be
// violated if every byte value below some threshold starts a segment. (This
// has not been formally verified and is not recommended.)
//
// Suppose that we are paranoid, however, and decide to place short keys
// (empty in segment 2) into an unfiltered category. This is potentially a
// dangerous decision because loss of continuity at least affects the
// ability to filter on segment 0 (common segment prefix property violated
// with i=j=0; see z, m, n; m not in category set). Thus, excluding short keys
// with categories is not a recommended solution either.
class KeySegmentsExtractor {
 public:
  // The extractor assigns keys to categories so that it is easier to
  // combine distinct (though disjoint) key representations within a single
  // column family while applying different or overlapping filtering
  // configurations to the categories.
  // To enable fast set representation, the user is allowed up to 64
  // categories for assigning to keys with the extractor. The user will
  // likely cast to their own enum type or scalars.
  enum KeyCategory : uint_fast8_t {
    kDefaultCategory = 0,
    kMinCategory = kDefaultCategory,
    // ... (user categories)
    // Can be used for keys ordered before typical keys. Not necessarily an
    // error.
    kReservedLowCategory = 62,
    // Can be used for keys ordered after typical keys. Not necessarily an
    // error.
    kReservedHighCategory = 63,
    kMaxUsableCategory = kReservedHighCategory,

    // Signals to the caller that an unexpected input or condition has
    // been reached and filter construction should be aborted.
    kErrorCategoryFilterScope = UINT8_MAX - 2,
    kMinErrorCategory = kErrorCategoryFilterScope,
    // Signals to the caller that an unexpected input or condition has
    // been reached and SST construction (and compaction or flush)
    // should be aborted.
    kErrorCategoryFileScope = UINT8_MAX - 1,
    // Signals to the caller that an unexpected input or condition has
    // been reached and the DB should be considered to have reached an
    // invalid state, at least in memory.
    kErrorCategoryDbScope = UINT8_MAX,
  };
  using KeyCategorySet = SmallEnumSet<KeyCategory, kMaxUsableCategory>;

  // The extractor can process three kinds of key-like inputs
  enum KeyKind {
    // User key, not including user timestamp
    kFullUserKey,
    // An iterator lower bound (inclusive). This should generally be handled
    // the same as a full user key but the distinction might be useful for
    // diagnostics or assertions.
    kInclusiveLowerBound,
    // An iterator upper bound (exclusive). Upper bounds are frequently
    // constructed by incrementing the last byte of a key prefix, and this can
    // affect what should be considered as a segment delimiter.
    kExclusiveUpperBound,
  };

  // The extractor result
  struct Result {
    // Positions in the key (or bound) that represent boundaries
    // between segments, or the exclusive end of each segment. For example, if
    // the key is "abc|123|xyz" then following the guidance of including
    // delimiters with the preceding segment, segment_ends would be {4, 8, 11},
    // representing segments "abc|" "123|" and "xyz". Empty segments are
    // naturally represented with repeated values, as in {4, 8, 8} for
    // "abc|123|", though {4, 8} would be logically equivalent because an
    // infinite sequence of 0-length segments is assumed after what is
    // explicitly represented here. However, segments might not reach the end
    // of the key (no automatic last segment to the end of the key) and that
    // is OK for the WEAK ordering property.
    //
    // The first segment automatically starts at key position 0. The only way
    // to put gaps between segments of interest is to assign those gaps to
    // numbered segments, which can be left unused.
    std::vector<uint32_t> segment_ends;

    // A category to assign to the key or bound. This default may be kept,
    // such as to put all keys into a single category.
    // IMPORTANT CURRENT LIMITATION from above: each category must be
    // contiguous in key comparator order, so any key between two keys in
    // category c must also be in category c. (Typically the category will be
    // determined by segment 0 in some way, often the first byte.) The enum
    // scalar values do not need to be related to key order.
    KeyCategory category = kDefaultCategory;

    void Reset() {
      segment_ends.clear();
      category = kDefaultCategory;
    }
  };

  virtual ~KeySegmentsExtractor() {}

  // A class name for this extractor. See also expectations in GetId().
  virtual const char* Name() const = 0;

  // An identifying string that is permanently associated with the behavior
  // of this extractor. If a behavior change is made or set in the constructor,
  // the id must change to avoid incorrect filtering behavior on DBs using a
  // previous version of the extractor.
  virtual std::string GetId() const {
    // The default implementation assumes no configuration variance in the
    // constructor and just returns the class name.
    return Name();
  }

  // Populates the extraction result. Errors can be signaled
  // with `kError` pseudo-categories. This function is expected to generate
  // non-error results (though possibly empty) on all keys or bounds expected
  // to be encountered by the DB. RocksDB will always call the function with
  // a (pointer to a) default-initialized result object.
  virtual void Extract(const Slice& key_or_bound, KeyKind kind,
                       Result* result) const = 0;
};

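// Example subclass (an illustrative sketch, not part of this header):
// segment 0 is all bytes up to and including the first ':', or the whole
// key if no ':' is present, which satisfies the segment maximal prefix
// property described above.
//
//   class ColonPrefixExtractor : public KeySegmentsExtractor {
//    public:
//     const char* Name() const override { return "ColonPrefixExtractor"; }
//     void Extract(const Slice& key_or_bound, KeyKind /*kind*/,
//                  Result* result) const override {
//       size_t i = 0;
//       // The delimiter is kept with the preceding segment, per the
//       // guidance above; short keys yield the whole key as segment 0.
//       while (i < key_or_bound.size() && key_or_bound[i] != ':') {
//         ++i;
//       }
//       result->segment_ends = {
//           static_cast<uint32_t>(std::min(i + 1, key_or_bound.size()))};
//     }
//   };
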
// Constructs a KeySegmentsExtractor for fixed-width key segments that safely
// handles short keys by truncating segments at the end of the input key.
// See comments on KeySegmentsExtractor for why this is much safer for
// filtering than "all or nothing" fixed-size segments. This is essentially
// a generalization of (New)CappedPrefixTransform.
std::shared_ptr<const KeySegmentsExtractor>
MakeSharedCappedKeySegmentsExtractor(const std::vector<size_t>& byte_widths);

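// For example (illustrative): segment 0 is the first 4 bytes of the key and
// segment 1 is the next 8 bytes, each truncated at the end of shorter keys:
//
//   auto extractor = MakeSharedCappedKeySegmentsExtractor({4, 8});
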
// Alternatives for filtering inputs

// An individual key segment.
struct SelectKeySegment {
  // Segments are numbered starting from 0.
  explicit SelectKeySegment(uint16_t _segment_index)
      : segment_index(_segment_index) {}
  uint16_t segment_index;
};

// A range of key segments concatenated together. No concatenation operations
// are needed, however, because with no gaps between segments, a range of
// segments is a simple substring of the key.
struct SelectKeySegmentRange {
  // Segments are numbered starting from 0. Range is inclusive.
  explicit SelectKeySegmentRange(uint8_t _from_segment_index,
                                 uint8_t _to_segment_index)
      : from_segment_index(_from_segment_index),
        to_segment_index(_to_segment_index) {}
  // Inclusive
  uint8_t from_segment_index;
  // Inclusive
  uint8_t to_segment_index;
};

// User key without timestamp
struct SelectWholeKey {};

// TODO: The remaining Select* are not yet supported
// As generated by prefix_extractor
struct SelectLegacyKeyPrefix {};

struct SelectUserTimestamp {};

struct SelectColumnName {};

// NOTE: more variants might be added in the future.
// NOTE2: filtering on values is not supported because it could easily break
// overwrite semantics. (Filter out SST with newer, non-matching value but
// see obsolete value that does match.)
using FilterInput =
    std::variant<SelectWholeKey, SelectKeySegment, SelectKeySegmentRange,
                 SelectLegacyKeyPrefix, SelectUserTimestamp, SelectColumnName>;

// Base class for individual filtering schemes in terms of chosen
// FilterInputs, but not tied to a particular KeySegmentsExtractor.
//
// Not user extensible, name sometimes shortened to SQFC
class SstQueryFilterConfig {
 public:
  virtual ~SstQueryFilterConfig() {}
};

// A filtering scheme that stores minimum and maximum values (according
// to bytewise ordering) of the specified filter input. Because the
// empty string is often a special case, the filter excludes that from the
// min/max computation and keeps a separate boolean for whether empty is
// present.
//
// The filter is also limited to the specified categories, ignoring entries
// outside the given set of categories. If not All, ranges can only be
// filtered if upper and lower bounds are in the same category (and that
// category is in the set relevant to the filter).
std::shared_ptr<SstQueryFilterConfig> MakeSharedBytewiseMinMaxSQFC(
    FilterInput select, KeySegmentsExtractor::KeyCategorySet categories =
                            KeySegmentsExtractor::KeyCategorySet::All());

std::shared_ptr<SstQueryFilterConfig> MakeSharedReverseBytewiseMinMaxSQFC(
    FilterInput select, KeySegmentsExtractor::KeyCategorySet categories =
                            KeySegmentsExtractor::KeyCategorySet::All());

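// For example (illustrative): a min-max filter on key segment 1, applied to
// all categories (the default):
//
//   auto sqfc = MakeSharedBytewiseMinMaxSQFC(SelectKeySegment(1));
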
// TODO: more kinds of filters, eventually including Bloom/ribbon filters
// and replacing the old filter configuration APIs

// Represents a complete strategy for representing filters in SST files
// and applying them to optimize range and point queries by excluding
// irrelevant SST files (as best we can). This is a set of filtering
// schemes and a KeySegmentsExtractor. For performance, a single extractor
// should be implemented to meet all the filtering needs of any given
// column family. KeySegmentsExtractor and FilterInput should be flexible
// enough that there is no loss of generality, e.g. with leaving segments
// blank and using segment ranges.
struct SstQueryFilterConfigs {
  std::vector<std::shared_ptr<SstQueryFilterConfig>> filters;
  std::shared_ptr<const KeySegmentsExtractor> extractor;

  // Whether this object represents an empty set of configs because no
  // applicable configurations were found. (This case is represented by
  // an internal singleton instance.)
  bool IsEmptyNotFound() const;
};

// SstQueryFilterConfigsManager provides facilities for safe and effective
// filtering version management, with simple dynamic upgrade/downgrade
// support. It is designed to encourage a development pattern that
// minimizes the risk of filter and extractor versioning bugs.
//
// SstQueryFilterConfigsManager is essentially an immutable mapping
// from {config_name_string, version_number} -> SstQueryFilterConfigs
// for some contiguous range of version numbers. It is also a starting
// point for specifying which configuration should be used, with awareness
// of other configurations that might already be persisted in SST files
// or switched to dynamically.
//
// Background: a single codebase might have many use cases and for
// each use case, a sequence of past, current, and future filtering
// configurations. It is common for future configurations to be
// implemented before being automatically deployed, in order to ensure that
// a DB can be effectively opened and operated on by a recent older code
// version. And it is common to maintain a reasonable history of past
// configurations to ensure a smooth upgrade path and proper handling of
// older SST files that haven't been compacted recently.
//
// It would be possible to make SstQueryFilterConfigs dynamically
// configurable through strings, but that would encourage deployment
// of ad-hoc, under-tested configurations.
//
// Solution: the {config_name_string, version_number} -> SstQueryFilterConfigs
// mapping in SstQueryFilterConfigsManager formalizes the history (past and
// future) of approved/tested configurations for a given use case. Filter
// configurations are kept paired with extractors that they make sense with.
//
// The version numbers are "global" to the SstQueryFilterConfigsManager so
// that it is simple to determine whether a particular code version supports
// a particular filtering version, regardless of which use case. Numbering
// always starts with 1, as 0 is reserved for selecting a "no filters"
// configuration.
//
// Consider an example initialized with this Data:
//
//   SstQueryFilterConfigsManager::Data data = {
//       {1, {{"foo", foo_configs}}},
//       {2, {{"bar", bar_configs},
//            {"baz", baz_configs}}},
//       {3, {{"bar", bar_configs_v2}}},
//   };
//
// For example, MakeSharedFactory(..., "baz", 1) will use a default empty
// config, while both MakeSharedFactory(..., "baz", 2) and
// MakeSharedFactory(..., "baz", 3) select `baz_configs`. Selecting version
// >= 4 is rejected because those configurations are not known to this
// version of the code.
//
// For correct operation, existing versions should be treated as immutable
// (once committed to where they could enter production). For example, an
// update to "baz" should be done in a new version (4), not by amending
// version 3. Note also (from before) that the behavior of named extractors
// must not change, so changes to key segment extraction should introduce
// new named extractors while keeping the old in the older configs.
//
// It is possible to eventually remove the oldest versions, as long as
// * You won't be rolling back to that version.
// * You don't have any SST files using an extractor that is only available
//   in that version (and prior).
// * For each use case and version you might roll back to, you aren't removing
//   the configuration in effect for that version. In our example above, we
//   cannot simply remove version 1 because that would change the configuration
//   of "foo" at version 2. Moving {"foo", foo_configs} to version 2 would be
//   an acceptable work-around for retiring version 1.
// * There are no gaps in the version numbers specified. Even if you completely
//   retire a use case and want to remove relevant code, you still need to keep
//   an explicit mapping, even if it's empty, as in `{3, {}}` if retiring "bar".
//
// Internally, the SstQueryFilterConfigsManager greatly simplifies lifetime
// management for relevant objects such as KeySegmentsExtractors, and provides
// a lighter weight (and less troublesome) mechanism for relevant named object
// look-up vs. ObjectRegistry. If following the guidelines above, any extractor
// referenced in a read SST file should also be referenced by the
// SstQueryFilterConfigsManager.
//
// Not user extensible
class SstQueryFilterConfigsManager
    : public std::enable_shared_from_this<SstQueryFilterConfigsManager> {
 public:
  using FilteringVersion = uint32_t;
  using NamedConfigs = std::pair<std::string, SstQueryFilterConfigs>;
  using Data =
      std::vector<std::pair<FilteringVersion, std::vector<NamedConfigs>>>;

  static Status MakeShared(const Data& data,
                           std::shared_ptr<SstQueryFilterConfigsManager>* out);

  virtual ~SstQueryFilterConfigsManager() {}

  // EXPERIMENTAL/TEMPORARY: hook into table properties for persisting
  // filters and table_filter for applying to range queries.

  class Factory : public TablePropertiesCollectorFactory {
   public:
    // Modify the target filtering version for new filters. Returns
    // non-OK if the version is not supported. Thread-safe.
    virtual Status SetFilteringVersion(FilteringVersion ver) = 0;
    virtual FilteringVersion GetFilteringVersion() const = 0;

    // The configs_name used to create this Factory. Immutable.
    virtual const std::string& GetConfigsName() const = 0;

    // The relevant configs from the SstQueryFilterConfigsManager for
    // the ConfigsName and FilteringVersion.
    virtual const SstQueryFilterConfigs& GetConfigs() const = 0;

    // The buffers pointed to by the Slices must live as long as any read
    // operations using this table filter function.
    // Can read and process any filters created under this
    // SstQueryFilterConfigsManager but is most efficient when using the
    // same KeySegmentsExtractor as this Factory's configs.
    // (That performance optimization is the only reason this function is here
    // rather than in SstQueryFilterConfigsManager.)
    virtual std::function<bool(const TableProperties&)>
    GetTableFilterForRangeQuery(Slice lower_bound_incl,
                                Slice upper_bound_excl) const = 0;
  };

  // Returns OK and creates a Factory as long as `ver` is in the
  // supported range or 0 (always empty/not found). If the particular
  // config_name is not found under that version, then
  // factory->GetConfigs().IsEmptyNotFound() will be true. Such a factory can
  // read filters but will not write any filters.
  virtual Status MakeSharedFactory(const std::string& configs_name,
                                   FilteringVersion ver,
                                   std::shared_ptr<Factory>* out) const = 0;
};

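// Example end-to-end usage (an illustrative sketch; "myuse", `extractor`,
// `cf_options`, and `read_options` are hypothetical):
//
//   SstQueryFilterConfigs configs = {
//       {MakeSharedBytewiseMinMaxSQFC(SelectKeySegment(0))}, extractor};
//   SstQueryFilterConfigsManager::Data data = {{1, {{"myuse", configs}}}};
//   std::shared_ptr<SstQueryFilterConfigsManager> mgr;
//   Status s = SstQueryFilterConfigsManager::MakeShared(data, &mgr);
//   std::shared_ptr<SstQueryFilterConfigsManager::Factory> factory;
//   if (s.ok()) {
//     s = mgr->MakeSharedFactory("myuse", /*ver=*/1, &factory);
//   }
//   // Persist filters in newly written SST files:
//   cf_options.table_properties_collector_factories.push_back(factory);
//   // Apply persisted filters to range queries:
//   read_options.table_filter =
//       factory->GetTableFilterForRangeQuery(lower_bound, upper_bound);
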
}  // namespace experimental
}  // namespace ROCKSDB_NAMESPACE