Summary: Add file size validation in the ReadFooterFromFile function. Deprecate the skip_checking_sst_file_sizes_on_db_open option. This change addresses the issue https://github.com/facebook/rocksdb/issues/13619. It adds file size validation to ReadFooterFromFile. As a result of this change, the CheckConsistency function and the skip_checking_sst_file_sizes_on_db_open flag are deprecated. The CheckConsistency function checks, during DB open, that each file's size matches what was recorded in the manifest. Meanwhile, ReadFooterFromFile is called for each file in the LoadTables function. Since ReadFooterFromFile now always validates the file size, CheckConsistency is redundant. In addition, CheckConsistency is executed in a single thread. This could slow down DB open when a network file system is used, which is why the flag skip_checking_sst_file_sizes_on_db_open was added to skip this check. After this change, ReadFooterFromFile is executed in parallel by multiple threads. Therefore, the concern about DB open slowness is eliminated, and the flag can be deprecated. When the paranoid check flag is set to true, a corrupted file will cause DB open to fail. When the paranoid check flag is set to false, the DB will still open; the healthy files can be accessed, while the corrupted ones cannot. There are two minor concerns with this change. * If max_open_files is set to a smaller value, the engine will not open all the files during DB open. This means that a file size corruption will not be detected during DB open, but rather at a later time. Since the default is -1, which means open all the files, and it is rarely overridden and many new features rely on it being -1, the risk is very low. * If FIFO compaction is used, the engine could fail to open the DB unnecessarily because of corrupted files that would never be used again. However, this is a very rare case as well. The error can still be ignored by setting paranoid_checks operationally. The risk is very low.
To maintain backward compatibility, the public-facing flag was kept and marked as a no-op internally. Another change is required to fully remove the flag. Pull Request resolved: https://github.com/facebook/rocksdb/pull/13676 Test Plan: make check. A new unit test was added to validate that the file size check API works as expected. Reviewed By: pdillinger Differential Revision: D76168033 Pulled By: xingbowang fbshipit-source-id: 8ceacf39bcfe02ff7aa289868c341366ee9f3a8e
508 lines
17 KiB
C++
508 lines
17 KiB
C++
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
//
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
#pragma once
|
|
|
|
#include <stdint.h>
|
|
#include <windows.h>
|
|
|
|
#include <mutex>
|
|
#include <string>
|
|
|
|
#include "rocksdb/file_system.h"
|
|
#include "rocksdb/status.h"
|
|
#include "util/aligned_buffer.h"
|
|
#include "util/string_util.h"
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
namespace port {
|
|
|
|
// Produces the string used as the message for a Windows error code `err`; the
// IOStatus helpers in this header pass its result as their second argument.
// Declared here only; the definition lives outside this header.
std::string GetWindowsErrSz(DWORD err);
|
|
|
|
// Converts a Windows error code into an IOStatus that carries `context` and
// the text produced by GetWindowsErrSz(err):
//   - ERROR_HANDLE_DISK_FULL / ERROR_DISK_FULL      -> IOStatus::NoSpace
//   - ERROR_FILE_NOT_FOUND / ERROR_PATH_NOT_FOUND   -> IOStatus::PathNotFound
//   - any other code                                -> IOStatus::IOError
inline IOStatus IOErrorFromWindowsError(const std::string& context, DWORD err) {
  switch (err) {
    case ERROR_HANDLE_DISK_FULL:
    case ERROR_DISK_FULL:
      return IOStatus::NoSpace(context, GetWindowsErrSz(err));
    case ERROR_FILE_NOT_FOUND:
    case ERROR_PATH_NOT_FOUND:
      return IOStatus::PathNotFound(context, GetWindowsErrSz(err));
    default:
      return IOStatus::IOError(context, GetWindowsErrSz(err));
  }
}
|
|
|
|
// Builds an IOStatus for `context` from the calling thread's current
// GetLastError() value by delegating to IOErrorFromWindowsError().
inline IOStatus IOErrorFromLastWindowsError(const std::string& context) {
  const DWORD last_error = GetLastError();
  return IOErrorFromWindowsError(context, last_error);
}
|
|
|
|
// Converts an errno-style error number into an IOStatus that carries `context`
// and the text returned by errnoStr(err_number):
//   - ENOSPC      -> IOStatus::NoSpace
//   - ENOENT      -> IOStatus::PathNotFound
//   - otherwise   -> IOStatus::IOError
inline IOStatus IOError(const std::string& context, int err_number) {
  const auto message = errnoStr(err_number);
  if (err_number == ENOSPC) {
    return IOStatus::NoSpace(context, message.c_str());
  }
  if (err_number == ENOENT) {
    return IOStatus::PathNotFound(context, message.c_str());
  }
  return IOStatus::IOError(context, message.c_str());
}
|
|
|
|
class WinFileData;
|
|
|
|
// Declaration only; the definition is outside this header.
// NOTE(review): presumably writes `data` to the file described by `file_data`
// at byte position `offset` and reports the count through `bytes_written` --
// inferred from the names, confirm against the definition.
IOStatus pwrite(const WinFileData* file_data, const Slice& data,
|
|
uint64_t offset, size_t& bytes_written);
|
|
|
|
// Declaration only; the definition is outside this header.
// NOTE(review): presumably reads up to `num_bytes` bytes from the file
// described by `file_data` at byte position `offset` into `src` and reports
// the count through `bytes_read` -- inferred from the names, confirm against
// the definition.
IOStatus pread(const WinFileData* file_data, char* src, size_t num_bytes,
|
|
uint64_t offset, size_t& bytes_read);
|
|
|
|
// Declaration only; the definition is outside this header.
// NOTE(review): presumably reserves space for the file behind `hFile` up to
// `to_size` bytes, with `filename` used for error context -- confirm against
// the definition.
IOStatus fallocate(const std::string& filename, HANDLE hFile, uint64_t to_size);
|
|
|
|
// Declaration only; the definition is outside this header.
// NOTE(review): presumably sets the size of the file behind `hFile` to
// `toSize` bytes, with `filename` used for error context -- confirm against
// the definition.
IOStatus ftruncate(const std::string& filename, HANDLE hFile, uint64_t toSize);
|
|
|
|
// Declaration only; the definition is outside this header.
// NOTE(review): presumably writes an identifier for the file behind `hFile`
// into `id` (at most `max_size` bytes) and returns the number of bytes
// written -- confirm against the definition.
size_t GetUniqueIdFromFile(HANDLE hFile, char* id, size_t max_size);
|
|
|
|
class WinFileData {
|
|
protected:
|
|
const std::string filename_;
|
|
HANDLE hFile_;
|
|
// If true, the I/O issued would be direct I/O which the buffer
|
|
// will need to be aligned (not sure there is a guarantee that the buffer
|
|
// passed in is aligned).
|
|
const bool use_direct_io_;
|
|
const size_t sector_size_;
|
|
|
|
public:
|
|
// We want this class be usable both for inheritance (prive
|
|
// or protected) and for containment so __ctor and __dtor public
|
|
WinFileData(const std::string& filename, HANDLE hFile, bool direct_io);
|
|
|
|
virtual ~WinFileData() { this->CloseFile(); }
|
|
|
|
bool CloseFile() {
|
|
bool result = true;
|
|
|
|
if (hFile_ != NULL && hFile_ != INVALID_HANDLE_VALUE) {
|
|
result = ::CloseHandle(hFile_);
|
|
assert(result);
|
|
hFile_ = NULL;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
const std::string& GetName() const { return filename_; }
|
|
|
|
HANDLE GetFileHandle() const { return hFile_; }
|
|
|
|
bool use_direct_io() const { return use_direct_io_; }
|
|
|
|
size_t GetSectorSize() const { return sector_size_; }
|
|
|
|
bool IsSectorAligned(const size_t off) const;
|
|
|
|
WinFileData(const WinFileData&) = delete;
|
|
WinFileData& operator=(const WinFileData&) = delete;
|
|
};
|
|
|
|
class WinSequentialFile : protected WinFileData, public FSSequentialFile {
|
|
// Override for behavior change when creating a custom env
|
|
virtual IOStatus PositionedReadInternal(char* src, size_t numBytes,
|
|
uint64_t offset,
|
|
size_t& bytes_read) const;
|
|
|
|
public:
|
|
WinSequentialFile(const std::string& fname, HANDLE f,
|
|
const FileOptions& options);
|
|
|
|
~WinSequentialFile();
|
|
|
|
WinSequentialFile(const WinSequentialFile&) = delete;
|
|
WinSequentialFile& operator=(const WinSequentialFile&) = delete;
|
|
|
|
IOStatus Read(size_t n, const IOOptions& options, Slice* result,
|
|
char* scratch, IODebugContext* dbg) override;
|
|
IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options,
|
|
Slice* result, char* scratch,
|
|
IODebugContext* dbg) override;
|
|
|
|
IOStatus Skip(uint64_t n) override;
|
|
|
|
IOStatus InvalidateCache(size_t offset, size_t length) override;
|
|
|
|
bool use_direct_io() const override { return WinFileData::use_direct_io(); }
|
|
};
|
|
|
|
// mmap() based random-access
|
|
class WinMmapReadableFile : private WinFileData, public FSRandomAccessFile {
|
|
HANDLE hMap_;
|
|
|
|
const void* mapped_region_;
|
|
const size_t length_;
|
|
|
|
public:
|
|
// mapped_region_[0,length-1] contains the mmapped contents of the file.
|
|
WinMmapReadableFile(const std::string& fileName, HANDLE hFile, HANDLE hMap,
|
|
const void* mapped_region, size_t length);
|
|
|
|
~WinMmapReadableFile();
|
|
|
|
WinMmapReadableFile(const WinMmapReadableFile&) = delete;
|
|
WinMmapReadableFile& operator=(const WinMmapReadableFile&) = delete;
|
|
|
|
IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
|
|
Slice* result, char* scratch,
|
|
IODebugContext* dbg) const override;
|
|
|
|
IOStatus InvalidateCache(size_t offset, size_t length) override;
|
|
|
|
size_t GetUniqueId(char* id, size_t max_size) const override;
|
|
|
|
IOStatus GetFileSize(uint64_t* file_size) override;
|
|
};
|
|
|
|
// We preallocate and use memcpy to append new
|
|
// data to the file. This is safe since we either properly close the
|
|
// file before reading from it, or for log files, the reading code
|
|
// knows enough to skip zero suffixes.
|
|
class WinMmapFile : private WinFileData, public FSWritableFile {
|
|
private:
|
|
HANDLE hMap_;
|
|
|
|
const size_t page_size_; // We flush the mapping view in page_size
|
|
// increments. We may decide if this is a memory
|
|
// page size or SSD page size
|
|
const size_t
|
|
allocation_granularity_; // View must start at such a granularity
|
|
|
|
size_t reserved_size_; // Preallocated size
|
|
|
|
size_t mapping_size_; // The max size of the mapping object
|
|
// we want to guess the final file size to minimize the remapping
|
|
size_t view_size_; // How much memory to map into a view at a time
|
|
|
|
char* mapped_begin_; // Must begin at the file offset that is aligned with
|
|
// allocation_granularity_
|
|
char* mapped_end_;
|
|
char* dst_; // Where to write next (in range [mapped_begin_,mapped_end_])
|
|
char* last_sync_; // Where have we synced up to
|
|
|
|
uint64_t file_offset_; // Offset of mapped_begin_ in file
|
|
|
|
// Do we have unsynced writes?
|
|
bool pending_sync_;
|
|
|
|
// Can only truncate or reserve to a sector size aligned if
|
|
// used on files that are opened with Unbuffered I/O
|
|
IOStatus TruncateFile(uint64_t toSize);
|
|
|
|
IOStatus UnmapCurrentRegion();
|
|
|
|
IOStatus MapNewRegion(const IOOptions& options, IODebugContext* dbg);
|
|
|
|
virtual IOStatus PreallocateInternal(uint64_t spaceToReserve);
|
|
|
|
public:
|
|
WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
|
|
size_t allocation_granularity, const FileOptions& options);
|
|
|
|
~WinMmapFile();
|
|
|
|
WinMmapFile(const WinMmapFile&) = delete;
|
|
WinMmapFile& operator=(const WinMmapFile&) = delete;
|
|
|
|
IOStatus Append(const Slice& data, const IOOptions& options,
|
|
IODebugContext* dbg) override;
|
|
IOStatus Append(const Slice& data, const IOOptions& opts,
|
|
const DataVerificationInfo& /* verification_info */,
|
|
IODebugContext* dbg) override {
|
|
return Append(data, opts, dbg);
|
|
}
|
|
|
|
// Means Close() will properly take care of truncate
|
|
// and it does not need any additional information
|
|
IOStatus Truncate(uint64_t size, const IOOptions& options,
|
|
IODebugContext* dbg) override;
|
|
|
|
IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
// Flush only data
|
|
IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
/**
|
|
* Flush data as well as metadata to stable storage.
|
|
*/
|
|
IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
/**
|
|
* Get the size of valid data in the file. This will not match the
|
|
* size that is returned from the filesystem because we use mmap
|
|
* to extend file by map_size every time.
|
|
*/
|
|
uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
IOStatus InvalidateCache(size_t offset, size_t length) override;
|
|
|
|
IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
|
|
IODebugContext* dbg) override;
|
|
|
|
size_t GetUniqueId(char* id, size_t max_size) const override;
|
|
};
|
|
|
|
class WinRandomAccessImpl {
|
|
protected:
|
|
WinFileData* file_base_;
|
|
size_t alignment_;
|
|
|
|
// Override for behavior change when creating a custom env
|
|
virtual IOStatus PositionedReadInternal(char* src, size_t numBytes,
|
|
uint64_t offset,
|
|
size_t& bytes_read) const;
|
|
|
|
WinRandomAccessImpl(WinFileData* file_base, size_t alignment,
|
|
const FileOptions& options);
|
|
|
|
virtual ~WinRandomAccessImpl() {}
|
|
|
|
IOStatus ReadImpl(uint64_t offset, size_t n, Slice* result,
|
|
char* scratch) const;
|
|
|
|
size_t GetAlignment() const { return alignment_; }
|
|
|
|
public:
|
|
WinRandomAccessImpl(const WinRandomAccessImpl&) = delete;
|
|
WinRandomAccessImpl& operator=(const WinRandomAccessImpl&) = delete;
|
|
};
|
|
|
|
// pread() based random-access
|
|
class WinRandomAccessFile
|
|
: private WinFileData,
|
|
protected WinRandomAccessImpl, // Want to be able to override
|
|
// PositionedReadInternal
|
|
public FSRandomAccessFile {
|
|
public:
|
|
WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
|
const FileOptions& options);
|
|
|
|
~WinRandomAccessFile();
|
|
|
|
IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
|
|
Slice* result, char* scratch,
|
|
IODebugContext* dbg) const override;
|
|
|
|
size_t GetUniqueId(char* id, size_t max_size) const override;
|
|
|
|
bool use_direct_io() const override { return WinFileData::use_direct_io(); }
|
|
|
|
IOStatus InvalidateCache(size_t offset, size_t length) override;
|
|
|
|
size_t GetRequiredBufferAlignment() const override;
|
|
|
|
IOStatus GetFileSize(uint64_t* file_size) override;
|
|
};
|
|
|
|
// This is a sequential write class. It has been mimicked (as others) after
|
|
// the original Posix class. We add support for unbuffered I/O on windows as
|
|
// well
|
|
// we utilize the original buffer as an alignment buffer to write directly to
|
|
// file with no buffering.
|
|
// No buffering requires that the provided buffer is aligned to the physical
|
|
// sector size (SSD page size) and
|
|
// that all SetFilePointer() operations to occur with such an alignment.
|
|
// We thus always write in sector/page size increments to the drive and leave
|
|
// the tail for the next write OR for Close() at which point we pad with zeros.
|
|
// No padding is required for
|
|
// buffered access.
|
|
class WinWritableImpl {
|
|
protected:
|
|
WinFileData* file_data_;
|
|
const uint64_t alignment_;
|
|
uint64_t
|
|
next_write_offset_; // Needed because Windows does not support O_APPEND
|
|
uint64_t reservedsize_; // how far we have reserved space
|
|
|
|
virtual IOStatus PreallocateInternal(uint64_t spaceToReserve);
|
|
|
|
WinWritableImpl(WinFileData* file_data, size_t alignment);
|
|
|
|
~WinWritableImpl() {}
|
|
|
|
uint64_t GetAlignment() const { return alignment_; }
|
|
|
|
IOStatus AppendImpl(const Slice& data);
|
|
|
|
// Requires that the data is aligned as specified by
|
|
// GetRequiredBufferAlignment()
|
|
IOStatus PositionedAppendImpl(const Slice& data, uint64_t offset);
|
|
|
|
IOStatus TruncateImpl(uint64_t size);
|
|
|
|
IOStatus CloseImpl();
|
|
|
|
IOStatus SyncImpl(const IOOptions& options, IODebugContext* dbg);
|
|
|
|
uint64_t GetFileNextWriteOffset() {
|
|
// Double accounting now here with WritableFileWriter
|
|
// and this size will be wrong when unbuffered access is used
|
|
// but tests implement their own writable files and do not use
|
|
// WritableFileWrapper
|
|
// so we need to squeeze a square peg through
|
|
// a round hole here.
|
|
return next_write_offset_;
|
|
}
|
|
|
|
IOStatus AllocateImpl(uint64_t offset, uint64_t len);
|
|
|
|
public:
|
|
WinWritableImpl(const WinWritableImpl&) = delete;
|
|
WinWritableImpl& operator=(const WinWritableImpl&) = delete;
|
|
};
|
|
|
|
class WinWritableFile : private WinFileData,
|
|
protected WinWritableImpl,
|
|
public FSWritableFile {
|
|
public:
|
|
WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
|
size_t capacity, const FileOptions& options);
|
|
|
|
~WinWritableFile();
|
|
|
|
IOStatus Append(const Slice& data, const IOOptions& options,
|
|
IODebugContext* dbg) override;
|
|
IOStatus Append(const Slice& data, const IOOptions& opts,
|
|
const DataVerificationInfo& /* verification_info */,
|
|
IODebugContext* dbg) override {
|
|
return Append(data, opts, dbg);
|
|
}
|
|
|
|
// Requires that the data is aligned as specified by
|
|
// GetRequiredBufferAlignment()
|
|
IOStatus PositionedAppend(const Slice& data, uint64_t offset,
|
|
const IOOptions& options,
|
|
IODebugContext* dbg) override;
|
|
IOStatus PositionedAppend(const Slice& data, uint64_t offset,
|
|
const IOOptions& opts,
|
|
const DataVerificationInfo& /* verification_info */,
|
|
IODebugContext* dbg) override {
|
|
return PositionedAppend(data, offset, opts, dbg);
|
|
}
|
|
|
|
// Need to implement this so the file is truncated correctly
|
|
// when buffered and unbuffered mode
|
|
IOStatus Truncate(uint64_t size, const IOOptions& options,
|
|
IODebugContext* dbg) override;
|
|
|
|
IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
// write out the cached data to the OS cache
|
|
// This is now taken care of the WritableFileWriter
|
|
IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
bool IsSyncThreadSafe() const override;
|
|
|
|
// Indicates if the class makes use of direct I/O
|
|
// Use PositionedAppend
|
|
bool use_direct_io() const override;
|
|
|
|
size_t GetRequiredBufferAlignment() const override;
|
|
|
|
uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
|
|
IODebugContext* dbg) override;
|
|
|
|
size_t GetUniqueId(char* id, size_t max_size) const override;
|
|
};
|
|
|
|
class WinRandomRWFile : private WinFileData,
|
|
protected WinRandomAccessImpl,
|
|
protected WinWritableImpl,
|
|
public FSRandomRWFile {
|
|
public:
|
|
WinRandomRWFile(const std::string& fname, HANDLE hFile, size_t alignment,
|
|
const FileOptions& options);
|
|
|
|
~WinRandomRWFile() {}
|
|
|
|
// Indicates if the class makes use of direct I/O
|
|
// If false you must pass aligned buffer to Write()
|
|
bool use_direct_io() const override;
|
|
|
|
// Use the returned alignment value to allocate aligned
|
|
// buffer for Write() when use_direct_io() returns true
|
|
size_t GetRequiredBufferAlignment() const override;
|
|
|
|
// Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
|
|
// Pass aligned buffer when use_direct_io() returns true.
|
|
IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options,
|
|
IODebugContext* dbg) override;
|
|
|
|
// Read up to `n` bytes starting from offset `offset` and store them in
|
|
// result, provided `scratch` size should be at least `n`.
|
|
// Returns Status::OK() on success.
|
|
IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
|
|
Slice* result, char* scratch,
|
|
IODebugContext* dbg) const override;
|
|
|
|
IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
|
|
return Sync(options, dbg);
|
|
}
|
|
|
|
IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
|
|
};
|
|
|
|
class WinMemoryMappedBuffer : public MemoryMappedFileBuffer {
|
|
private:
|
|
HANDLE file_handle_;
|
|
HANDLE map_handle_;
|
|
|
|
public:
|
|
WinMemoryMappedBuffer(HANDLE file_handle, HANDLE map_handle, void* base,
|
|
size_t size)
|
|
: MemoryMappedFileBuffer(base, size),
|
|
file_handle_(file_handle),
|
|
map_handle_(map_handle) {}
|
|
~WinMemoryMappedBuffer() override;
|
|
};
|
|
|
|
class WinDirectory : public FSDirectory {
|
|
const std::string filename_;
|
|
HANDLE handle_;
|
|
|
|
public:
|
|
explicit WinDirectory(const std::string& filename, HANDLE h) noexcept
|
|
: filename_(filename), handle_(h) {
|
|
assert(handle_ != INVALID_HANDLE_VALUE);
|
|
}
|
|
~WinDirectory() {
|
|
if (handle_ != NULL) {
|
|
IOStatus s = WinDirectory::Close(IOOptions(), nullptr);
|
|
s.PermitUncheckedError();
|
|
}
|
|
}
|
|
const std::string& GetName() const { return filename_; }
|
|
IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override;
|
|
IOStatus Close(const IOOptions& options, IODebugContext* dbg) override;
|
|
|
|
size_t GetUniqueId(char* id, size_t max_size) const override;
|
|
};
|
|
|
|
class WinFileLock : public FileLock {
|
|
public:
|
|
explicit WinFileLock(HANDLE hFile) : hFile_(hFile) {
|
|
assert(hFile != NULL);
|
|
assert(hFile != INVALID_HANDLE_VALUE);
|
|
}
|
|
|
|
~WinFileLock();
|
|
|
|
private:
|
|
HANDLE hFile_;
|
|
};
|
|
} // namespace port
|
|
} // namespace ROCKSDB_NAMESPACE
|