rocksdb/port/stack_trace.cc
Xingbo Wang 91f227d9be Add injected error log ring buffer for fault injection diagnostics (#14431)
Summary:
Add a circular ring buffer (InjectedErrorLog) to FaultInjectionTestFS that records the last 1000 injected errors. Each entry captures the timestamp, thread ID, FS API name with all arguments (file path, offset, buffer size, first 8 bytes of data in hex), and the injected error status. For example:

  Append("/path/035354.sst", size=4096, head=[1a 2b 3c ...]) -> IOError (Retryable): injected write error
  RenameFile("/path/tmp.sst", "/path/035355.sst") -> IOError: injected metadata write error
  Read(offset=16384, size=4096) -> IOError (Retryable): injected read error

The ring buffer is printed to a file automatically:
- On any fatal signal (SIGABRT, SIGSEGV, SIGTERM, SIGINT, SIGHUP, SIGFPE, SIGBUS, SIGILL, SIGQUIT, SIGXCPU, SIGXFSZ, SIGSYS) via a registered crash callback
- At the end of db_stress main(), for diagnostic visibility even when the test completes normally

This addresses a key debugging gap: when write fault injection causes secondary failures (e.g., the builder error propagation issue in T257612259), the injected errors were previously completely silent with no logging trail. The ring buffer provides the missing diagnostic context to correlate fault injection with downstream failures.

Changes:
- port/stack_trace.h/.cc: Add RegisterCrashCallback() API; extend InstallStackTraceHandler() to catch all catchable termination signals
- utilities/fault_injection_fs.h: Add InjectedErrorLog class with printf-style Record(), HexHead() for data bytes, and signal-safe PrintAll()
- utilities/fault_injection_fs.cc: Record full API arguments and error status at all 31 fault injection call sites
- db_stress_tool/db_stress_tool.cc: Register crash callback and print ring buffer at end of main()

Pull Request resolved: https://github.com/facebook/rocksdb/pull/14431

Reviewed By: hx235

Differential Revision: D95435430

Pulled By: xingbowang

fbshipit-source-id: 6c18e1b072044575d6c8c3f198070127b0f80608
2026-03-11 18:15:04 -07:00

455 lines
14 KiB
C++

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#include "port/stack_trace.h"
#if !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || defined(CYGWIN) || \
defined(OS_SOLARIS) || defined(OS_WIN)
// noop
namespace ROCKSDB_NAMESPACE {
namespace port {
void InstallStackTraceHandler() {}
void PrintStack(int /*first_frames_to_skip*/) {}
void PrintAndFreeStack(void* /*callstack*/, int /*num_frames*/) {}
void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) {
return nullptr;
}
void RegisterCrashCallback(CrashCallback /*callback*/) {}
} // namespace port
} // namespace ROCKSDB_NAMESPACE
#else
#include <cxxabi.h>
#include <execinfo.h>
#include <pthread.h>
#include <unistd.h>
#include <csignal>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#ifdef OS_OPENBSD
#include <sys/sysctl.h>
#include <sys/wait.h>
#endif // OS_OPENBSD
#ifdef OS_FREEBSD
#include <sys/sysctl.h>
#include <sys/wait.h>
#endif // OS_FREEBSD
#ifdef OS_LINUX
#include <sys/prctl.h>
#include <sys/types.h>
#include <sys/wait.h>
#if __GLIBC__ < 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 30)
#include <sys/syscall.h>
#define gettid() syscall(SYS_gettid)
#endif // GLIBC version
#endif // OS_LINUX
#include <algorithm>
#include <atomic>
#include "port/lang.h"
namespace ROCKSDB_NAMESPACE::port {
namespace {
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || \
defined(OS_GNU_KFREEBSD)
const char* GetExecutableName() {
static char name[1024];
#if defined(OS_FREEBSD)
int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
size_t namesz = sizeof(name);
auto ret = sysctl(mib, 4, name, &namesz, nullptr, 0);
if (-1 == ret) {
return nullptr;
} else {
return name;
}
#elif defined(OS_OPENBSD)
int mib[4] = {CTL_KERN, KERN_PROC_ARGS, getpid(), KERN_PROC_ARGV};
size_t namesz = sizeof(name);
char* bin[namesz];
auto ret = sysctl(mib, 4, bin, &namesz, nullptr, 0);
if (-1 == ret) {
return nullptr;
} else {
return bin[0];
}
#else
char link[1024];
snprintf(link, sizeof(link), "/proc/%d/exe", getpid());
auto read = readlink(link, name, sizeof(name) - 1);
if (-1 == read) {
return nullptr;
} else {
name[read] = 0;
return name;
}
#endif
}
void PrintStackTraceLine(const char* symbol, void* frame) {
static const char* executable = GetExecutableName();
if (symbol) {
fprintf(stderr, "%s ", symbol);
}
if (executable) {
// out source to addr2line, for the address translation
const int kLineMax = 256;
char cmd[kLineMax];
snprintf(cmd, kLineMax, "addr2line %p -e %s -f -C 2>&1", frame, executable);
auto f = popen(cmd, "r");
if (f) {
char line[kLineMax];
while (fgets(line, sizeof(line), f)) {
line[strlen(line) - 1] = 0; // remove newline
fprintf(stderr, "%s\t", line);
}
pclose(f);
}
} else {
fprintf(stderr, " %p", frame);
}
fprintf(stderr, "\n");
}
#elif defined(OS_MACOSX)
void PrintStackTraceLine(const char* symbol, void* frame) {
static int pid = getpid();
// out source to atos, for the address translation
const int kLineMax = 256;
char cmd[kLineMax];
snprintf(cmd, kLineMax, "xcrun atos %p -p %d 2>&1", frame, pid);
auto f = popen(cmd, "r");
if (f) {
char line[kLineMax];
while (fgets(line, sizeof(line), f)) {
line[strlen(line) - 1] = 0; // remove newline
fprintf(stderr, "%s\t", line);
}
pclose(f);
} else if (symbol) {
fprintf(stderr, "%s ", symbol);
}
fprintf(stderr, "\n");
}
#endif
const char* GetLldbScriptSelectThread(long long tid) {
// NOTE: called from a signal handler, so no heap allocation
static char script[80];
snprintf(script, sizeof(script),
"script -l python -- lldb.process.SetSelectedThreadByID(%lld)", tid);
return script;
}
} // namespace
void PrintStack(void* frames[], int num_frames) {
auto symbols = backtrace_symbols(frames, num_frames);
for (int i = 0; i < num_frames; ++i) {
fprintf(stderr, "#%-2d ", i);
PrintStackTraceLine((symbols != nullptr) ? symbols[i] : nullptr, frames[i]);
}
free(symbols);
}
void PrintStack(int first_frames_to_skip) {
// Default to getting stack traces with GDB, at least on Linux where we
// know how to attach to a particular thread.
//
// * Address space layout randomization (ASLR) interferes with getting good
// stack information from backtrace+addr2line. This is more likely to show
// up with LIB_MODE=shared builds (when kernel.randomize_va_space >= 1)
// but can also show up with LIB_MODE=static builds ((when
// kernel.randomize_va_space == 2).
// * It doesn't appear easy to detect when ASLR is in use.
// * With DEBUG_LEVEL < 2, backtrace() can skip frames that are not skipped
// in GDB.
//
// LLDB also available as an option
bool lldb_stack_trace = getenv("ROCKSDB_LLDB_STACK") != nullptr;
#if defined(OS_LINUX)
// Default true, override with ROCKSDB_BACKTRACE_STACK=1
bool gdb_stack_trace =
!lldb_stack_trace && getenv("ROCKSDB_BACKTRACE_STACK") == nullptr;
#else
// Default false, override with ROCKSDB_GDB_STACK=1
bool gdb_stack_trace = getenv("ROCKSDB_GDB_STACK") != nullptr;
#endif
// Also support invoking interactive debugger on stack trace, with this
// envvar set to non-empty
char* debug_env = getenv("ROCKSDB_DEBUG");
bool debug = debug_env != nullptr && strlen(debug_env) > 0;
if (!debug && getenv("ROCKSDB_NO_STACK") != nullptr) {
// Skip stack trace
return;
}
if (lldb_stack_trace || gdb_stack_trace || debug) {
// Allow ouside debugger to attach, even with Yama security restrictions
#ifdef PR_SET_PTRACER_ANY
(void)prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
#endif
// Try to invoke GDB, either for stack trace or debugging.
long long attach_pid = getpid();
// NOTE: we're in a signal handler, so no heap allocation
char attach_pid_str[20];
snprintf(attach_pid_str, sizeof(attach_pid_str), "%lld", attach_pid);
// `gdb -p PID` seems to always attach to main thread, but `gdb -p TID`
// seems to be able to attach to a particular thread in a process, which
// makes sense as the main thread TID == PID of the process.
// But I haven't found that gdb capability documented anywhere, so leave
// a back door to attach to main thread.
long long gdb_attach_id = attach_pid;
// Save current thread id before fork
long long attach_tid = 0;
#ifdef OS_LINUX
attach_tid = gettid();
if (getenv("ROCKSDB_DEBUG_USE_PID") == nullptr) {
gdb_attach_id = attach_tid;
}
#endif
char gdb_attach_id_str[20];
snprintf(gdb_attach_id_str, sizeof(gdb_attach_id_str), "%lld",
gdb_attach_id);
pid_t child_pid = fork();
if (child_pid == 0) {
// child process
if (debug) {
if (strcmp(debug_env, "lldb") == 0) {
fprintf(stderr, "Invoking LLDB for debugging (ROCKSDB_DEBUG=%s)...\n",
debug_env);
execlp(/*cmd in PATH*/ "lldb", /*arg0*/ "lldb", "-p", attach_pid_str,
/*"-Q",*/ "-o", GetLldbScriptSelectThread(attach_tid),
(char*)nullptr);
return;
} else {
fprintf(stderr, "Invoking GDB for debugging (ROCKSDB_DEBUG=%s)...\n",
debug_env);
execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-p", gdb_attach_id_str,
(char*)nullptr);
return;
}
} else {
// Redirect child stdout to original stderr
dup2(2, 1);
// No child stdin (don't use pager)
close(0);
if (lldb_stack_trace) {
fprintf(stderr, "Invoking LLDB for stack trace...\n");
// Skip top ~4 frames here in PrintStack
auto bt_in_lldb =
"script -l python -- for f in lldb.thread.frames[4:]: print(f)";
execlp(/*cmd in PATH*/ "lldb", /*arg0*/ "lldb", "-p", attach_pid_str,
"-b", "-Q", "-o", GetLldbScriptSelectThread(attach_tid), "-o",
bt_in_lldb, (char*)nullptr);
} else {
// gdb_stack_trace
fprintf(stderr, "Invoking GDB for stack trace...\n");
// Skip top ~4 frames here in PrintStack
// See https://stackoverflow.com/q/40991943/454544
auto bt_in_gdb =
"frame apply level 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 "
"21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 "
"42 43 44 -q frame";
// -n : Loading config files can apparently cause failures with the
// other options here.
// -batch : non-interactive; suppress banners as much as possible
execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-n", "-batch", "-p",
gdb_attach_id_str, "-ex", bt_in_gdb, (char*)nullptr);
}
return;
}
} else {
// parent process; wait for child
int wstatus;
waitpid(child_pid, &wstatus, 0);
if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == EXIT_SUCCESS) {
// Good
return;
}
}
fprintf(stderr, "GDB failed; falling back on backtrace+addr2line...\n");
}
const int kMaxFrames = 100;
void* frames[kMaxFrames];
int num_frames = (int)backtrace(frames, kMaxFrames);
PrintStack(&frames[first_frames_to_skip], num_frames - first_frames_to_skip);
}
void PrintAndFreeStack(void* callstack, int num_frames) {
PrintStack(static_cast<void**>(callstack), num_frames);
free(callstack);
}
void* SaveStack(int* num_frames, int first_frames_to_skip) {
const int kMaxFrames = 100;
void* frames[kMaxFrames];
int count = (int)backtrace(frames, kMaxFrames);
*num_frames = count - first_frames_to_skip;
void* callstack = malloc(sizeof(void*) * *num_frames);
memcpy(callstack, &frames[first_frames_to_skip], sizeof(void*) * *num_frames);
return callstack;
}
static std::atomic<uint64_t> g_thread_handling_stack_trace{0};
static int g_recursion_count = 0;
static std::atomic<bool> g_at_exit_called{false};
static std::atomic<CrashCallback> g_crash_callback{nullptr};
static void StackTraceHandler(int sig) {
fprintf(stderr, "Received signal %d (%s)\n", sig, strsignal(sig));
// Crude recursive mutex with no signal-unsafe system calls, to avoid
// re-entrance from multiple threads and avoid core dumping while trying
// to print the stack trace.
uint64_t tid = 0;
{
const auto ptid = pthread_self();
// pthread_t is an opaque type
memcpy(&tid, &ptid, std::min(sizeof(tid), sizeof(ptid)));
// Essentially ensure non-zero
++tid;
}
for (;;) {
uint64_t expected = 0;
if (g_thread_handling_stack_trace.compare_exchange_strong(expected, tid)) {
// Acquired mutex
g_recursion_count = 0;
break;
}
if (expected == tid) {
++g_recursion_count;
fprintf(stderr, "Recursive call to stack trace handler (%d)\n",
g_recursion_count);
break;
}
// Sleep before trying again
usleep(1000);
}
if (g_recursion_count > 2) {
// Give up after too many recursions
fprintf(stderr, "Too many recursive calls to stack trace handler (%d)\n",
g_recursion_count);
} else {
if (g_at_exit_called.load(std::memory_order_acquire)) {
fprintf(stderr, "In a race with process already exiting...\n");
}
// Invoke registered crash callback before printing stack trace
auto callback = g_crash_callback.load(std::memory_order_acquire);
if (callback) {
callback();
}
// skip the top three signal handler related frames
PrintStack(3);
// Efforts to fix or suppress TSAN warnings "signal-unsafe call inside of
// a signal" have failed, so just warn the user about them.
#ifdef __SANITIZE_THREAD__
fprintf(stderr,
"==> NOTE: any above warnings about \"signal-unsafe call\" are\n"
"==> ignorable, as they are expected when generating a stack\n"
"==> trace because of a signal under TSAN. Consider why the\n"
"==> signal was generated to begin with, and the stack trace\n"
"==> in the TSAN warning can be useful for that. (The stack\n"
"==> trace printed by the signal handler is likely obscured\n"
"==> by TSAN output.)\n");
#endif
}
// reset to default handler
signal(sig, SIG_DFL);
// re-signal to default handler (so we still get core dump if needed...)
raise(sig);
// release the mutex, in case this is somehow recoverable
if (g_recursion_count > 0) {
--g_recursion_count;
} else {
g_thread_handling_stack_trace.store(0, std::memory_order_release);
}
}
static void AtExit() {
// wait for stack trace handler to finish, if needed
while (g_thread_handling_stack_trace.load(std::memory_order_acquire)) {
usleep(1000);
}
g_at_exit_called.store(true, std::memory_order_release);
}
// Lightweight handler for graceful termination signals (SIGTERM, SIGINT).
// Prints the crash callback (e.g., ring buffer) but skips the
// expensive GDB/LLDB stack trace, since these are intentional terminations.
static void TerminationHandler(int sig) {
char buf[64];
int len = snprintf(buf, sizeof(buf), "Received signal %d (%s)\n", sig,
strsignal(sig));
if (len > 0) {
auto unused __attribute__((unused)) = write(STDOUT_FILENO, buf, len);
}
auto callback = g_crash_callback.load(std::memory_order_acquire);
if (callback) {
callback();
}
signal(sig, SIG_DFL);
raise(sig);
}
void RegisterCrashCallback(CrashCallback callback) {
g_crash_callback.store(callback, std::memory_order_release);
}
void InstallStackTraceHandler() {
// just use the plain old signal as it's simple and sufficient
// for this use case
// Crash signals — invoke full stack trace + ring buffer
signal(SIGILL, StackTraceHandler);
signal(SIGSEGV, StackTraceHandler);
signal(SIGBUS, StackTraceHandler);
signal(SIGABRT, StackTraceHandler);
signal(SIGFPE, StackTraceHandler);
signal(SIGQUIT, StackTraceHandler);
// Termination signals — print ring buffer only, no stack trace
signal(SIGTERM, TerminationHandler);
signal(SIGINT, TerminationHandler);
atexit(AtExit);
// Allow ouside debugger to attach, even with Yama security restrictions.
// This is needed even outside of PrintStack() so that external mechanisms
// can dump stacks if they suspect that a test has hung.
#ifdef PR_SET_PTRACER_ANY
(void)prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
#endif
}
} // namespace ROCKSDB_NAMESPACE::port
#endif