fix: TSAN data-race fixes and sanitizer CI configuration

- Fix TSAN data races in CoroTaskRunner (atomic operations, mutex guards)
- Enable TSAN CI builds with proper ucontext support
- Add TSAN suppressions for pre-existing rippled issues
- Remove -fno-pie flags for sanitizer compatibility

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-03-25 14:15:49 +00:00
parent b53df32334
commit 01fe13dd54
9 changed files with 182 additions and 130 deletions

View File

@@ -235,11 +235,16 @@ def generate_strategy_matrix(all: bool, config: Config) -> list:
# so that they are easier to identify in the GitHub Actions UI, as long
# names get truncated.
# Add Address and Thread (both coupled with UB) sanitizers for specific bookworm distros.
# GCC-Asan rippled-embedded tests are failing because of https://github.com/google/sanitizers/issues/856
if (
os["distro_version"] == "bookworm"
and f"{os['compiler_name']}-{os['compiler_version']}" == "clang-20"
):
# Note: GCC ASAN's detect_stack_use_after_return produces false positives with
# Boost.Context fibers (boost::asio::spawn). Mitigated in reusable-build-test-config.yml
# by setting detect_stack_use_after_return=0 for GCC.
# See: https://github.com/google/sanitizers/issues/856
if os[
"distro_version"
] == "bookworm" and f"{os['compiler_name']}-{os['compiler_version']}" in [
"gcc-13",
"clang-17",
]:
# Add ASAN + UBSAN configuration.
configurations.append(
{
@@ -253,19 +258,19 @@ def generate_strategy_matrix(all: bool, config: Config) -> list:
"sanitizers": "address,undefinedbehavior",
}
)
# TSAN is deactivated due to seg faults with latest compilers.
activate_tsan = False
# TSAN is activated on gcc-13 and clang-17.
activate_tsan = True
if activate_tsan:
configurations.append(
{
"config_name": config_name + "-tsan-ubsan",
"config_name": config_name + "-tsan",
"cmake_args": cmake_args,
"cmake_target": cmake_target,
"build_only": build_only,
"build_type": build_type,
"os": os,
"architecture": architecture,
"sanitizers": "thread,undefinedbehavior",
"sanitizers": "thread",
}
)
else:

View File

@@ -231,8 +231,11 @@ jobs:
CONFIG_NAME: ${{ inputs.config_name }}
run: |
ASAN_OPTS="include=${GITHUB_WORKSPACE}/sanitizers/suppressions/runtime-asan-options.txt:suppressions=${GITHUB_WORKSPACE}/sanitizers/suppressions/asan.supp"
# GCC ASAN's detect_stack_use_after_return produces false positives with
# Boost.Context fiber stack switching (used by boost::asio::spawn).
# See: https://github.com/google/sanitizers/issues/856
if [[ "${CONFIG_NAME}" == *gcc* ]]; then
ASAN_OPTS="${ASAN_OPTS}:alloc_dealloc_mismatch=0"
ASAN_OPTS="${ASAN_OPTS}:alloc_dealloc_mismatch=0:detect_stack_use_after_return=0"
fi
echo "ASAN_OPTIONS=${ASAN_OPTS}" >> ${GITHUB_ENV}
echo "TSAN_OPTIONS=include=${GITHUB_WORKSPACE}/sanitizers/suppressions/runtime-tsan-options.txt:suppressions=${GITHUB_WORKSPACE}/sanitizers/suppressions/tsan.supp" >> ${GITHUB_ENV}
@@ -242,10 +245,11 @@ jobs:
- name: Run the separate tests
if: ${{ !inputs.build_only }}
working-directory: ${{ env.BUILD_DIR }}
# Windows locks some of the build files while running tests, and parallel jobs can collide
# Windows locks some of the build files while running tests, and parallel jobs can collide.
# Sanitizer builds use single-threaded execution to reduce memory pressure.
env:
BUILD_TYPE: ${{ inputs.build_type }}
PARALLELISM: ${{ runner.os == 'Windows' && '1' || steps.nproc.outputs.nproc }}
PARALLELISM: ${{ env.SANITIZERS_ENABLED == 'true' && '1' || (runner.os == 'Windows' && '1' || steps.nproc.outputs.nproc) }}
run: |
ctest \
--output-on-failure \
@@ -256,12 +260,21 @@ jobs:
if: ${{ !inputs.build_only }}
working-directory: ${{ runner.os == 'Windows' && format('{0}/{1}', env.BUILD_DIR, inputs.build_type) || env.BUILD_DIR }}
env:
BUILD_NPROC: ${{ steps.nproc.outputs.nproc }}
# Coverage builds are slower due to instrumentation; use fewer parallel jobs to avoid flakiness
NPROC: ${{ steps.nproc.outputs.nproc }}
COVERAGE_ON: ${{ env.COVERAGE_ENABLED }}
SANITIZERS_ON: ${{ env.SANITIZERS_ENABLED }}
run: |
set -o pipefail
# Coverage builds are slower due to instrumentation; use fewer parallel jobs to avoid flakiness
[ "$COVERAGE_ENABLED" = "true" ] && BUILD_NPROC=$(( BUILD_NPROC - 2 ))
./xrpld --unittest --unittest-jobs "${BUILD_NPROC}" 2>&1 | tee unittest.log
if [ "${COVERAGE_ON}" = "true" ]; then
PARALLELISM=$(( NPROC > 2 ? NPROC - 2 : 1 ))
else
PARALLELISM="${NPROC}"
fi
if [ "${SANITIZERS_ON}" = "true" ]; then
PARALLELISM=1
fi
./xrpld --unittest --unittest-jobs "${PARALLELISM}" 2>&1 | tee unittest.log
- name: Show test failure summary
if: ${{ failure() && !inputs.build_only }}

View File

@@ -39,16 +39,21 @@ if(Boost_COMPILER)
target_link_libraries(xrpl_boost INTERFACE Boost::disable_autolinking)
endif()
# Boost.Context's ucontext backend has ASAN fiber-switching annotations
# (start/finish_switch_fiber) that are compiled in when BOOST_USE_ASAN is defined.
# This tells ASAN about fiber stack switches used by boost::asio::spawn,
# preventing false positive stack-use-after-scope errors.
# Boost.Context's ucontext backend has sanitizer fiber-switching annotations
# that are compiled in when BOOST_USE_ASAN/BOOST_USE_TSAN is defined.
# This tells the sanitizer about fiber stack switches used by boost::asio::spawn,
# preventing false positive errors.
# BOOST_USE_UCONTEXT ensures the ucontext backend is selected (fcontext does
# not support ASAN annotations).
# not support sanitizer annotations).
# These defines must match what Boost was compiled with (see conan/profiles/sanitizers).
if(enable_asan)
target_compile_definitions(
xrpl_boost
INTERFACE BOOST_USE_ASAN BOOST_USE_UCONTEXT
)
elseif(enable_tsan)
target_compile_definitions(
xrpl_boost
INTERFACE BOOST_USE_TSAN BOOST_USE_UCONTEXT
)
endif()

View File

@@ -82,5 +82,12 @@ tools.info.package_id:confs+=["tools.build:cxxflags", "tools.build:exelinkflags"
boost/*:without_context=False
# Boost stacktrace fails to build with some sanitizers
boost/*:without_stacktrace=True
{% elif "thread" in sanitizers %}
# Build Boost.Context with ucontext backend (not fcontext) so that
# TSAN can track fiber/context switches made by boost::asio::spawn.
# fcontext uses raw assembly that TSAN cannot instrument.
boost/*:extra_b2_flags=context-impl=ucontext thread-sanitizer=on define=BOOST_USE_TSAN=1
boost/*:without_context=False
boost/*:without_stacktrace=True
{% endif %}
{% endif %}

View File

@@ -98,6 +98,7 @@ words:
- doxyfile
- dxrpl
- endmacro
- eventfd
- exceptioned
- Falco
- fcontext
@@ -268,6 +269,7 @@ words:
- takerpays
- ters
- TMEndpointv2
- TOCTOU
- trixie
- tx
- txid

View File

@@ -135,7 +135,9 @@ inline JobQueue::CoroTaskRunner::~CoroTaskRunner()
{
#ifndef NDEBUG
join();
XRPL_ASSERT(finished_, "xrpl::JobQueue::CoroTaskRunner::~CoroTaskRunner : is finished");
XRPL_ASSERT(
finished_.load(std::memory_order_acquire),
"xrpl::JobQueue::CoroTaskRunner::~CoroTaskRunner : is finished");
#endif
}
@@ -307,7 +309,7 @@ JobQueue::CoroTaskRunner::resume()
detail::getLocalValues().reset(saved);
if (task_.done())
{
finished_ = true;
finished_.store(true, std::memory_order_release);
// Break the shared_ptr cycle: frame -> shared_ptr<runner> -> this.
// Use std::move (not task_ = {}) so task_.handle_ is null BEFORE the
// frame is destroyed. operator= would destroy the frame while handle_
@@ -321,14 +323,16 @@ JobQueue::CoroTaskRunner::resume()
}
/**
* @return true if the coroutine has not yet run to completion
* @return true if the coroutine has not yet run to completion.
*
* Uses the atomic finished_ flag instead of reading task_ directly,
* because task_ is modified in resume() under mutex_ and reading it
* here without a lock would be a data race visible to TSAN.
*/
inline bool
JobQueue::CoroTaskRunner::runnable() const
{
// After normal completion, task_ is reset to break the shared_ptr cycle
// (handle_ becomes null). A null handle means the coroutine is done.
return task_.handle() && !task_.done();
return !finished_.load(std::memory_order_acquire);
}
/**
@@ -339,11 +343,11 @@ JobQueue::CoroTaskRunner::runnable() const
inline void
JobQueue::CoroTaskRunner::expectEarlyExit()
{
if (!finished_)
if (!finished_.load(std::memory_order_acquire))
{
std::lock_guard lock(jq_.m_mutex);
--jq_.nSuspend_;
finished_ = true;
finished_.store(true, std::memory_order_release);
}
// Break the shared_ptr cycle: frame -> shared_ptr<runner> -> this.
// The coroutine is at initial_suspend and never ran user code, so
@@ -368,7 +372,7 @@ inline void
JobQueue::CoroTaskRunner::join()
{
std::unique_lock<std::mutex> lk(mutex_run_);
cv_.wait(lk, [this]() { return runCount_ == 0 || finished_; });
cv_.wait(lk, [this]() { return runCount_ == 0 || finished_.load(std::memory_order_acquire); });
}
} // namespace xrpl

View File

@@ -307,7 +307,10 @@ public:
// called. Asserted in the destructor (debug) to catch leaked
// runners. Available in all builds to guard expectEarlyExit()
// against double-decrementing nSuspend_.
bool finished_ = false;
// Atomic to allow lock-free reads from runnable(), join(), and
// the destructor without requiring the same mutex that guards
// the write in resume().
std::atomic<bool> finished_{false};
public:
/**
@@ -693,15 +696,14 @@ template <class F>
std::shared_ptr<JobQueue::CoroTaskRunner>
JobQueue::postCoroTask(JobType t, std::string const& name, F&& f)
{
// Reject if the JQ is shutting down — matches addJob()'s stopping_ check.
// Must check before incrementing nSuspend_ to avoid leaving an orphan
// count that would cause stop() to hang.
if (stopping_)
return nullptr;
// Account for the initial suspension (CoroTask uses lazy start).
// Reject if the JQ is shutting down and atomically increment
// nSuspend_ under the same lock. Without the lock, a TOCTOU race
// exists: stopping_ could become true between the check and the
// increment, leaving an orphan nSuspend_ that causes stop() to hang.
{
std::lock_guard lock(m_mutex);
if (stopping_)
return nullptr;
++nSuspend_;
}

View File

@@ -1,102 +1,79 @@
# The idea is to empty this file gradually by fixing the underlying issues and removing suppresions.
# TSAN suppression file for rippled.
# Only suppress issues in third-party libraries and TSAN's own instrumentation.
# Races in rippled's own code should be fixed, not suppressed.
# Suppress race in Boost ASIO scheduler detected by GCC-15
# This is a false positive in Boost's internal pipe() synchronization
# Boost ASIO / Boost Context false positives
# These are internal to Boost's reactor, fiber context switching, and memory management.
race:boost/asio/
race:boost/context/
race:boost/asio/executor.hpp
race:boost::asio
# Suppress tsan related issues in rippled code.
race:src/libxrpl/basics/make_SSLContext.cpp
race:src/libxrpl/basics/Number.cpp
race:src/libxrpl/json/json_value.cpp
race:src/libxrpl/json/to_string.cpp
race:src/libxrpl/ledger/OpenView.cpp
race:src/libxrpl/net/HTTPClient.cpp
race:src/libxrpl/nodestore/backend/NuDBFactory.cpp
race:src/libxrpl/protocol/InnerObjectFormats.cpp
race:src/libxrpl/protocol/STParsedJSON.cpp
race:src/libxrpl/resource/ResourceManager.cpp
race:src/test/app/Flow_test.cpp
race:src/test/app/LedgerReplay_test.cpp
race:src/test/app/NFToken_test.cpp
race:src/test/app/Offer_test.cpp
race:src/test/app/ValidatorSite_test.cpp
race:src/test/consensus/NegativeUNL_test.cpp
race:src/test/jtx/impl/Env.cpp
race:src/test/jtx/impl/JSONRPCClient.cpp
race:src/test/jtx/impl/pay.cpp
race:src/test/jtx/impl/token.cpp
race:src/test/rpc/Book_test.cpp
race:src/xrpld/app/ledger/detail/InboundTransactions.cpp
race:src/xrpld/app/main/Application.cpp
race:src/xrpld/app/main/BasicApp.cpp
race:src/xrpld/app/main/GRPCServer.cpp
race:src/xrpld/app/misc/detail/AmendmentTable.cpp
race:src/xrpld/app/misc/FeeVoteImpl.cpp
race:src/xrpld/app/rdb/detail/Wallet.cpp
race:src/xrpld/overlay/detail/OverlayImpl.cpp
race:src/xrpld/peerfinder/detail/PeerfinderManager.cpp
race:src/xrpld/peerfinder/detail/SourceStrings.cpp
race:src/xrpld/rpc/detail/ServerHandler.cpp
race:xrpl/server/detail/Door.h
race:xrpl/server/detail/Spawn.h
race:xrpl/server/detail/ServerImpl.h
race:xrpl/nodestore/detail/DatabaseNodeImp.h
race:src/libxrpl/beast/utility/beast_Journal.cpp
race:src/test/beast/LexicalCast_test.cpp
race:ripple::ServerHandler
# More suppressions in external library code.
race:crtstuff.c
race:pipe
# Deadlock / lock-order-inversion suppressions
# Note: GCC's TSAN may not fully support all deadlock suppression patterns
deadlock:src/libxrpl/beast/utility/beast_Journal.cpp
deadlock:src/libxrpl/beast/utility/beast_PropertyStream.cpp
deadlock:src/test/beast/beast_PropertyStream_test.cpp
deadlock:src/xrpld/core/detail/Workers.cpp
deadlock:src/xrpld/app/misc/detail/Manifest.cpp
deadlock:src/xrpld/app/misc/detail/ValidatorList.cpp
deadlock:src/xrpld/app/misc/detail/ValidatorSite.cpp
signal:src/libxrpl/beast/utility/beast_Journal.cpp
signal:src/xrpld/core/detail/Workers.cpp
signal:src/xrpld/core/JobQueue.cpp
signal:ripple::Workers::Worker
# Aggressive suppressing of deadlock tsan errors
deadlock:pthread_create
deadlock:pthread_rwlock_rdlock
deadlock:boost::asio
# Suppress SEGV crashes in TSAN itself during stringbuf operations
# This appears to be a GCC-15 TSAN instrumentation issue with basic_stringbuf::str()
# Commonly triggered in beast::Journal::ScopedStream destructor
signal:std::__cxx11::basic_stringbuf
signal:basic_stringbuf
signal:basic_ostringstream
called_from_lib:libclang_rt
race:ostreambuf_iterator
race:basic_ostream
# Suppress SEGV in Boost ASIO memory allocation with GCC-15 TSAN
signal:boost::asio::aligned_new
signal:boost::asio::detail::memory
# Suppress SEGV in execute_native_thread_routine
signal:execute_native_thread_routine
# Suppress data race in Boost Context fiber management
# This is a false positive in Boost's exception state management during fiber context switching
race:__cxxabiv1::manage_exception_state
race:boost::context::fiber::resume
race:boost::asio::detail::spawned_fiber_thread
race:boost::asio::detail::spawned_fiber_thread::suspend_with
race:boost::asio::detail::spawned_fiber_thread::destroy
# Suppress data race in __tsan_memcpy called from Boost fiber operations
race:__cxxabiv1::manage_exception_state
race:__tsan_memcpy
# TSAN's own syscall interceptors and libc internals
# These are false positives from TSAN's fd tracking in glibc.
# TODO: Tighten these suppressions once TSAN runs cleanly — they may mask
# real fd-lifetime bugs in rippled's networking code.
race:crtstuff.c
race:pipe
race:epoll_ctl
race:epoll_create
race:closedir
race:__close_nocancel
race:__GI___close
race:__socket
race:__GI___accept4
race:eventfd
# C++ standard library internals
race:ostreambuf_iterator
race:basic_ostream
called_from_lib:libclang_rt
# Deadlock false positives in Boost and threading primitives
deadlock:pthread_create
deadlock:pthread_rwlock_rdlock
deadlock:boost::asio
# Pre-existing rippled issues (suppress until fixed, tracked separately)
# TODO(RIPD-XXXX): beast::Journal::Sink::threshold is not thread-safe.
# The severity field is read/written from multiple threads without synchronization.
race:beast::Journal::Sink::threshold
# TODO(RIPD-XXXX): LedgerReplayTask/LedgerDeltaAcquire lock-order-inversion.
# LedgerDeltaAcquire::notify() calls LedgerReplayTask::deltaReady() while
# holding its own mutex, but deltaReady() acquires LedgerReplayTask's mutex,
# creating an A->B / B->A lock ordering cycle.
deadlock:xrpl::LedgerDeltaAcquire
deadlock:xrpl::LedgerReplayTask
# TODO(RIPD-XXXX): SHAMap copy constructor / setLedgerSeq data race.
race:xrpl::SHAMap::SHAMap
race:xrpl::SHAMap::setLedgerSeq
# TODO(RIPD-XXXX): RCLConsensus / NetworkOPsImp lock-order-inversion.
# doLedgerAccept() acquires NetworkOPsImp mutex then RCLConsensus::startRound()
# acquires consensus mutex (M0→M1), while RCLConsensus::Adaptor::doAccept()
# acquires them in reverse order via std::lock (M1→M0). The std::lock path is
# actually safe (deadlock-free), but TSAN flags the inconsistent ordering.
deadlock:xrpl::RCLConsensus
deadlock:xrpl::doLedgerAccept
# TODO(RIPD-XXXX): beast::PropertyStream::Source lock-order-inversion.
# find_one_deep() acquires parent's recursive_mutex then recursively acquires
# children's mutexes, while find_one() can acquire them in reverse order.
# This is a pre-existing issue in beast's PropertyStream tree traversal.
deadlock:beast::PropertyStream::Source
# Signal/crash suppressions for GCC TSAN instrumentation issues
signal:std::__cxx11::basic_stringbuf
signal:basic_stringbuf
signal:basic_ostringstream
signal:boost::asio::aligned_new
signal:boost::asio::detail::memory
signal:execute_native_thread_routine

View File

@@ -44,6 +44,7 @@ namespace test {
* testValueException | CoroTask<T> exception via co_await
* testValueChaining | nested CoroTask<T> -> CoroTask<T>
* testShutdownRejection | postCoroTask returns nullptr when stopping
* testExpectEarlyExit | expectEarlyExit() with finished_ == false
*/
class CoroTask_test : public beast::unit_test::suite
{
@@ -514,6 +515,41 @@ public:
BEAST_EXPECT(!runner);
}
/**
* Exercises expectEarlyExit() when the coroutine has never run
* (finished_ is false). This covers the if-body that decrements
* nSuspend_ and sets finished_ = true.
*/
void
testExpectEarlyExit()
{
using namespace jtx;
testcase("expectEarlyExit with unfinished coroutine");
Env env(*this, envconfig([](std::unique_ptr<Config> cfg) {
cfg->FORCE_MULTI_THREAD = true;
return cfg;
}));
auto& jq = env.app().getJobQueue();
// Create a runner directly (bypassing postCoroTask) so we can
// control the lifecycle and exercise the early-exit path.
auto runner = std::make_shared<JobQueue::CoroTaskRunner>(
JobQueue::CoroTaskRunner::create_t{}, jq, jtCLIENT, "TestEarlyExit");
runner->init([](auto) -> CoroTask<void> { co_return; });
// Simulate the nSuspend_ increment that postCoroTask normally does.
runner->onSuspend();
// expectEarlyExit: finished_ is false, so the if-body runs
// (decrements nSuspend_, sets finished_ = true, destroys frame).
runner->expectEarlyExit();
BEAST_EXPECT(!runner->runnable());
}
void
run() override
{
@@ -528,6 +564,7 @@ public:
testValueException();
testValueChaining();
testShutdownRejection();
testExpectEarlyExit();
}
};