fix: TSAN data-race fixes and sanitizer CI configuration

- Fix TSAN data races in CoroTaskRunner (atomic operations, mutex guards)
- Enable TSAN CI builds with proper ucontext support
- Add TSAN suppressions for pre-existing rippled issues
- Remove -fno-pie flags for sanitizer compatibility

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Pratik Mankawde
2026-03-25 14:15:49 +00:00
parent b53df32334
commit 01fe13dd54
9 changed files with 182 additions and 130 deletions

View File

@@ -135,7 +135,9 @@ inline JobQueue::CoroTaskRunner::~CoroTaskRunner()
{
#ifndef NDEBUG
join();
XRPL_ASSERT(finished_, "xrpl::JobQueue::CoroTaskRunner::~CoroTaskRunner : is finished");
XRPL_ASSERT(
finished_.load(std::memory_order_acquire),
"xrpl::JobQueue::CoroTaskRunner::~CoroTaskRunner : is finished");
#endif
}
@@ -307,7 +309,7 @@ JobQueue::CoroTaskRunner::resume()
detail::getLocalValues().reset(saved);
if (task_.done())
{
finished_ = true;
finished_.store(true, std::memory_order_release);
// Break the shared_ptr cycle: frame -> shared_ptr<runner> -> this.
// Use std::move (not task_ = {}) so task_.handle_ is null BEFORE the
// frame is destroyed. operator= would destroy the frame while handle_
@@ -321,14 +323,16 @@ JobQueue::CoroTaskRunner::resume()
}
/**
* @return true if the coroutine has not yet run to completion
* @return true if the coroutine has not yet run to completion.
*
* Reads the atomic finished_ flag instead of inspecting task_ directly:
* task_ is modified in resume() under mutex_, so reading it here
* without that lock would be a data race (which TSAN reports).
*/
inline bool
JobQueue::CoroTaskRunner::runnable() const
{
// After normal completion, task_ is reset to break the shared_ptr cycle
// (handle_ becomes null). A null handle means the coroutine is done.
return task_.handle() && !task_.done();
return !finished_.load(std::memory_order_acquire);
}
/**
@@ -339,11 +343,11 @@ JobQueue::CoroTaskRunner::runnable() const
inline void
JobQueue::CoroTaskRunner::expectEarlyExit()
{
if (!finished_)
if (!finished_.load(std::memory_order_acquire))
{
std::lock_guard lock(jq_.m_mutex);
--jq_.nSuspend_;
finished_ = true;
finished_.store(true, std::memory_order_release);
}
// Break the shared_ptr cycle: frame -> shared_ptr<runner> -> this.
// The coroutine is at initial_suspend and never ran user code, so
@@ -368,7 +372,7 @@ inline void
JobQueue::CoroTaskRunner::join()
{
std::unique_lock<std::mutex> lk(mutex_run_);
cv_.wait(lk, [this]() { return runCount_ == 0 || finished_; });
cv_.wait(lk, [this]() { return runCount_ == 0 || finished_.load(std::memory_order_acquire); });
}
} // namespace xrpl

View File

@@ -307,7 +307,10 @@ public:
// called. Asserted in the destructor (debug) to catch leaked
// runners. Available in all builds to guard expectEarlyExit()
// against double-decrementing nSuspend_.
bool finished_ = false;
// Atomic so that runnable(), expectEarlyExit(), join(), and the
// destructor can read it without holding the same mutex that guards
// the write in resume().
std::atomic<bool> finished_{false};
public:
/**
@@ -693,15 +696,14 @@ template <class F>
std::shared_ptr<JobQueue::CoroTaskRunner>
JobQueue::postCoroTask(JobType t, std::string const& name, F&& f)
{
// Reject if the JQ is shutting down — matches addJob()'s stopping_ check.
// Must check before incrementing nSuspend_ to avoid leaving an orphan
// count that would cause stop() to hang.
if (stopping_)
return nullptr;
// Account for the initial suspension (CoroTask uses lazy start).
// Reject if the JQ is shutting down and atomically increment
// nSuspend_ under the same lock. Without the lock, a TOCTOU race
// exists: stopping_ could become true between the check and the
// increment, leaving an orphan nSuspend_ that causes stop() to hang.
{
std::lock_guard lock(m_mutex);
if (stopping_)
return nullptr;
++nSuspend_;
}