mirror of
https://github.com/XRPLF/rippled.git
synced 2026-04-29 15:37:57 +00:00
fix: TSAN data-race fixes and sanitizer CI configuration
- Fix TSAN data races in CoroTaskRunner (atomic operations, mutex guards)
- Enable TSAN CI builds with proper ucontext support
- Add TSAN suppressions for pre-existing rippled issues
- Remove -fno-pie flags for sanitizer compatibility

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -135,7 +135,9 @@ inline JobQueue::CoroTaskRunner::~CoroTaskRunner()
|
||||
{
|
||||
#ifndef NDEBUG
|
||||
join();
|
||||
XRPL_ASSERT(finished_, "xrpl::JobQueue::CoroTaskRunner::~CoroTaskRunner : is finished");
|
||||
XRPL_ASSERT(
|
||||
finished_.load(std::memory_order_acquire),
|
||||
"xrpl::JobQueue::CoroTaskRunner::~CoroTaskRunner : is finished");
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -307,7 +309,7 @@ JobQueue::CoroTaskRunner::resume()
|
||||
detail::getLocalValues().reset(saved);
|
||||
if (task_.done())
|
||||
{
|
||||
finished_ = true;
|
||||
finished_.store(true, std::memory_order_release);
|
||||
// Break the shared_ptr cycle: frame -> shared_ptr<runner> -> this.
|
||||
// Use std::move (not task_ = {}) so task_.handle_ is null BEFORE the
|
||||
// frame is destroyed. operator= would destroy the frame while handle_
|
||||
@@ -321,14 +323,16 @@ JobQueue::CoroTaskRunner::resume()
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if the coroutine has not yet run to completion
|
||||
* @return true if the coroutine has not yet run to completion.
|
||||
*
|
||||
* Uses the atomic finished_ flag instead of reading task_ directly,
|
||||
* because task_ is modified in resume() under mutex_ and reading it
|
||||
* here without a lock would be a data race visible to TSAN.
|
||||
*/
|
||||
inline bool
|
||||
JobQueue::CoroTaskRunner::runnable() const
|
||||
{
|
||||
// After normal completion, task_ is reset to break the shared_ptr cycle
|
||||
// (handle_ becomes null). A null handle means the coroutine is done.
|
||||
return task_.handle() && !task_.done();
|
||||
return !finished_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -339,11 +343,11 @@ JobQueue::CoroTaskRunner::runnable() const
|
||||
inline void
|
||||
JobQueue::CoroTaskRunner::expectEarlyExit()
|
||||
{
|
||||
if (!finished_)
|
||||
if (!finished_.load(std::memory_order_acquire))
|
||||
{
|
||||
std::lock_guard lock(jq_.m_mutex);
|
||||
--jq_.nSuspend_;
|
||||
finished_ = true;
|
||||
finished_.store(true, std::memory_order_release);
|
||||
}
|
||||
// Break the shared_ptr cycle: frame -> shared_ptr<runner> -> this.
|
||||
// The coroutine is at initial_suspend and never ran user code, so
|
||||
@@ -368,7 +372,7 @@ inline void
|
||||
JobQueue::CoroTaskRunner::join()
|
||||
{
|
||||
std::unique_lock<std::mutex> lk(mutex_run_);
|
||||
cv_.wait(lk, [this]() { return runCount_ == 0 || finished_; });
|
||||
cv_.wait(lk, [this]() { return runCount_ == 0 || finished_.load(std::memory_order_acquire); });
|
||||
}
|
||||
|
||||
} // namespace xrpl
|
||||
|
||||
@@ -307,7 +307,10 @@ public:
|
||||
// called. Asserted in the destructor (debug) to catch leaked
|
||||
// runners. Available in all builds to guard expectEarlyExit()
|
||||
// against double-decrementing nSuspend_.
|
||||
bool finished_ = false;
|
||||
// Atomic to allow lock-free reads from runnable(), join(), and
|
||||
// the destructor without requiring the same mutex that guards
|
||||
// the write in resume().
|
||||
std::atomic<bool> finished_{false};
|
||||
|
||||
public:
|
||||
/**
|
||||
@@ -693,15 +696,14 @@ template <class F>
|
||||
std::shared_ptr<JobQueue::CoroTaskRunner>
|
||||
JobQueue::postCoroTask(JobType t, std::string const& name, F&& f)
|
||||
{
|
||||
// Reject if the JQ is shutting down — matches addJob()'s stopping_ check.
|
||||
// Must check before incrementing nSuspend_ to avoid leaving an orphan
|
||||
// count that would cause stop() to hang.
|
||||
if (stopping_)
|
||||
return nullptr;
|
||||
|
||||
// Account for the initial suspension (CoroTask uses lazy start).
|
||||
// Reject if the JQ is shutting down and atomically increment
|
||||
// nSuspend_ under the same lock. Without the lock, a TOCTOU race
|
||||
// exists: stopping_ could become true between the check and the
|
||||
// increment, leaving an orphan nSuspend_ that causes stop() to hang.
|
||||
{
|
||||
std::lock_guard lock(m_mutex);
|
||||
if (stopping_)
|
||||
return nullptr;
|
||||
++nSuspend_;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user