Eliminate SHAMapInnerNode lock contention:

The `SHAMapInnerNode` class had a single global mutex protecting
every node's array of children. Profiling suggested that around 4%
of all attempts to acquire that mutex would block.

This commit removes the global mutex and replaces it with a new
per-node 16-way spinlock (implemented so as not to affect the size
of an inner node object), effectively eliminating the lock
contention.
Author: Nik Bougalis
Date:   2021-11-20 00:02:09 -08:00
Commit: 1b9387eddc
Parent: 34ca457132

4 changed files with 103 additions and 16 deletions
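The heart of the change is packing sixteen one-bit spinlocks into a single
`std::atomic<std::uint16_t>`, so that threads touching different child
branches (or different nodes) never contend. Before reading the diff, here
is a minimal, self-contained sketch of that idea; it is illustrative only,
not the committed code (the real class, `SpinBitlock`, appears in the diff
below):

    #include <atomic>
    #include <cassert>
    #include <cstdint>
    #include <thread>

    // Sixteen one-bit spinlocks packed into one 16-bit atomic: locking
    // bit `i` contends only with other users of bit `i`.
    class BitSpinlock
    {
        std::atomic<std::uint16_t>& bits_;
        std::uint16_t const mask_;

    public:
        BitSpinlock(std::atomic<std::uint16_t>& bits, int index)
            : bits_(bits), mask_(static_cast<std::uint16_t>(1u << index))
        {
            assert(index >= 0 && index < 16);
        }

        void
        lock()
        {
            // Loop until fetch_or observes our bit clear, meaning that
            // this thread is the one that set it.
            while (bits_.fetch_or(mask_, std::memory_order_acquire) & mask_)
                std::this_thread::yield();
        }

        void
        unlock()
        {
            bits_.fetch_and(
                static_cast<std::uint16_t>(~mask_),
                std::memory_order_release);
        }
    };

    int main()
    {
        std::atomic<std::uint16_t> locks{0};
        long counts[16] = {};

        // Threads working on different "branches" proceed in parallel.
        std::thread t1([&] {
            for (int i = 0; i < 100000; ++i)
            {
                BitSpinlock l(locks, 3);
                l.lock();
                ++counts[3];
                l.unlock();
            }
        });
        std::thread t2([&] {
            for (int i = 0; i < 100000; ++i)
            {
                BitSpinlock l(locks, 7);
                l.lock();
                ++counts[7];
                l.unlock();
            }
        });

        t1.join();
        t2.join();
        assert(counts[3] == 100000 && counts[7] == 100000);
    }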

src/ripple/shamap/SHAMapInnerNode.h

@@ -27,8 +27,10 @@
 #include <ripple/shamap/SHAMapTreeNode.h>
 #include <ripple/shamap/impl/TaggedPointer.h>
+#include <atomic>
 #include <bitset>
 #include <cstdint>
+#include <limits>
 #include <memory>
 #include <mutex>
 #include <optional>
@@ -53,7 +55,8 @@ private:
     std::uint32_t fullBelowGen_ = 0;
     std::uint16_t isBranch_ = 0;

-    static std::mutex childLock;
+    /** A bitlock for the children of this node, with one bit per child */
+    mutable std::atomic<std::uint16_t> lock_ = 0;

    /** Convert arrays stored in `hashesAndChildren_` so they can store the
        requested number of children.
@@ -155,7 +158,7 @@ public:
    std::shared_ptr<SHAMapTreeNode>
    getChild(int branch);

-    virtual std::shared_ptr<SHAMapTreeNode>
+    std::shared_ptr<SHAMapTreeNode>
    canonicalizeChild(int branch, std::shared_ptr<SHAMapTreeNode> node);

    // sync functions
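Why the new per-node lock costs no extra memory: the two-byte atomic can
slot into alignment padding that the existing members already leave behind.
A simplified sketch with hypothetical stand-in fields (the real
SHAMapInnerNode has more members; this only illustrates the padding trick):

    #include <atomic>
    #include <cstdint>

    struct Before
    {
        void* hashesAndChildren;     // pointer: 8 bytes on a 64-bit target
        std::uint32_t fullBelowGen;  // 4 bytes
        std::uint16_t isBranch;      // 2 bytes; 2 bytes of tail padding
    };

    struct After
    {
        void* hashesAndChildren;
        std::uint32_t fullBelowGen;
        std::uint16_t isBranch;
        std::atomic<std::uint16_t> lock{0};  // occupies the former padding
    };

    // On common ABIs (where atomic<uint16_t> is 2 bytes, 2-aligned) the
    // lock is free: both structs are 16 bytes on a typical 64-bit target.
    static_assert(sizeof(Before) == sizeof(After), "lock fits in padding");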

src/ripple/shamap/impl/SHAMapInnerNode.cpp

@@ -19,11 +19,9 @@
 #include <ripple/shamap/SHAMapInnerNode.h>

-#include <ripple/basics/ByteUtilities.h>
 #include <ripple/basics/Log.h>
 #include <ripple/basics/Slice.h>
 #include <ripple/basics/contract.h>
-#include <ripple/basics/safe_cast.h>
 #include <ripple/beast/core/LexicalCast.h>
 #include <ripple/protocol/HashPrefix.h>
 #include <ripple/protocol/digest.h>
@@ -33,14 +31,88 @@
 #include <openssl/sha.h>

 #include <algorithm>
-#include <array>
 #include <iterator>
-#include <mutex>
 #include <utility>

+// This is used for the _mm_pause instruction:
+#include <immintrin.h>
+// This is used for std::this_thread::yield():
+#include <thread>
+
 namespace ripple {

-std::mutex SHAMapInnerNode::childLock;
+/** A specialized 16-way spinlock used to protect inner node branches.
+
+    This class packs 16 separate spinlocks into a single 16-bit value,
+    making it possible to lock any one of them individually or,
+    alternatively, to lock all of them together.
+
+    The implementation tries to use portable constructs, but has to be
+    low-level for performance.
+*/
+class SpinBitlock
+{
+private:
+    std::atomic<std::uint16_t>& bits_;
+    std::uint16_t mask_;
+
+public:
+    SpinBitlock(std::atomic<std::uint16_t>& lock) : bits_(lock), mask_(0xFFFF)
+    {
+    }
+
+    SpinBitlock(std::atomic<std::uint16_t>& lock, int index)
+        : bits_(lock), mask_(1 << index)
+    {
+        assert(index >= 0 && index < 16);
+    }
+
+    [[nodiscard]] bool
+    try_lock()
+    {
+        // If we want to grab all the individual bitlocks at once we cannot
+        // use `fetch_or`! To see why, imagine that `lock_ == 0x0020`: the
+        // `fetch_or` would return `0x0020`, correctly reporting that the
+        // acquisition failed, but it would also have (incorrectly!) set
+        // every other bit, leaving the lock in a corrupt state.
+        std::uint16_t expected = 0;
+
+        if (mask_ != 0xFFFF)
+            return (bits_.fetch_or(mask_, std::memory_order_acquire) &
+                    mask_) == expected;
+
+        return bits_.compare_exchange_weak(
+            expected,
+            mask_,
+            std::memory_order_acquire,
+            std::memory_order_relaxed);
+    }
+
+    void
+    lock()
+    {
+        // Testing suggests that 99.9999% of the time this will succeed on
+        // the first try, so we optimize the fast path:
+        if (try_lock())
+            return;
+
+        // We must not return until the lock is actually held:
+        while (true)
+        {
+            // We try to spin for a few times:
+            for (int i = 0; i != 100; ++i)
+            {
+                if (try_lock())
+                    return;
+
+                _mm_pause();
+            }
+
+            std::this_thread::yield();
+        }
+    }
+
+    void
+    unlock()
+    {
+        bits_.fetch_and(~mask_, std::memory_order_release);
+    }
+};
+
 SHAMapInnerNode::SHAMapInnerNode(
     std::uint32_t cowid,
@@ -108,7 +180,10 @@ SHAMapInnerNode::clone(std::uint32_t cowid) const
             cloneHashes[branchNum] = thisHashes[indexNum];
         });
     }
-    std::lock_guard lock(childLock);
+
+    SpinBitlock sl(lock_);
+    std::lock_guard lock(sl);
+
     if (thisIsSparse)
     {
         int cloneChildIndex = 0;
@@ -341,8 +416,11 @@ SHAMapInnerNode::getChildPointer(int branch)
     assert(branch >= 0 && branch < branchFactor);
     assert(!isEmptyBranch(branch));

-    std::lock_guard lock(childLock);
-    return hashesAndChildren_.getChildren()[*getChildIndex(branch)].get();
+    auto const index = *getChildIndex(branch);
+
+    SpinBitlock sl(lock_, index);
+    std::lock_guard lock(sl);
+    return hashesAndChildren_.getChildren()[index].get();
 }

 std::shared_ptr<SHAMapTreeNode>
@@ -351,8 +429,11 @@ SHAMapInnerNode::getChild(int branch)
     assert(branch >= 0 && branch < branchFactor);
     assert(!isEmptyBranch(branch));

-    std::lock_guard lock(childLock);
-    return hashesAndChildren_.getChildren()[*getChildIndex(branch)];
+    auto const index = *getChildIndex(branch);
+
+    SpinBitlock sl(lock_, index);
+    std::lock_guard lock(sl);
+    return hashesAndChildren_.getChildren()[index];
 }

 SHAMapHash const&
@@ -377,7 +458,9 @@ SHAMapInnerNode::canonicalizeChild(
     auto [_, hashes, children] = hashesAndChildren_.getHashesAndChildren();
     assert(node->getHash() == hashes[childIndex]);

-    std::lock_guard lock(childLock);
+    SpinBitlock sl(lock_, childIndex);
+    std::lock_guard lock(sl);
+
     if (children[childIndex])
     {
         // There is already a node hooked up, return it
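A note on why the call sites above change so little: SpinBitlock exposes
lock(), try_lock(), and unlock(), so it satisfies the standard Lockable
requirements and composes directly with std::lock_guard, just as the old
std::mutex did. A usage sketch, assuming the SpinBitlock class from the
diff above (touchOneBranch and touchAllBranches are hypothetical helpers):

    #include <atomic>
    #include <cstdint>
    #include <mutex>

    std::atomic<std::uint16_t> lock_{0};

    void
    touchOneBranch(int branch)
    {
        SpinBitlock sl(lock_, branch);  // guard only this child's bit
        std::lock_guard lock(sl);
        // ... safe to read or write the child at `branch` ...
    }

    void
    touchAllBranches()
    {
        SpinBitlock sl(lock_);  // mask 0xFFFF: take all 16 bits at once
        std::lock_guard lock(sl);
        // ... safe to touch every child, as clone() must ...
    }

The all-bits constructor is also why try_lock needs the compare_exchange
path: acquiring all sixteen locks must be all-or-nothing, and a fetch_or
of 0xFFFF would set the free bits even when some were already held.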

src/ripple/shamap/impl/SHAMapSync.cpp

@@ -583,7 +583,6 @@ SHAMap::addKnownNode(
     }

     auto const generation = f_.getFullBelowCache(ledgerSeq_)->getGeneration();
-    auto newNode = SHAMapTreeNode::makeFromWire(rawNode);

     SHAMapNodeID iNodeID;
     auto iNode = root_.get();
@@ -612,6 +611,8 @@ SHAMap::addKnownNode(
         if (iNode == nullptr)
         {
+            auto newNode = SHAMapTreeNode::makeFromWire(rawNode);
+
             if (!newNode || childHash != newNode->getHash())
             {
                 JLOG(journal_.warn()) << "Corrupt node received";
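The addKnownNode change is a small work deferral: SHAMapTreeNode::makeFromWire
is now called only on the path that actually needs a freshly deserialized
node, instead of unconditionally at the top of the function. A generic sketch
of the pattern, with hypothetical names rather than the rippled API:

    #include <memory>
    #include <vector>

    // Hypothetical stand-ins, not the rippled API:
    struct Node
    {
        explicit Node(std::vector<unsigned char> const& /*wire*/)
        {
            // imagine expensive deserialization here
        }
    };

    void
    addKnown(std::vector<unsigned char> const& wire, Node* existing)
    {
        if (existing != nullptr)
            return;  // common case: node already present, no parse needed

        // Pay the deserialization cost only on the path that needs it.
        auto node = std::make_unique<Node>(wire);
        // ... validate and link `node` ...
    }

    int main()
    {
        std::vector<unsigned char> wire{0x01, 0x02};
        addKnown(wire, nullptr);
    }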

src/ripple/shamap/impl/TaggedPointer.cpp

@@ -17,9 +17,9 @@
 */
 //==============================================================================

-#include <ripple/shamap/impl/TaggedPointer.h>
+#include <ripple/basics/ByteUtilities.h>
 #include <ripple/shamap/SHAMapInnerNode.h>
+#include <ripple/shamap/impl/TaggedPointer.h>

 #include <array>