General purpose function to retry on database timeout

CJ Cobb
2022-01-25 20:10:02 +00:00
parent da96608feb
commit bc131f666a
7 changed files with 117 additions and 136 deletions

View File

@@ -27,17 +27,7 @@ std::optional<LedgerRange>
 BackendInterface::hardFetchLedgerRangeNoThrow() const
 {
     BOOST_LOG_TRIVIAL(debug) << __func__;
-    while (true)
-    {
-        try
-        {
-            return hardFetchLedgerRange();
-        }
-        catch (DatabaseTimeout& t)
-        {
-            ;
-        }
-    }
+    return retryOnTimeout([&]() { return hardFetchLedgerRange(); });
 }
 // *** state data methods
 std::optional<Blob>

View File

@@ -5,6 +5,8 @@
 #include <backend/DBHelpers.h>
 #include <backend/SimpleCache.h>
 #include <backend/Types.h>
+#include <thread>
+#include <type_traits>
 namespace Backend {
 class DatabaseTimeout : public std::exception
@@ -16,6 +18,25 @@ class DatabaseTimeout : public std::exception
     }
 };
 
+template <class F>
+auto
+retryOnTimeout(F func, size_t waitMs = 500)
+{
+    while (true)
+    {
+        try
+        {
+            return func();
+        }
+        catch (DatabaseTimeout& t)
+        {
+            std::this_thread::sleep_for(std::chrono::milliseconds(waitMs));
+            BOOST_LOG_TRIVIAL(error)
+                << __func__ << " function timed out. Retrying ... ";
+        }
+    }
+}
+
 class BackendInterface
 {
 protected:
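
The helper deduces its return type from the callable, so any database fetch can be wrapped in a lambda and the result is forwarded unchanged once a call finally succeeds. Below is a minimal, self-contained sketch of the pattern for illustration only; DatabaseTimeout, flakyFetch, and the timings here are stand-ins, not the clio sources.

// Minimal standalone sketch of the retry-on-timeout pattern above.
#include <chrono>
#include <cstddef>
#include <exception>
#include <iostream>
#include <thread>

struct DatabaseTimeout : std::exception
{
    char const*
    what() const noexcept override
    {
        return "database timed out";
    }
};

template <class F>
auto
retryOnTimeout(F func, std::size_t waitMs = 500)
{
    while (true)
    {
        try
        {
            return func();  // success: forward whatever func returns
        }
        catch (DatabaseTimeout const&)
        {
            // back off briefly, then retry the same call
            std::this_thread::sleep_for(std::chrono::milliseconds(waitMs));
            std::cerr << "timed out, retrying...\n";
        }
    }
}

int
main()
{
    int attempts = 0;
    // Hypothetical flaky call: times out twice, then succeeds.
    auto flakyFetch = [&]() -> int {
        if (++attempts < 3)
            throw DatabaseTimeout{};
        return 42;
    };
    std::cout << retryOnTimeout(flakyFetch, 10) << '\n';  // prints 42
    return 0;
}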

View File

@@ -690,48 +690,39 @@ CassandraBackend::doOnlineDelete(uint32_t numLedgersToKeep) const
     std::optional<ripple::uint256> cursor;
     while (true)
     {
-        try
-        {
-            auto [objects, curCursor, warning] =
-                fetchLedgerPage(cursor, minLedger, 256);
-            if (warning)
-            {
-                BOOST_LOG_TRIVIAL(warning)
-                    << __func__
-                    << " online delete running but flag ledger is not complete";
-                std::this_thread::sleep_for(std::chrono::seconds(10));
-                continue;
-            }
-            for (auto& obj : objects)
-            {
-                ++numOutstanding;
-                cbs.push_back(makeAndExecuteBulkAsyncWrite(
-                    this,
-                    std::make_tuple(
-                        std::move(obj.key), minLedger, std::move(obj.blob)),
-                    bind,
-                    numOutstanding,
-                    mtx,
-                    cv));
-                std::unique_lock<std::mutex> lck(mtx);
-                BOOST_LOG_TRIVIAL(trace) << __func__ << "Got the mutex";
-                cv.wait(lck, [&numOutstanding, concurrentLimit]() {
-                    return numOutstanding < concurrentLimit;
-                });
-            }
-            BOOST_LOG_TRIVIAL(debug) << __func__ << " fetched a page";
-            cursor = curCursor;
-            if (!cursor)
-                break;
-        }
-        catch (DatabaseTimeout const& e)
-        {
-            BOOST_LOG_TRIVIAL(warning)
-                << __func__ << " Database timeout fetching keys";
-            std::this_thread::sleep_for(std::chrono::seconds(2));
-        }
+        auto [objects, curCursor, warning] = retryOnTimeout(
+            [&]() { return fetchLedgerPage(cursor, minLedger, 256); });
+        if (warning)
+        {
+            BOOST_LOG_TRIVIAL(warning)
+                << __func__
+                << " online delete running but flag ledger is not complete";
+            std::this_thread::sleep_for(std::chrono::seconds(10));
+            continue;
+        }
+        for (auto& obj : objects)
+        {
+            ++numOutstanding;
+            cbs.push_back(makeAndExecuteBulkAsyncWrite(
+                this,
+                std::make_tuple(
+                    std::move(obj.key), minLedger, std::move(obj.blob)),
+                bind,
+                numOutstanding,
+                mtx,
+                cv));
+            std::unique_lock<std::mutex> lck(mtx);
+            BOOST_LOG_TRIVIAL(trace) << __func__ << "Got the mutex";
+            cv.wait(lck, [&numOutstanding, concurrentLimit]() {
+                return numOutstanding < concurrentLimit;
+            });
+        }
+        BOOST_LOG_TRIVIAL(debug) << __func__ << " fetched a page";
+        cursor = curCursor;
+        if (!cursor)
+            break;
     }
     std::unique_lock<std::mutex> lck(mtx);
     cv.wait(lck, [&numOutstanding]() { return numOutstanding == 0; });
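
In both doOnlineDelete overloads only the page fetch moves into the retried lambda; the warning back-off, the cursor update, and the break stay in the surrounding loop, because control flow inside the callable would only end that callable, not the paging loop. A schematic of that split, with placeholder names rather than the real backend types:

// Schematic only: Page, fetchPage, and this stripped-down retryOnTimeout are
// placeholders illustrating how the retried call is kept separate from loop control.
#include <optional>
#include <vector>

struct Page
{
    std::vector<int> objects;
    std::optional<int> cursor;  // empty cursor means the last page
};

template <class F>
auto
retryOnTimeout(F func)
{
    return func();  // retry-on-timeout loop elided; see the header diff above
}

Page
fetchPage(std::optional<int> cursor)
{
    return {};  // stand-in for the real database read
}

void
deleteOldObjects()
{
    std::optional<int> cursor;
    while (true)
    {
        // Only the database call is wrapped and retried on timeout...
        auto page = retryOnTimeout([&]() { return fetchPage(cursor); });
        // ...while break/continue still steer the outer paging loop.
        cursor = page.cursor;
        if (!cursor)
            break;
    }
}

int
main()
{
    deleteOldObjects();
    return 0;
}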

View File

@@ -720,38 +720,29 @@ PostgresBackend::doOnlineDelete(uint32_t numLedgersToKeep) const
     std::optional<ripple::uint256> cursor;
     while (true)
     {
-        try
-        {
-            auto [objects, curCursor, warning] =
-                fetchLedgerPage(cursor, minLedger, 256);
-            if (warning)
-            {
-                BOOST_LOG_TRIVIAL(warning) << __func__
-                                           << " online delete running but "
-                                              "flag ledger is not complete";
-                std::this_thread::sleep_for(std::chrono::seconds(10));
-                continue;
-            }
-            BOOST_LOG_TRIVIAL(debug) << __func__ << " fetched a page";
-            std::stringstream objectsBuffer;
-            for (auto& obj : objects)
-            {
-                objectsBuffer << "\\\\x" << ripple::strHex(obj.key) << '\t'
-                              << std::to_string(minLedger) << '\t' << "\\\\x"
-                              << ripple::strHex(obj.blob) << '\n';
-            }
-            pgQuery.bulkInsert("objects", objectsBuffer.str());
-            cursor = curCursor;
-            if (!cursor)
-                break;
-        }
-        catch (DatabaseTimeout const& e)
-        {
-            BOOST_LOG_TRIVIAL(warning)
-                << __func__ << " Database timeout fetching keys";
-            std::this_thread::sleep_for(std::chrono::seconds(2));
-        }
+        auto [objects, curCursor, warning] = retryOnTimeout(
+            [&]() { return fetchLedgerPage(cursor, minLedger, 256); });
+        if (warning)
+        {
+            BOOST_LOG_TRIVIAL(warning) << __func__
+                                       << " online delete running but "
+                                          "flag ledger is not complete";
+            std::this_thread::sleep_for(std::chrono::seconds(10));
+            continue;
+        }
+        BOOST_LOG_TRIVIAL(debug) << __func__ << " fetched a page";
+        std::stringstream objectsBuffer;
+        for (auto& obj : objects)
+        {
+            objectsBuffer << "\\\\x" << ripple::strHex(obj.key) << '\t'
+                          << std::to_string(minLedger) << '\t' << "\\\\x"
+                          << ripple::strHex(obj.blob) << '\n';
+        }
+        pgQuery.bulkInsert("objects", objectsBuffer.str());
+        cursor = curCursor;
+        if (!cursor)
+            break;
     }
     BOOST_LOG_TRIVIAL(info) << __func__ << " finished inserting into objects";
     {

View File

@@ -131,37 +131,23 @@ ReportingETL::publishLedger(ripple::LedgerInfo const& lgrInfo)
 {
     BOOST_LOG_TRIVIAL(debug)
         << __func__ << " - Publishing ledger " << std::to_string(lgrInfo.seq);
     if (!writing_)
     {
         BOOST_LOG_TRIVIAL(debug) << __func__ << " - Updating cache";
-        auto diff = backend_->fetchLedgerDiff(lgrInfo.seq);
+        auto diff = Backend::retryOnTimeout(
+            [&]() { return backend_->fetchLedgerDiff(lgrInfo.seq); });
         backend_->cache().update(diff, lgrInfo.seq);
     }
     backend_->updateRange(lgrInfo.seq);
+    auto fees = Backend::retryOnTimeout(
+        [&]() { return backend_->fetchFees(lgrInfo.seq); });
+    auto transactions = Backend::retryOnTimeout(
+        [&]() { return backend_->fetchAllTransactionsInLedger(lgrInfo.seq); });
     auto ledgerRange = backend_->fetchLedgerRange();
-    std::optional<ripple::Fees> fees;
-    std::vector<Backend::TransactionAndMetadata> transactions;
-    while (true)
-    {
-        try
-        {
-            fees = backend_->fetchFees(lgrInfo.seq);
-            transactions = backend_->fetchAllTransactionsInLedger(lgrInfo.seq);
-            break;
-        }
-        catch (Backend::DatabaseTimeout const&)
-        {
-            BOOST_LOG_TRIVIAL(warning) << "Read timeout fetching transactions";
-        }
-    }
-    if (!fees || !ledgerRange)
-    {
-        BOOST_LOG_TRIVIAL(error)
-            << __func__ << " - could not fetch from database";
-        return;
-    }
+    assert(ledgerRange);
+    assert(fees);
     std::string range = std::to_string(ledgerRange->minSequence) + "-" +
         std::to_string(ledgerRange->maxSequence);
@@ -172,7 +158,7 @@ ReportingETL::publishLedger(ripple::LedgerInfo const& lgrInfo)
         subscriptions_->pubTransaction(txAndMeta, lgrInfo);
     setLastPublish();
-    BOOST_LOG_TRIVIAL(debug)
+    BOOST_LOG_TRIVIAL(info)
         << __func__ << " - Published ledger " << std::to_string(lgrInfo.seq);
 }
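
One behavioral note on the hunk above: retryOnTimeout only absorbs DatabaseTimeout, and it forwards the callable's return type unchanged, so the wrapped fetchFees still yields an optional that can be empty for reasons other than a timeout; the old explicit !fees/!ledgerRange check becomes an assert on that assumption. A rough illustration of the deduction, with simplified stand-in types rather than clio's:

// Simplified stand-ins showing that wrapping a call in retryOnTimeout
// does not change its return type; emptiness still has to be handled.
#include <cassert>
#include <cstddef>
#include <optional>

struct Fees
{
    int base = 10;
};

template <class F>
auto
retryOnTimeout(F func, std::size_t waitMs = 500)
{
    return func();  // retry loop elided; see the header diff above
}

std::optional<Fees>
fetchFees(unsigned seq)
{
    return Fees{};  // stand-in for the real database read
}

int
main()
{
    // fees deduces to std::optional<Fees>, exactly as if fetchFees were
    // called directly; the new code asserts it instead of returning early.
    auto fees = retryOnTimeout([&]() { return fetchFees(42); });
    assert(fees);
    return fees->base == 10 ? 0 : 1;
}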
@@ -187,43 +173,37 @@ ReportingETL::publishLedger(
     size_t numAttempts = 0;
     while (!stopping_)
     {
-        try
-        {
-            auto range = backend_->hardFetchLedgerRangeNoThrow();
-            if (!range || range->maxSequence < ledgerSequence)
-            {
-                BOOST_LOG_TRIVIAL(debug) << __func__ << " : "
-                                         << "Trying to publish. Could not find "
-                                            "ledger with sequence = "
-                                         << ledgerSequence;
-                // We try maxAttempts times to publish the ledger, waiting one
-                // second in between each attempt.
-                if (maxAttempts && numAttempts >= maxAttempts)
-                {
-                    BOOST_LOG_TRIVIAL(debug)
-                        << __func__ << " : "
-                        << "Failed to publish ledger after " << numAttempts
-                        << " attempts.";
-                    return false;
-                }
-                std::this_thread::sleep_for(std::chrono::seconds(1));
-                ++numAttempts;
-                continue;
-            }
-            else
-            {
-                auto lgr = backend_->fetchLedgerBySequence(ledgerSequence);
-                assert(lgr);
-                publishLedger(*lgr);
-                return true;
-            }
-        }
-        catch (Backend::DatabaseTimeout const& e)
-        {
-            std::this_thread::sleep_for(std::chrono::seconds(1));
-            ++numAttempts;
-            continue;
-        }
+        auto range = backend_->hardFetchLedgerRangeNoThrow();
+        if (!range || range->maxSequence < ledgerSequence)
+        {
+            BOOST_LOG_TRIVIAL(debug) << __func__ << " : "
+                                     << "Trying to publish. Could not find "
+                                        "ledger with sequence = "
+                                     << ledgerSequence;
+            // We try maxAttempts times to publish the ledger, waiting one
+            // second in between each attempt.
+            if (maxAttempts && numAttempts >= maxAttempts)
+            {
+                BOOST_LOG_TRIVIAL(debug)
+                    << __func__ << " : "
+                    << "Failed to publish ledger after "
+                    << numAttempts << " attempts.";
+                return false;
+            }
+            std::this_thread::sleep_for(std::chrono::seconds(1));
+            ++numAttempts;
+            continue;
+        }
+        else
+        {
+            auto lgr = Backend::retryOnTimeout([&]() {
+                return backend_->fetchLedgerBySequence(ledgerSequence);
+            });
+            assert(lgr);
+            publishLedger(*lgr);
+            return true;
+        }
     }
     return false;
 }
@@ -678,9 +658,12 @@ ReportingETL::runETLPipeline(uint32_t startSequence, int numExtractors)
             // success is false if the ledger was already written
             if (success)
             {
+                /*
                 boost::asio::post(publishStrand_, [this, lgrInfo = lgrInfo]() {
                     publishLedger(lgrInfo);
                 });
+                */
+                backend_->updateRange(lgrInfo.seq);
                 lastPublishedSequence = lgrInfo.seq;
             }
             writeConflict = !success;

View File

@@ -232,8 +232,11 @@ SubscriptionManager::pubTransaction(
         auto amount = tx->getFieldAmount(ripple::sfTakerGets);
         if (account != amount.issue().account)
         {
-            auto ownerFunds =
-                RPC::accountFunds(*backend_, lgrInfo.seq, amount, account);
+            auto ownerFunds = Backend::retryOnTimeout([&]() {
+                return RPC::accountFunds(
+                    *backend_, lgrInfo.seq, amount, account);
+            });
             pubObj["transaction"].as_object()["owner_funds"] =
                 ownerFunds.getText();
         }

View File

@@ -282,6 +282,8 @@ public:
         catch (Backend::DatabaseTimeout const& t)
        {
             BOOST_LOG_TRIVIAL(error) << __func__ << " Database timeout";
+            // TODO this should be a diff error code. Rippled probably
+            // does not have an analagous error code
             return sendError(RPC::Error::rpcNOT_READY);
         }
     }