Implement Shards

This commit is contained in:
Miguel Portilla
2017-04-25 12:02:51 -04:00
committed by Scott Schurr
parent aeda2430cd
commit 718d217158
74 changed files with 3992 additions and 1411 deletions

View File

@@ -2275,6 +2275,8 @@
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\DatabaseRotating.h">
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\DatabaseShard.h">
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\DummyScheduler.h">
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\Factory.h">
@@ -2289,7 +2291,19 @@
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\impl\codec.h">
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\impl\DatabaseImp.h">
<ClCompile Include="..\..\src\ripple\nodestore\impl\Database.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='debug|x64'">True</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='release|x64'">True</ExcludedFromBuild>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='debug.classic|x64'">..\..\src\rocksdb2\include;..\..\src\snappy\config;..\..\src\snappy\snappy;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='release.classic|x64'">..\..\src\rocksdb2\include;..\..\src\snappy\config;..\..\src\snappy\snappy;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<ClCompile Include="..\..\src\ripple\nodestore\impl\DatabaseNodeImp.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='debug|x64'">True</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='release|x64'">True</ExcludedFromBuild>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='debug.classic|x64'">..\..\src\rocksdb2\include;..\..\src\snappy\config;..\..\src\snappy\snappy;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='release.classic|x64'">..\..\src\rocksdb2\include;..\..\src\snappy\config;..\..\src\snappy\snappy;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<ClInclude Include="..\..\src\ripple\nodestore\impl\DatabaseNodeImp.h">
</ClInclude>
<ClCompile Include="..\..\src\ripple\nodestore\impl\DatabaseRotatingImp.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='debug|x64'">True</ExcludedFromBuild>
@@ -2299,6 +2313,14 @@
</ClCompile>
<ClInclude Include="..\..\src\ripple\nodestore\impl\DatabaseRotatingImp.h">
</ClInclude>
<ClCompile Include="..\..\src\ripple\nodestore\impl\DatabaseShardImp.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='debug|x64'">True</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='release|x64'">True</ExcludedFromBuild>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='debug.classic|x64'">..\..\src\rocksdb2\include;..\..\src\snappy\config;..\..\src\snappy\snappy;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='release.classic|x64'">..\..\src\rocksdb2\include;..\..\src\snappy\config;..\..\src\snappy\snappy;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<ClInclude Include="..\..\src\ripple\nodestore\impl\DatabaseShardImp.h">
</ClInclude>
<ClCompile Include="..\..\src\ripple\nodestore\impl\DecodedBlob.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='debug|x64'">True</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='release|x64'">True</ExcludedFromBuild>
@@ -2335,6 +2357,14 @@
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='debug.classic|x64'">..\..\src\rocksdb2\include;..\..\src\snappy\config;..\..\src\snappy\snappy;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='release.classic|x64'">..\..\src\rocksdb2\include;..\..\src\snappy\config;..\..\src\snappy\snappy;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<ClCompile Include="..\..\src\ripple\nodestore\impl\Shard.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='debug|x64'">True</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='release|x64'">True</ExcludedFromBuild>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='debug.classic|x64'">..\..\src\rocksdb2\include;..\..\src\snappy\config;..\..\src\snappy\snappy;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='release.classic|x64'">..\..\src\rocksdb2\include;..\..\src\snappy\config;..\..\src\snappy\snappy;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
<ClInclude Include="..\..\src\ripple\nodestore\impl\Shard.h">
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\impl\Tuning.h">
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\impl\varint.h">

View File

@@ -2931,6 +2931,9 @@
<ClInclude Include="..\..\src\ripple\nodestore\DatabaseRotating.h">
<Filter>ripple\nodestore</Filter>
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\DatabaseShard.h">
<Filter>ripple\nodestore</Filter>
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\DummyScheduler.h">
<Filter>ripple\nodestore</Filter>
</ClInclude>
@@ -2946,7 +2949,13 @@
<ClInclude Include="..\..\src\ripple\nodestore\impl\codec.h">
<Filter>ripple\nodestore\impl</Filter>
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\impl\DatabaseImp.h">
<ClCompile Include="..\..\src\ripple\nodestore\impl\Database.cpp">
<Filter>ripple\nodestore\impl</Filter>
</ClCompile>
<ClCompile Include="..\..\src\ripple\nodestore\impl\DatabaseNodeImp.cpp">
<Filter>ripple\nodestore\impl</Filter>
</ClCompile>
<ClInclude Include="..\..\src\ripple\nodestore\impl\DatabaseNodeImp.h">
<Filter>ripple\nodestore\impl</Filter>
</ClInclude>
<ClCompile Include="..\..\src\ripple\nodestore\impl\DatabaseRotatingImp.cpp">
@@ -2955,6 +2964,12 @@
<ClInclude Include="..\..\src\ripple\nodestore\impl\DatabaseRotatingImp.h">
<Filter>ripple\nodestore\impl</Filter>
</ClInclude>
<ClCompile Include="..\..\src\ripple\nodestore\impl\DatabaseShardImp.cpp">
<Filter>ripple\nodestore\impl</Filter>
</ClCompile>
<ClInclude Include="..\..\src\ripple\nodestore\impl\DatabaseShardImp.h">
<Filter>ripple\nodestore\impl</Filter>
</ClInclude>
<ClCompile Include="..\..\src\ripple\nodestore\impl\DecodedBlob.cpp">
<Filter>ripple\nodestore\impl</Filter>
</ClCompile>
@@ -2979,6 +2994,12 @@
<ClCompile Include="..\..\src\ripple\nodestore\impl\NodeObject.cpp">
<Filter>ripple\nodestore\impl</Filter>
</ClCompile>
<ClCompile Include="..\..\src\ripple\nodestore\impl\Shard.cpp">
<Filter>ripple\nodestore\impl</Filter>
</ClCompile>
<ClInclude Include="..\..\src\ripple\nodestore\impl\Shard.h">
<Filter>ripple\nodestore\impl</Filter>
</ClInclude>
<ClInclude Include="..\..\src\ripple\nodestore\impl\Tuning.h">
<Filter>ripple\nodestore\impl</Filter>
</ClInclude>

View File

@@ -834,6 +834,33 @@
# [import_db] Settings for performing a one-time import (optional)
# [database_path] Path to the book-keeping databases.
#
# [shard_db] Settings for the Shard Database (optional)
#
# Format (without spaces):
# One or more lines of case-insensitive key / value pairs:
# <key> '=' <value>
# ...
#
# Example:
# type=nudb
# path=db/nudb
#
# The "type" field must be present and controls the choice of backend:
#
# type = NuDB
#
# type = RocksDB
#
# The RocksDB backend also provides these optional parameters:
#
# compression 0 for none, 1 for Snappy compression
#
# Required keys:
# path Location to store the database (all types)
#
# max_size_gb Maximum disk space the database will utilize (in gigabytes)
#
#
# There are 4 bookkeeping SQLite databases that the server creates and
# maintains. If you omit this configuration setting, it will default to
# creating a directory called "db" located in the same place as your

View File

@@ -36,6 +36,7 @@
#include <ripple/basics/make_lock.h>
#include <ripple/beast/core/LexicalCast.h>
#include <ripple/consensus/LedgerTiming.h>
#include <ripple/nodestore/DatabaseShard.h>
#include <ripple/overlay/Overlay.h>
#include <ripple/overlay/predicates.h>
#include <ripple/protocol/Feature.h>
@@ -106,7 +107,7 @@ RCLConsensus::Adaptor::acquireLedger(LedgerHash const& ledger)
app_.getJobQueue().addJob(
jtADVANCE, "getConsensusLedger", [app, hash](Job&) {
app->getInboundLedgers().acquire(
hash, 0, InboundLedger::fcCONSENSUS);
hash, 0, InboundLedger::Reason::CONSENSUS);
});
}
return boost::none;
@@ -625,9 +626,16 @@ RCLConsensus::Adaptor::notify(
}
s.set_firstseq(uMin);
s.set_lastseq(uMax);
app_.overlay().foreach (
send_always(std::make_shared<Message>(s, protocol::mtSTATUS_CHANGE)));
JLOG(j_.trace()) << "send status change to peer";
if (auto shardStore = app_.getShardStore())
{
auto shards = shardStore->getCompleteShards();
if (! shards.empty())
s.set_shardseqs(shards);
}
app_.overlay ().foreach (send_always (
std::make_shared <Message> (
s, protocol::mtSTATUS_CHANGE)));
JLOG (j_.trace()) << "send status change to peer";
}
/** Apply a set of transactions to a ledger.

View File

@@ -22,19 +22,13 @@
namespace ripple {
AccountStateSF::AccountStateSF(Family& f, AbstractFetchPackContainer& fp)
: f_(f)
, fp_(fp)
void
AccountStateSF::gotNode(bool, SHAMapHash const& nodeHash,
std::uint32_t ledgerSeq, Blob&& nodeData,
SHAMapTreeNode::TNType) const
{
}
void AccountStateSF::gotNode (bool fromFilter,
SHAMapHash const& nodeHash,
Blob&& nodeData,
SHAMapTreeNode::TNType) const
{
f_.db().store(hotACCOUNT_NODE, std::move(nodeData),
nodeHash.as_uint256());
db_.store(hotACCOUNT_NODE, std::move(nodeData),
nodeHash.as_uint256(), ledgerSeq);
}
boost::optional<Blob>

View File

@@ -21,31 +21,32 @@
#define RIPPLE_APP_LEDGER_ACCOUNTSTATESF_H_INCLUDED
#include <ripple/app/ledger/AbstractFetchPackContainer.h>
#include <ripple/nodestore/Database.h>
#include <ripple/shamap/SHAMapSyncFilter.h>
#include <ripple/shamap/Family.h>
namespace ripple {
// This class is only needed on add functions
// sync filter for account state nodes during ledger sync
class AccountStateSF
: public SHAMapSyncFilter
class AccountStateSF : public SHAMapSyncFilter
{
private:
Family& f_;
AbstractFetchPackContainer& fp_;
public:
AccountStateSF(Family&, AbstractFetchPackContainer&);
AccountStateSF(NodeStore::Database& db, AbstractFetchPackContainer& fp)
: db_(db)
, fp_(fp)
{}
// Note that the nodeData is overwritten by this call
void gotNode (bool fromFilter,
SHAMapHash const& nodeHash,
Blob&& nodeData,
SHAMapTreeNode::TNType) const override;
void
gotNode(bool fromFilter, SHAMapHash const& nodeHash,
std::uint32_t ledgerSeq, Blob&& nodeData,
SHAMapTreeNode::TNType type) const override;
boost::optional<Blob>
getNode(SHAMapHash const& nodeHash) const override;
private:
NodeStore::Database& db_;
AbstractFetchPackContainer& fp_;
};
} // ripple

View File

@@ -38,9 +38,9 @@ ConsensusTransSetSF::ConsensusTransSetSF (Application& app, NodeCache& nodeCache
{
}
void ConsensusTransSetSF::gotNode (
bool fromFilter, SHAMapHash const& nodeHash,
Blob&& nodeData, SHAMapTreeNode::TNType type) const
void
ConsensusTransSetSF::gotNode(bool fromFilter, SHAMapHash const& nodeHash,
std::uint32_t, Blob&& nodeData, SHAMapTreeNode::TNType type) const
{
if (fromFilter)
return;

View File

@@ -39,10 +39,10 @@ public:
ConsensusTransSetSF (Application& app, NodeCache& nodeCache);
// Note that the nodeData is overwritten by this call
void gotNode (bool fromFilter,
SHAMapHash const& nodeHash,
Blob&& nodeData,
SHAMapTreeNode::TNType) const override;
void
gotNode(bool fromFilter, SHAMapHash const& nodeHash,
std::uint32_t ledgerSeq, Blob&& nodeData,
SHAMapTreeNode::TNType type) const override;
boost::optional<Blob>
getNode (SHAMapHash const& nodeHash) const override;

View File

@@ -44,18 +44,16 @@ public:
std::shared_ptr<protocol::TMLedgerData>>;
// These are the reasons we might acquire a ledger
enum fcReason
enum class Reason
{
fcHISTORY, // Acquiring past ledger
fcGENERIC, // Generic other reasons
fcVALIDATION, // Validations suggest this ledger is important
fcCURRENT, // This might be the current ledger
fcCONSENSUS, // We believe the consensus round requires this ledger
HISTORY, // Acquiring past ledger
SHARD, // Acquiring for shard
GENERIC, // Generic other reasons
CONSENSUS // We believe the consensus round requires this ledger
};
public:
InboundLedger(Application& app,
uint256 const& hash, std::uint32_t seq, fcReason reason, clock_type&);
InboundLedger(Application& app, uint256 const& hash,
std::uint32_t seq, Reason reason, clock_type&);
~InboundLedger ();
@@ -70,15 +68,24 @@ public:
{
return mLedger;
}
std::uint32_t getSeq () const
{
return mSeq;
}
Reason
getReason() const
{
return mReason;
}
bool checkLocal ();
void init (ScopedLockType& collectionLock);
bool gotData (std::weak_ptr<Peer>, std::shared_ptr<protocol::TMLedgerData>);
bool
gotData(std::weak_ptr<Peer>,
std::shared_ptr<protocol::TMLedgerData> const&);
using neededHash_t =
std::pair <protocol::TMGetObjectByHash::ObjectType, uint256>;
@@ -88,6 +95,10 @@ public:
void runData ();
static
LedgerInfo
deserializeHeader(Slice data, bool hasPrefix);
private:
enum class TriggerReason
{
@@ -105,7 +116,7 @@ private:
std::vector<neededHash_t> getNeededHashes ();
void addPeers ();
bool tryLocal ();
void tryDB (Family& f);
void done ();
@@ -115,7 +126,7 @@ private:
{
// For historical nodes, do not trigger too soon
// since a fetch pack is probably coming
if (mReason != fcHISTORY)
if (mReason != Reason::HISTORY)
trigger (peer, TriggerReason::added);
}
@@ -146,24 +157,18 @@ private:
neededStateHashes (
int max, SHAMapSyncFilter* filter) const;
LedgerInfo
deserializeHeader (
Slice data,
bool hasPrefix);
private:
std::shared_ptr<Ledger> mLedger;
bool mHaveHeader;
bool mHaveState;
bool mHaveTransactions;
bool mSignaled;
bool mByHash;
std::uint32_t mSeq;
fcReason mReason;
bool mHaveHeader;
bool mHaveState;
bool mHaveTransactions;
bool mSignaled;
bool mByHash;
std::uint32_t mSeq;
Reason const mReason;
std::set <uint256> mRecentNodes;
SHAMapAddNode mStats;
SHAMapAddNode mStats;
// Data we have received from peers
std::mutex mReceivedDataLock;

View File

@@ -43,7 +43,7 @@ public:
virtual
std::shared_ptr<Ledger const>
acquire (uint256 const& hash,
std::uint32_t seq, InboundLedger::fcReason) = 0;
std::uint32_t seq, InboundLedger::Reason) = 0;
virtual std::shared_ptr<InboundLedger> find (LedgerHash const& hash) = 0;
@@ -72,7 +72,7 @@ public:
virtual std::size_t fetchRate() = 0;
/** Called when a complete ledger is obtained. */
virtual void onLedgerFetched (InboundLedger::fcReason why) = 0;
virtual void onLedgerFetched() = 0;
virtual void gotFetchPack () = 0;
virtual void sweep () = 0;

View File

@@ -262,7 +262,7 @@ Ledger::Ledger (
if (! loaded)
{
info_.hash = calculateLedgerHash(info_);
family.missing_node (info_.hash);
family.missing_node (info_.hash, info_.seq);
}
}
@@ -311,9 +311,11 @@ Ledger::Ledger (
Family& family)
: mImmutable (true)
, txMap_ (std::make_shared <SHAMap> (SHAMapType::TRANSACTION,
info.txHash, family, SHAMap::version{1}))
info.txHash, family,
SHAMap::version{getSHAMapV2(info) ? 2 : 1}))
, stateMap_ (std::make_shared <SHAMap> (SHAMapType::STATE,
info.accountHash, family, SHAMap::version{1}))
info.accountHash, family,
SHAMap::version{getSHAMapV2(info) ? 2 : 1}))
, rules_{config.features}
, info_ (info)
{
@@ -802,8 +804,8 @@ static bool saveValidatedLedger (
bool current)
{
auto j = app.journal ("Ledger");
if (! app.pendingSaves().startWork (ledger->info().seq))
auto seq = ledger->info().seq;
if (! app.pendingSaves().startWork (seq))
{
// The save was completed synchronously
JLOG (j.debug()) << "Save aborted";
@@ -813,7 +815,7 @@ static bool saveValidatedLedger (
// TODO(tom): Fix this hard-coded SQL!
JLOG (j.trace())
<< "saveValidatedLedger "
<< (current ? "" : "fromAcquire ") << ledger->info().seq;
<< (current ? "" : "fromAcquire ") << seq;
static boost::format deleteLedger (
"DELETE FROM Ledgers WHERE LedgerSeq = %u;");
static boost::format deleteTrans1 (
@@ -823,8 +825,6 @@ static bool saveValidatedLedger (
static boost::format deleteAcctTrans (
"DELETE FROM AccountTransactions WHERE TransID = '%s';");
auto seq = ledger->info().seq;
if (! ledger->info().accountHash.isNonZero ())
{
JLOG (j.fatal()) << "AH is zero: "
@@ -848,11 +848,10 @@ static bool saveValidatedLedger (
Serializer s (128);
s.add32 (HashPrefix::ledgerMaster);
addRaw(ledger->info(), s);
app.getNodeStore ().store (
hotLEDGER, std::move (s.modData ()), ledger->info().hash);
app.getNodeStore().store(hotLEDGER,
std::move(s.modData()), ledger->info().hash, seq);
}
AcceptedLedger::pointer aLedger;
try
{

View File

@@ -261,8 +261,10 @@ public:
void
setFull() const
{
txMap_->setLedgerSeq (info_.seq);
stateMap_->setLedgerSeq (info_.seq);
txMap_->setFull();
stateMap_->setFull();
txMap_->setLedgerSeq(info_.seq);
stateMap_->setLedgerSeq(info_.seq);
}
void setTotalDrops (std::uint64_t totDrops)

View File

@@ -22,6 +22,7 @@
#include <ripple/app/main/Application.h>
#include <ripple/app/ledger/AbstractFetchPackContainer.h>
#include <ripple/app/ledger/InboundLedgers.h>
#include <ripple/app/ledger/Ledger.h>
#include <ripple/app/ledger/LedgerCleaner.h>
#include <ripple/app/ledger/LedgerHistory.h>
@@ -180,7 +181,7 @@ public:
LedgerIndex ledgerIndex);
boost::optional <NetClock::time_point> getCloseTimeByHash (
LedgerHash const& ledgerHash);
LedgerHash const& ledgerHash, LedgerIndex ledgerIndex);
void addHeldTransaction (std::shared_ptr<Transaction> const& trans);
void fixMismatch (ReadView const& ledger);
@@ -255,14 +256,21 @@ private:
Job& job,
std::shared_ptr<Ledger const> ledger);
void getFetchPack(LedgerHash missingHash, LedgerIndex missingIndex);
boost::optional<LedgerHash> getLedgerHashForHistory(LedgerIndex index);
void getFetchPack(
LedgerIndex missingIndex, InboundLedger::Reason reason);
boost::optional<LedgerHash> getLedgerHashForHistory(
LedgerIndex index, InboundLedger::Reason reason);
std::size_t getNeededValidations();
void advanceThread();
void fetchForHistory(
std::uint32_t missing,
bool& progress,
InboundLedger::Reason reason);
// Try to publish ledgers, acquire missing ledgers. Always called with
// m_mutex locked. The passed ScopedLockType is a reminder to callers.
void doAdvance(ScopedLockType&);
bool shouldFetchPack(std::uint32_t seq) const;
bool shouldAcquire(
std::uint32_t const currentLedger,
std::uint32_t const ledgerHistory,
@@ -299,6 +307,9 @@ private:
// The last ledger we handled fetching history
std::shared_ptr<Ledger const> mHistLedger;
// The last ledger we handled fetching for a shard
std::shared_ptr<Ledger const> mShardLedger;
// Fully validated ledger, whether or not we have the ledger resident.
std::pair <uint256, LedgerIndex> mLastValidLedger {uint256(), 0};
@@ -342,7 +353,7 @@ private:
// How much history do we want to keep
std::uint32_t const ledger_history_;
int const ledger_fetch_size_;
std::uint32_t const ledger_fetch_size_;
TaggedCache<uint256, Blob> fetch_packs_;

View File

@@ -22,22 +22,14 @@
namespace ripple {
TransactionStateSF::TransactionStateSF(Family& f,
AbstractFetchPackContainer& fp)
: f_(f)
, fp_(fp)
{
}
void
TransactionStateSF::gotNode(bool, SHAMapHash const& nodeHash,
std::uint32_t ledgerSeq, Blob&& nodeData, SHAMapTreeNode::TNType type) const
void TransactionStateSF::gotNode (bool fromFilter,
SHAMapHash const& nodeHash,
Blob&& nodeData,
SHAMapTreeNode::TNType type) const
{
assert(type !=
SHAMapTreeNode::tnTRANSACTION_NM);
f_.db().store(hotTRANSACTION_NODE,
std::move (nodeData), nodeHash.as_uint256());
assert(type != SHAMapTreeNode::tnTRANSACTION_NM);
db_.store(hotTRANSACTION_NODE, std::move(nodeData),
nodeHash.as_uint256(), ledgerSeq);
}
boost::optional<Blob>

View File

@@ -21,33 +21,32 @@
#define RIPPLE_APP_LEDGER_TRANSACTIONSTATESF_H_INCLUDED
#include <ripple/app/ledger/AbstractFetchPackContainer.h>
#include <ripple/nodestore/Database.h>
#include <ripple/shamap/SHAMapSyncFilter.h>
#include <ripple/shamap/Family.h>
#include <cstdint>
namespace ripple {
// This class is only needed on add functions
// sync filter for transactions tree during ledger sync
class TransactionStateSF
: public SHAMapSyncFilter
class TransactionStateSF : public SHAMapSyncFilter
{
private:
Family& f_;
AbstractFetchPackContainer& fp_;
public:
explicit
TransactionStateSF(Family&, AbstractFetchPackContainer&);
TransactionStateSF(NodeStore::Database& db, AbstractFetchPackContainer& fp)
: db_(db)
, fp_(fp)
{}
// Note that the nodeData is overwritten by this call
void gotNode (bool fromFilter,
SHAMapHash const& nodeHash,
Blob&& nodeData,
SHAMapTreeNode::TNType) const override;
void
gotNode(bool fromFilter, SHAMapHash const& nodeHash,
std::uint32_t ledgerSeq, Blob&& nodeData,
SHAMapTreeNode::TNType type) const override;
boost::optional<Blob>
getNode(SHAMapHash const& nodeHash) const override;
private:
NodeStore::Database& db_;
AbstractFetchPackContainer& fp_;
};
} // ripple

View File

@@ -18,9 +18,9 @@
//==============================================================================
#include <BeastConfig.h>
#include <ripple/app/ledger/InboundLedger.h>
#include <ripple/shamap/SHAMapNodeID.h>
#include <ripple/app/ledger/AccountStateSF.h>
#include <ripple/app/ledger/InboundLedger.h>
#include <ripple/app/ledger/InboundLedgers.h>
#include <ripple/app/ledger/LedgerMaster.h>
#include <ripple/app/ledger/TransactionStateSF.h>
@@ -32,7 +32,8 @@
#include <ripple/resource/Fees.h>
#include <ripple/protocol/HashPrefix.h>
#include <ripple/protocol/JsonFields.h>
#include <ripple/nodestore/Database.h>
#include <ripple/nodestore/DatabaseShard.h>
#include <algorithm>
namespace ripple {
@@ -66,8 +67,8 @@ enum
// millisecond for each ledger timeout
auto constexpr ledgerAcquireTimeout = 2500ms;
InboundLedger::InboundLedger (
Application& app, uint256 const& hash, std::uint32_t seq, fcReason reason, clock_type& clock)
InboundLedger::InboundLedger(Application& app, uint256 const& hash,
std::uint32_t seq, Reason reason, clock_type& clock)
: PeerSet (app, hash, ledgerAcquireTimeout, clock,
app.journal("InboundLedger"))
, mHaveHeader (false)
@@ -79,37 +80,69 @@ InboundLedger::InboundLedger (
, mReason (reason)
, mReceiveDispatched (false)
{
JLOG (m_journal.trace()) <<
"Acquiring ledger " << mHash;
JLOG (m_journal.trace()) << "Acquiring ledger " << mHash;
}
void InboundLedger::init (ScopedLockType& collectionLock)
void
InboundLedger::init(ScopedLockType& collectionLock)
{
ScopedLockType sl (mLock);
collectionLock.unlock ();
if (!tryLocal ())
collectionLock.unlock();
tryDB(app_.family());
if (mFailed)
return;
if (! mComplete)
{
addPeers ();
setTimer ();
}
else if (!isFailed ())
{
JLOG (m_journal.debug()) <<
"Acquiring ledger we already have locally: " << getHash ();
mLedger->setImmutable (app_.config());
if (mReason != fcHISTORY)
app_.getLedgerMaster ().storeLedger (mLedger);
// Check if this could be a newer fully-validated ledger
if (mReason == fcVALIDATION ||
mReason == fcCURRENT ||
mReason == fcCONSENSUS)
auto shardStore = app_.getShardStore();
if (mReason == Reason::SHARD)
{
app_.getLedgerMaster ().checkAccept (mLedger);
if (! shardStore || ! app_.shardFamily())
{
JLOG(m_journal.error()) <<
"Acquiring shard with no shard store available";
mFailed = true;
return;
}
mHaveHeader = false;
mHaveTransactions = false;
mHaveState = false;
mLedger.reset();
tryDB(*app_.shardFamily());
if (mFailed)
return;
}
else if (shardStore && mSeq >= NodeStore::genesisSeq)
{
if (auto l = shardStore->fetchLedger(mHash, mSeq))
{
mHaveHeader = true;
mHaveTransactions = true;
mHaveState = true;
mComplete = true;
mLedger = std::move(l);
}
}
}
if (! mComplete)
{
addPeers();
execute();
return;
}
JLOG (m_journal.debug()) <<
"Acquiring ledger we already have in " <<
" local store. " << mHash;
mLedger->setImmutable(app_.config());
if (mReason == Reason::HISTORY || mReason == Reason::SHARD)
return;
app_.getLedgerMaster().storeLedger(mLedger);
// Check if this could be a newer fully-validated ledger
if (mReason == Reason::CONSENSUS)
app_.getLedgerMaster().checkAccept(mLedger);
}
void InboundLedger::execute ()
@@ -144,11 +177,19 @@ void InboundLedger::update (std::uint32_t seq)
bool InboundLedger::checkLocal ()
{
ScopedLockType sl (mLock);
if (!isDone () && tryLocal())
if (! isDone())
{
done();
return true;
if (mLedger)
tryDB(mLedger->stateMap().family());
else if(mReason == Reason::SHARD)
tryDB(*app_.shardFamily());
else
tryDB(app_.family());
if (mFailed || mComplete)
{
done();
return true;
}
}
return false;
}
@@ -232,75 +273,79 @@ InboundLedger::deserializeHeader (
return info;
}
/** See how much of the ledger data, if any, is
in our node store
*/
bool InboundLedger::tryLocal ()
// See how much of the ledger data is stored locally
// Data found in a fetch pack will be stored
void
InboundLedger::tryDB(Family& f)
{
// return value: true = no more work to do
if (!mHaveHeader)
if (! mHaveHeader)
{
// Nothing we can do without the ledger header
auto node = app_.getNodeStore ().fetch (mHash);
auto makeLedger = [&, this](Blob const& data)
{
JLOG(m_journal.trace()) <<
"Ledger header found in fetch pack";
mLedger = std::make_shared<Ledger>(
deserializeHeader(makeSlice(data), true),
app_.config(), f);
if (mLedger->info().hash != mHash ||
(mSeq != 0 && mSeq != mLedger->info().seq))
{
// We know for a fact the ledger can never be acquired
JLOG(m_journal.warn()) <<
"hash " << mHash <<
" seq " << std::to_string(mSeq) <<
" cannot be a ledger";
mLedger.reset();
mFailed = true;
}
};
if (!node)
// Try to fetch the ledger header from the DB
auto node = f.db().fetch(mHash, mSeq);
if (! node)
{
auto data = app_.getLedgerMaster().getFetchPack(mHash);
if (! data)
return false;
return;
JLOG (m_journal.trace()) <<
"Ledger header found in fetch pack";
mLedger = std::make_shared<Ledger> (
deserializeHeader (makeSlice(*data), true),
app_.config(),
app_.family());
app_.getNodeStore ().store (
hotLEDGER, std::move (*data), mHash);
makeLedger(*data);
if (mLedger)
f.db().store(hotLEDGER, std::move(*data),
mHash, mLedger->info().seq);
}
else
{
mLedger = std::make_shared<Ledger>(
deserializeHeader (makeSlice (node->getData()), true),
app_.config(),
app_.family());
JLOG (m_journal.trace()) <<
"Ledger header found in node store";
makeLedger(node->getData());
}
if (mLedger->info().hash != mHash)
{
// We know for a fact the ledger can never be acquired
JLOG (m_journal.warn()) <<
mHash << " cannot be a ledger";
mFailed = true;
return true;
}
if (mFailed)
return;
if (mSeq == 0)
mSeq = mLedger->info().seq;
mLedger->stateMap().setLedgerSeq(mSeq);
mLedger->txMap().setLedgerSeq(mSeq);
mHaveHeader = true;
}
if (!mHaveTransactions)
if (! mHaveTransactions)
{
if (mLedger->info().txHash.isZero ())
if (mLedger->info().txHash.isZero())
{
JLOG (m_journal.trace()) <<
"No TXNs to fetch";
JLOG (m_journal.trace()) << "No TXNs to fetch";
mHaveTransactions = true;
}
else
{
TransactionStateSF filter(mLedger->txMap().family(),
TransactionStateSF filter(mLedger->txMap().family().db(),
app_.getLedgerMaster());
if (mLedger->txMap().fetchRoot (
if (mLedger->txMap().fetchRoot(
SHAMapHash{mLedger->info().txHash}, &filter))
{
auto h = neededTxHashes (1, &filter);
if (h.empty ())
if (neededTxHashes(1, &filter).empty())
{
JLOG (m_journal.trace()) <<
JLOG(m_journal.trace()) <<
"Had full txn map locally";
mHaveTransactions = true;
}
@@ -308,26 +353,23 @@ bool InboundLedger::tryLocal ()
}
}
if (!mHaveState)
if (! mHaveState)
{
if (mLedger->info().accountHash.isZero ())
if (mLedger->info().accountHash.isZero())
{
JLOG (m_journal.fatal()) <<
"We are acquiring a ledger with a zero account hash";
mFailed = true;
return true;
return;
}
AccountStateSF filter(mLedger->stateMap().family(),
AccountStateSF filter(mLedger->stateMap().family().db(),
app_.getLedgerMaster());
if (mLedger->stateMap().fetchRoot (
if (mLedger->stateMap().fetchRoot(
SHAMapHash{mLedger->info().accountHash}, &filter))
{
auto h = neededStateHashes (1, &filter);
if (h.empty ())
if (neededStateHashes(1, &filter).empty())
{
JLOG (m_journal.trace()) <<
JLOG(m_journal.trace()) <<
"Had full AS map locally";
mHaveState = true;
}
@@ -336,13 +378,11 @@ bool InboundLedger::tryLocal ()
if (mHaveTransactions && mHaveState)
{
JLOG (m_journal.debug()) <<
JLOG(m_journal.debug()) <<
"Had everything locally";
mComplete = true;
mLedger->setImmutable (app_.config());
mLedger->setImmutable(app_.config());
}
return mComplete;
}
/** Called with a lock by the PeerSet when the timer expires
@@ -386,14 +426,14 @@ void InboundLedger::onTimer (bool wasProgress, ScopedLockType&)
"No progress(" << pc <<
") for ledger " << mHash;
// addPeers triggers if the reason is not fcHISTORY
// So if the reason IS fcHISTORY, need to trigger after we add
// addPeers triggers if the reason is not HISTORY
// So if the reason IS HISTORY, need to trigger after we add
// otherwise, we need to trigger before we add
// so each peer gets triggered once
if (mReason != fcHISTORY)
if (mReason != Reason::HISTORY)
trigger (nullptr, TriggerReason::timeout);
addPeers ();
if (mReason == fcHISTORY)
if (mReason == Reason::HISTORY)
trigger (nullptr, TriggerReason::timeout);
}
}
@@ -421,20 +461,29 @@ void InboundLedger::done ()
JLOG (m_journal.debug()) <<
"Acquire " << mHash <<
(isFailed () ? " fail " : " ") <<
(mFailed ? " fail " : " ") <<
((getTimeouts () == 0) ? std::string() :
(std::string ("timeouts:") +
to_string (getTimeouts ()) + " ")) <<
mStats.get ();
assert (isComplete () || isFailed ());
assert (mComplete || mFailed);
if (isComplete () && !isFailed () && mLedger)
if (mComplete && ! mFailed && mLedger)
{
mLedger->setImmutable (app_.config());
if (mReason != fcHISTORY)
app_.getLedgerMaster ().storeLedger (mLedger);
app_.getInboundLedgers().onLedgerFetched(mReason);
switch (mReason)
{
case Reason::SHARD:
app_.getShardStore()->setStored(mLedger);
// TODO c++17: [[fallthrough]]
case Reason::HISTORY:
app_.getInboundLedgers().onLedgerFetched();
break;
default:
app_.getLedgerMaster().storeLedger(mLedger);
break;
}
}
// We hold the PeerSet lock, so must dispatch
@@ -442,7 +491,7 @@ void InboundLedger::done ()
jtLEDGER_DATA, "AcquisitionDone",
[self = shared_from_this()](Job&)
{
if (self->isComplete() && !self->isFailed())
if (self->mComplete && !self->mFailed)
{
self->app().getLedgerMaster().checkAccept(
self->getLedger());
@@ -487,10 +536,10 @@ void InboundLedger::trigger (std::shared_ptr<Peer> const& peer, TriggerReason re
" as=" << mHaveState;
}
if (!mHaveHeader)
if (! mHaveHeader)
{
tryLocal ();
tryDB(mReason == Reason::SHARD ?
*app_.shardFamily() : app_.family());
if (mFailed)
{
JLOG (m_journal.warn()) <<
@@ -506,17 +555,17 @@ void InboundLedger::trigger (std::shared_ptr<Peer> const& peer, TriggerReason re
{ // Be more aggressive if we've timed out at least once
tmGL.set_querytype (protocol::qtINDIRECT);
if (!isProgress () && !mFailed && mByHash && (
getTimeouts () > ledgerBecomeAggressiveThreshold))
if (! isProgress () && ! mFailed && mByHash &&
(getTimeouts () > ledgerBecomeAggressiveThreshold))
{
auto need = getNeededHashes ();
if (!need.empty ())
{
protocol::TMGetObjectByHash tmBH;
bool typeSet = false;
tmBH.set_query (true);
tmBH.set_ledgerhash (mHash.begin (), mHash.size ());
bool typeSet = false;
for (auto const& p : need)
{
JLOG (m_journal.warn()) <<
@@ -532,6 +581,8 @@ void InboundLedger::trigger (std::shared_ptr<Peer> const& peer, TriggerReason re
{
protocol::TMIndexedObject* io = tmBH.add_objects ();
io->set_hash (p.second.begin (), p.second.size ());
if (mSeq != 0)
io->set_ledgerseq(mSeq);
}
}
@@ -564,6 +615,8 @@ void InboundLedger::trigger (std::shared_ptr<Peer> const& peer, TriggerReason re
if (!mHaveHeader && !mFailed)
{
tmGL.set_itype (protocol::liBASE);
if (mSeq != 0)
tmGL.set_ledgerseq (mSeq);
JLOG (m_journal.trace()) <<
"Sending header request to " <<
(peer ? "selected peer" : "all peers");
@@ -610,7 +663,7 @@ void InboundLedger::trigger (std::shared_ptr<Peer> const& peer, TriggerReason re
}
else
{
AccountStateSF filter(mLedger->stateMap().family(),
AccountStateSF filter(mLedger->stateMap().family().db(),
app_.getLedgerMaster());
// Release the lock while we process the large state map
@@ -684,7 +737,7 @@ void InboundLedger::trigger (std::shared_ptr<Peer> const& peer, TriggerReason re
}
else
{
TransactionStateSF filter(mLedger->txMap().family(),
TransactionStateSF filter(mLedger->txMap().family().db(),
app_.getLedgerMaster());
auto nodes = mLedger->txMap().getMissingNodes (
@@ -799,12 +852,12 @@ bool InboundLedger::takeHeader (std::string const& data)
if (mComplete || mFailed || mHaveHeader)
return true;
mLedger = std::make_shared<Ledger>(
deserializeHeader (makeSlice(data), false),
app_.config(),
app_.family());
if (mLedger->info().hash != mHash)
auto* f = mReason == Reason::SHARD ?
app_.shardFamily() : &app_.family();
mLedger = std::make_shared<Ledger>(deserializeHeader(
makeSlice(data), false), app_.config(), *f);
if (mLedger->info().hash != mHash ||
(mSeq != 0 && mSeq != mLedger->info().seq))
{
JLOG (m_journal.warn()) <<
"Acquire hash mismatch: " << mLedger->info().hash <<
@@ -812,14 +865,16 @@ bool InboundLedger::takeHeader (std::string const& data)
mLedger.reset ();
return false;
}
if (mSeq == 0)
mSeq = mLedger->info().seq;
mLedger->stateMap().setLedgerSeq(mSeq);
mLedger->txMap().setLedgerSeq(mSeq);
mHaveHeader = true;
Serializer s (data.size () + 4);
s.add32 (HashPrefix::ledgerMaster);
s.addRaw (data.data(), data.size());
app_.getNodeStore ().store (
hotLEDGER, std::move (s.modData ()), mHash);
f->db().store(hotLEDGER, std::move (s.modData ()), mHash, mSeq);
if (mLedger->info().txHash.isZero ())
mHaveTransactions = true;
@@ -855,7 +910,7 @@ bool InboundLedger::takeTxNode (const std::vector<SHAMapNodeID>& nodeIDs,
auto nodeIDit = nodeIDs.cbegin ();
auto nodeDatait = data.begin ();
TransactionStateSF filter(mLedger->txMap().family(),
TransactionStateSF filter(mLedger->txMap().family().db(),
app_.getLedgerMaster());
while (nodeIDit != nodeIDs.cend ())
@@ -927,7 +982,7 @@ bool InboundLedger::takeAsNode (const std::vector<SHAMapNodeID>& nodeIDs,
auto nodeIDit = nodeIDs.cbegin ();
auto nodeDatait = data.begin ();
AccountStateSF filter(mLedger->stateMap().family(),
AccountStateSF filter(mLedger->stateMap().family().db(),
app_.getLedgerMaster());
while (nodeIDit != nodeIDs.cend ())
@@ -991,7 +1046,7 @@ bool InboundLedger::takeAsRootNode (Slice const& data, SHAMapAddNode& san)
return false;
}
AccountStateSF filter(mLedger->stateMap().family(),
AccountStateSF filter(mLedger->stateMap().family().db(),
app_.getLedgerMaster());
san += mLedger->stateMap().addRootNode (
SHAMapHash{mLedger->info().accountHash}, data, snfWIRE, &filter);
@@ -1015,7 +1070,7 @@ bool InboundLedger::takeTxRootNode (Slice const& data, SHAMapAddNode& san)
return false;
}
TransactionStateSF filter(mLedger->txMap().family(),
TransactionStateSF filter(mLedger->txMap().family().db(),
app_.getLedgerMaster());
san += mLedger->txMap().addRootNode (
SHAMapHash{mLedger->info().txHash}, data, snfWIRE, &filter);
@@ -1036,7 +1091,7 @@ InboundLedger::getNeededHashes ()
if (!mHaveState)
{
AccountStateSF filter(mLedger->stateMap().family(),
AccountStateSF filter(mLedger->stateMap().family().db(),
app_.getLedgerMaster());
for (auto const& h : neededStateHashes (4, &filter))
{
@@ -1047,7 +1102,7 @@ InboundLedger::getNeededHashes ()
if (!mHaveTransactions)
{
TransactionStateSF filter(mLedger->txMap().family(),
TransactionStateSF filter(mLedger->txMap().family().db(),
app_.getLedgerMaster());
for (auto const& h : neededTxHashes (4, &filter))
{
@@ -1062,9 +1117,9 @@ InboundLedger::getNeededHashes ()
/** Stash a TMLedgerData received from a peer for later processing
Returns 'true' if we need to dispatch
*/
// VFALCO TODO Why isn't the shared_ptr passed by const& ?
bool InboundLedger::gotData (std::weak_ptr<Peer> peer,
std::shared_ptr<protocol::TMLedgerData> data)
bool
InboundLedger::gotData(std::weak_ptr<Peer> peer,
std::shared_ptr<protocol::TMLedgerData> const& data)
{
std::lock_guard<std::mutex> sl (mReceivedDataLock);

View File

@@ -25,6 +25,7 @@
#include <ripple/basics/DecayingSample.h>
#include <ripple/basics/Log.h>
#include <ripple/core/JobQueue.h>
#include <ripple/nodestore/DatabaseShard.h>
#include <ripple/protocol/JsonFields.h>
#include <ripple/beast/core/LexicalCast.h>
#include <ripple/beast/container/aged_map.h>
@@ -65,41 +66,64 @@ public:
}
std::shared_ptr<Ledger const>
acquire (
uint256 const& hash,
std::uint32_t seq,
InboundLedger::fcReason reason)
acquire(uint256 const& hash, std::uint32_t seq,
InboundLedger::Reason reason)
{
assert (hash.isNonZero ());
assert(hash.isNonZero());
assert(reason != InboundLedger::Reason::SHARD ||
(seq != 0 && app_.getShardStore()));
if (isStopping())
return {};
bool isNew = true;
std::shared_ptr<InboundLedger> inbound;
{
ScopedLockType sl (mLock);
if (! isStopping ())
ScopedLockType sl(mLock);
auto it = mLedgers.find(hash);
if (it != mLedgers.end())
{
auto it = mLedgers.find (hash);
if (it != mLedgers.end ())
{
isNew = false;
inbound = it->second;
}
else
{
inbound = std::make_shared <InboundLedger> (app_,
hash, seq, reason, std::ref (m_clock));
mLedgers.emplace (hash, inbound);
inbound->init (sl);
++mCounter;
}
isNew = false;
inbound = it->second;
}
else
{
inbound = std::make_shared <InboundLedger>(
app_, hash, seq, reason, std::ref(m_clock));
mLedgers.emplace(hash, inbound);
inbound->init(sl);
++mCounter;
}
}
if (inbound && ! isNew && ! inbound->isFailed ())
inbound->update (seq);
if (inbound && inbound->isComplete ())
return inbound->getLedger();
return {};
if (inbound->isFailed())
return {};
if (! isNew)
inbound->update(seq);
if (! inbound->isComplete())
return {};
if (reason == InboundLedger::Reason::HISTORY)
{
if (inbound->getLedger()->stateMap().family().isShardBacked())
app_.getNodeStore().copyLedger(inbound->getLedger());
}
else if (reason == InboundLedger::Reason::SHARD)
{
auto shardStore = app_.getShardStore();
if (!shardStore)
{
JLOG(j_.error()) <<
"Acquiring shard with no shard store available";
return {};
}
if (inbound->getLedger()->stateMap().family().isShardBacked())
shardStore->setStored(inbound->getLedger());
else
shardStore->copyLedger(inbound->getLedger());
}
return inbound->getLedger();
}
std::shared_ptr<InboundLedger> find (uint256 const& hash)
@@ -280,13 +304,11 @@ public:
m_clock.now());
}
void onLedgerFetched (
InboundLedger::fcReason why)
// Should only be called with an inboundledger that has
// a reason of history or shard
void onLedgerFetched()
{
if (why != InboundLedger::fcHISTORY)
return;
std::lock_guard<
std::mutex> lock(fetchRateMutex_);
std::lock_guard<std::mutex> lock(fetchRateMutex_);
fetchRate_.add(1, m_clock.now());
}

View File

@@ -284,7 +284,7 @@ private:
"Node missing from ledger " << ledger->info().seq;
app_.getInboundLedgers().acquire (
ledger->info().hash, ledger->info().seq,
InboundLedger::fcGENERIC);
InboundLedger::Reason::GENERIC);
}
return hash ? *hash : zero; // kludge
}
@@ -303,13 +303,13 @@ private:
bool doTxns)
{
auto nodeLedger = app_.getInboundLedgers().acquire (
ledgerHash, ledgerIndex, InboundLedger::fcGENERIC);
ledgerHash, ledgerIndex, InboundLedger::Reason::GENERIC);
if (!nodeLedger)
{
JLOG (j_.debug()) << "Ledger " << ledgerIndex << " not available";
app_.getLedgerMaster().clearLedger (ledgerIndex);
app_.getInboundLedgers().acquire(
ledgerHash, ledgerIndex, InboundLedger::fcGENERIC);
ledgerHash, ledgerIndex, InboundLedger::Reason::GENERIC);
return false;
}
@@ -336,7 +336,7 @@ private:
JLOG (j_.debug()) << "Ledger " << ledgerIndex << " is missing nodes";
app_.getLedgerMaster().clearLedger (ledgerIndex);
app_.getInboundLedgers().acquire(
ledgerHash, ledgerIndex, InboundLedger::fcGENERIC);
ledgerHash, ledgerIndex, InboundLedger::Reason::GENERIC);
return false;
}
@@ -390,7 +390,7 @@ private:
// ledger.
referenceLedger =
app_.getInboundLedgers().acquire(
refHash, refIndex, InboundLedger::fcGENERIC);
refHash, refIndex, InboundLedger::Reason::GENERIC);
if (referenceLedger)
ledgerHash = getLedgerHash(
referenceLedger, ledgerIndex);

View File

@@ -19,7 +19,6 @@
#include <BeastConfig.h>
#include <ripple/app/ledger/LedgerMaster.h>
#include <ripple/app/ledger/InboundLedgers.h>
#include <ripple/app/ledger/OpenLedger.h>
#include <ripple/app/ledger/OrderBookDB.h>
#include <ripple/app/ledger/PendingSaves.h>
@@ -40,6 +39,7 @@
#include <ripple/basics/TaggedCache.h>
#include <ripple/basics/UptimeTimer.h>
#include <ripple/core/TimeKeeper.h>
#include <ripple/nodestore/DatabaseShard.h>
#include <ripple/overlay/Overlay.h>
#include <ripple/overlay/Peer.h>
#include <ripple/protocol/digest.h>
@@ -499,13 +499,16 @@ LedgerMaster::tryFill (
/** Request a fetch pack to get to the specified ledger
*/
void
LedgerMaster::getFetchPack (LedgerHash missingHash, LedgerIndex missingIndex)
LedgerMaster::getFetchPack (LedgerIndex missingIndex,
InboundLedger::Reason reason)
{
auto haveHash = getLedgerHashForHistory (missingIndex + 1);
auto haveHash = getLedgerHashForHistory(
missingIndex + 1, reason);
if (!haveHash)
{
JLOG (m_journal.error()) << "No hash for fetch pack";
JLOG (m_journal.error()) <<
"No hash for fetch pack. Missing Index " <<
std::to_string(missingIndex);
return;
}
assert(haveHash->isNonZero());
@@ -668,7 +671,7 @@ LedgerMaster::failedSave(std::uint32_t seq, uint256 const& hash)
{
clearLedger(seq);
app_.getInboundLedgers().acquire(
hash, seq, InboundLedger::fcGENERIC);
hash, seq, InboundLedger::Reason::GENERIC);
}
// Check if the specified ledger can become the new last fully-validated
@@ -716,7 +719,7 @@ LedgerMaster::checkAccept (uint256 const& hash, std::uint32_t seq)
// FIXME: We may not want to fetch a ledger with just one
// trusted validation
ledger = app_.getInboundLedgers().acquire(
hash, 0, InboundLedger::fcGENERIC);
hash, seq, InboundLedger::Reason::GENERIC);
}
if (ledger)
@@ -922,16 +925,19 @@ LedgerMaster::advanceThread()
}
boost::optional<LedgerHash>
LedgerMaster::getLedgerHashForHistory (LedgerIndex index)
LedgerMaster::getLedgerHashForHistory(
LedgerIndex index, InboundLedger::Reason reason)
{
// Try to get the hash of a ledger we need to fetch for history
boost::optional<LedgerHash> ret;
auto const& l {reason == InboundLedger::Reason::SHARD ?
mShardLedger : mHistLedger};
if (mHistLedger && (mHistLedger->info().seq >= index))
if (l && l->info().seq >= index)
{
ret = hashOfSeq(*mHistLedger, index, m_journal);
ret = hashOfSeq(*l, index, m_journal);
if (! ret)
ret = walkHashBySeq (index, mHistLedger);
ret = walkHashBySeq (index, l);
}
if (! ret)
@@ -940,12 +946,6 @@ LedgerMaster::getLedgerHashForHistory (LedgerIndex index)
return ret;
}
bool
LedgerMaster::shouldFetchPack (std::uint32_t seq) const
{
return (fetch_seq_ != seq);
}
std::vector<std::shared_ptr<Ledger const>>
LedgerMaster::findNewLedgersToPublish ()
{
@@ -1030,7 +1030,7 @@ LedgerMaster::findNewLedgersToPublish ()
// Can we try to acquire the ledger we need?
if (! ledger && (++acqCount < ledger_fetch_size_))
ledger = app_.getInboundLedgers ().acquire(
*hash, seq, InboundLedger::fcGENERIC);
*hash, seq, InboundLedger::Reason::GENERIC);
// Did we acquire the next ledger we need to publish?
if (ledger && (ledger->info().seq == pubSeq))
@@ -1171,7 +1171,7 @@ LedgerMaster::updatePaths (Job& job)
app_.getInboundLedgers().acquire(
lastLedger->info().parentHash,
lastLedger->info().seq - 1,
InboundLedger::fcGENERIC);
InboundLedger::Reason::GENERIC);
}
else
{
@@ -1179,7 +1179,7 @@ LedgerMaster::updatePaths (Job& job)
app_.getInboundLedgers().acquire(
lastLedger->info().hash,
lastLedger->info().seq,
InboundLedger::fcGENERIC);
InboundLedger::Reason::GENERIC);
}
}
}
@@ -1278,13 +1278,15 @@ boost::optional <NetClock::time_point>
LedgerMaster::getCloseTimeBySeq (LedgerIndex ledgerIndex)
{
uint256 hash = getHashBySeq (ledgerIndex);
return hash.isNonZero() ? getCloseTimeByHash (hash) : boost::none;
return hash.isNonZero() ? getCloseTimeByHash(
hash, ledgerIndex) : boost::none;
}
boost::optional <NetClock::time_point>
LedgerMaster::getCloseTimeByHash (LedgerHash const& ledgerHash)
LedgerMaster::getCloseTimeByHash(LedgerHash const& ledgerHash,
std::uint32_t index)
{
auto node = app_.getNodeStore().fetch (ledgerHash);
auto node = app_.getNodeStore().fetch(ledgerHash, index);
if (node &&
(node->getData().size() >= 120))
{
@@ -1365,7 +1367,7 @@ LedgerMaster::walkHashBySeq (
if (!ledger)
{
auto const ledger = app_.getInboundLedgers().acquire (
*refHash, refIndex, InboundLedger::fcGENERIC);
*refHash, refIndex, InboundLedger::Reason::GENERIC);
if (ledger)
{
ledgerHash = hashOfSeq(*ledger, index, m_journal);
@@ -1514,11 +1516,129 @@ LedgerMaster::shouldAcquire (
return ret;
}
void
LedgerMaster::fetchForHistory(
std::uint32_t missing,
bool& progress,
InboundLedger::Reason reason)
{
ScopedUnlockType sl(m_mutex);
if (auto hash = getLedgerHashForHistory(missing, reason))
{
assert(hash->isNonZero());
auto ledger = getLedgerByHash(*hash);
if (! ledger)
{
if (!app_.getInboundLedgers().isFailure(*hash))
{
ledger = app_.getInboundLedgers().acquire(
*hash, missing, reason);
if (!ledger &&
missing > NodeStore::genesisSeq &&
missing != fetch_seq_)
{
JLOG(m_journal.trace())
<< "fetchForHistory want fetch pack " << missing;
fetch_seq_ = missing;
getFetchPack(missing, reason);
}
else
JLOG(m_journal.trace())
<< "fetchForHistory no fetch pack for " << missing;
}
else
JLOG(m_journal.debug())
<< "fetchForHistory found failed acquire";
}
if (ledger)
{
auto seq = ledger->info().seq;
assert(seq == missing);
JLOG(m_journal.trace()) <<
"fetchForHistory acquired " << seq;
if (reason == InboundLedger::Reason::SHARD)
{
ledger->setFull();
{
ScopedLockType lock(m_mutex);
mShardLedger = ledger;
}
if (!ledger->stateMap().family().isShardBacked())
app_.getShardStore()->copyLedger(ledger);
}
else
{
setFullLedger(ledger, false, false);
int fillInProgress;
{
ScopedLockType lock(m_mutex);
mHistLedger = ledger;
fillInProgress = mFillInProgress;
}
if (fillInProgress == 0 &&
getHashByIndex(seq - 1, app_) == ledger->info().parentHash)
{
{
// Previous ledger is in DB
ScopedLockType lock(m_mutex);
mFillInProgress = seq;
}
app_.getJobQueue().addJob(jtADVANCE, "tryFill",
[this, ledger](Job& j) { tryFill(j, ledger); });
}
}
progress = true;
}
else
{
std::uint32_t fetchSz;
if (reason == InboundLedger::Reason::SHARD)
// Do not fetch ledger sequences lower
// than the shard's first ledger sequence
fetchSz = NodeStore::DatabaseShard::firstSeq(
NodeStore::DatabaseShard::seqToShardIndex(missing));
else
// Do not fetch ledger sequences lower
// than the genesis ledger sequence
fetchSz = NodeStore::genesisSeq;
fetchSz = missing >= fetchSz ?
std::min(ledger_fetch_size_, (missing - fetchSz) + 1) : 0;
try
{
for (std::uint32_t i = 0; i < fetchSz; ++i)
{
std::uint32_t seq = missing - i;
if (auto h = getLedgerHashForHistory(seq, reason))
{
assert(h->isNonZero());
app_.getInboundLedgers().acquire(*h, seq, reason);
}
}
}
catch (std::exception const&)
{
JLOG(m_journal.warn()) << "Threw while prefetching";
}
}
}
else
{
JLOG(m_journal.fatal()) << "Can't find ledger following prevMissing "
<< missing;
JLOG(m_journal.fatal()) << "Pub:" << mPubLedgerSeq
<< " Val:" << mValidLedgerSeq;
JLOG(m_journal.fatal()) << "Ledgers: "
<< app_.getLedgerMaster().getCompleteLedgers();
JLOG(m_journal.fatal()) << "Acquire reason: "
<< (reason == InboundLedger::Reason::HISTORY ? "HISTORY" : "SHARD");
clearLedger(missing + 1);
progress = true;
}
}
// Try to publish ledgers, acquire missing ledgers
void LedgerMaster::doAdvance (ScopedLockType& sl)
{
// TODO NIKB: simplify and unindent this a bit!
do
{
mAdvanceWork = false; // If there's work to do, we'll make progress
@@ -1531,147 +1651,53 @@ void LedgerMaster::doAdvance (ScopedLockType& sl)
(app_.getJobQueue().getJobCount(jtPUBOLDLEDGER) < 10) &&
(mValidLedgerSeq == mPubLedgerSeq) &&
(getValidatedLedgerAge() < MAX_LEDGER_AGE_ACQUIRE))
{ // We are in sync, so can acquire
boost::optional<std::uint32_t> maybeMissing;
{
// We are in sync, so can acquire
InboundLedger::Reason reason = InboundLedger::Reason::HISTORY;
boost::optional<std::uint32_t> missing;
{
ScopedLockType sl (mCompleteLock);
maybeMissing =
prevMissing(mCompleteLedgers, mPubLedger->info().seq);
ScopedLockType sl(mCompleteLock);
missing = prevMissing(mCompleteLedgers,
mPubLedger->info().seq, NodeStore::genesisSeq);
}
if (maybeMissing)
if (missing)
{
std::uint32_t missing = *maybeMissing;
JLOG(m_journal.trace())
<< "tryAdvance discovered missing " << missing;
if ((missing > 0) &&
JLOG(m_journal.trace()) <<
"tryAdvance discovered missing " << *missing;
if ((mFillInProgress == 0 || *missing > mFillInProgress) &&
shouldAcquire(mValidLedgerSeq, ledger_history_,
app_.getSHAMapStore().getCanDelete(), missing) &&
((mFillInProgress == 0) || (missing > mFillInProgress)))
app_.getSHAMapStore().getCanDelete(), *missing))
{
JLOG(m_journal.trace())
<< "advanceThread should acquire";
{
ScopedUnlockType sl(m_mutex);
auto hash = getLedgerHashForHistory(missing);
if (hash)
{
assert(hash->isNonZero());
auto ledger = getLedgerByHash(*hash);
if (!ledger)
{
if (!app_.getInboundLedgers().isFailure(
*hash))
{
ledger =
app_.getInboundLedgers().acquire(
*hash, missing,
InboundLedger::fcHISTORY);
if (!ledger && (missing > 32600) &&
shouldFetchPack(missing))
{
JLOG(m_journal.trace()) <<
"tryAdvance want fetch pack " <<
missing;
fetch_seq_ = missing;
getFetchPack(*hash, missing);
}
else
JLOG(m_journal.trace()) <<
"tryAdvance no fetch pack for " <<
missing;
}
else
JLOG(m_journal.debug()) <<
"tryAdvance found failed acquire";
}
if (ledger)
{
auto seq = ledger->info().seq;
assert(seq == missing);
JLOG(m_journal.trace())
<< "tryAdvance acquired "
<< ledger->info().seq;
setFullLedger(
ledger,
false,
false);
auto const& parent = ledger->info().parentHash;
int fillInProgress;
{
ScopedLockType lock(m_mutex);
mHistLedger = ledger;
fillInProgress = mFillInProgress;
}
if (fillInProgress == 0 &&
getHashByIndex(seq - 1, app_) == parent)
{
{
// Previous ledger is in DB
ScopedLockType lock(m_mutex);
mFillInProgress = ledger->info().seq;
}
app_.getJobQueue().addJob(
jtADVANCE, "tryFill",
[this, ledger](Job& j) {
tryFill(j, ledger);
});
}
progress = true;
}
else
{
try
{
for (int i = 0; i < ledger_fetch_size_; ++i)
{
std::uint32_t seq = missing - i;
auto hash2 =
getLedgerHashForHistory(seq);
if (hash2)
{
assert(hash2->isNonZero());
app_.getInboundLedgers().acquire
(*hash2, seq,
InboundLedger::fcHISTORY);
}
}
}
catch (std::exception const&)
{
JLOG(m_journal.warn()) <<
"Threw while prefetching";
}
}
}
else
{
JLOG(m_journal.fatal()) <<
"Can't find ledger following prevMissing " <<
missing;
JLOG(m_journal.fatal()) << "Pub:" <<
mPubLedgerSeq << " Val:" << mValidLedgerSeq;
JLOG(m_journal.fatal()) << "Ledgers: " <<
app_.getLedgerMaster().getCompleteLedgers();
clearLedger(missing + 1);
progress = true;
}
}
if (mValidLedgerSeq != mPubLedgerSeq)
{
JLOG(m_journal.debug()) <<
"tryAdvance found last valid changed";
progress = true;
}
JLOG(m_journal.trace()) <<
"advanceThread should acquire";
}
else
missing = boost::none;
}
if (! missing && mFillInProgress == 0)
{
if (auto shardStore = app_.getShardStore())
{
missing = shardStore->prepare(mValidLedgerSeq);
if (missing)
reason = InboundLedger::Reason::SHARD;
}
}
if(missing)
{
fetchForHistory(*missing, progress, reason);
if (mValidLedgerSeq != mPubLedgerSeq)
{
JLOG (m_journal.debug()) <<
"tryAdvance found last valid changed";
progress = true;
}
}
}
else
{
mHistLedger.reset();
mShardLedger.reset();
JLOG (m_journal.trace()) <<
"tryAdvance not fetching history";
}
@@ -1687,11 +1713,7 @@ void LedgerMaster::doAdvance (ScopedLockType& sl)
ScopedUnlockType sul (m_mutex);
JLOG (m_journal.debug()) <<
"tryAdvance publishing seq " << ledger->info().seq;
setFullLedger(
ledger,
true,
true);
setFullLedger(ledger, true, true);
}
setPubLedger(ledger);

View File

@@ -79,6 +79,7 @@ private:
TreeNodeCache treecache_;
FullBelowCache fullbelow_;
NodeStore::Database& db_;
bool const shardBacked_;
beast::Journal j_;
// missing node handler
@@ -97,7 +98,9 @@ private:
"Missing node in " << to_string (hash);
app_.getInboundLedgers ().acquire (
hash, seq, InboundLedger::fcGENERIC);
hash, seq, shardBacked_ ?
InboundLedger::Reason::SHARD :
InboundLedger::Reason::GENERIC);
}
}
@@ -114,6 +117,8 @@ public:
collectorManager.collector(),
fullBelowTargetSize, fullBelowExpirationSeconds)
, db_ (db)
, shardBacked_ (
dynamic_cast<NodeStore::DatabaseShard*>(&db) != nullptr)
, j_ (app.journal("SHAMap"))
{
}
@@ -160,6 +165,12 @@ public:
return db_;
}
bool
isShardBacked() const override
{
return shardBacked_;
}
void
missing_node (std::uint32_t seq) override
{
@@ -200,9 +211,20 @@ public:
}
void
missing_node (uint256 const& hash) override
missing_node (uint256 const& hash, std::uint32_t seq) override
{
acquire (hash, 0);
acquire (hash, seq);
}
void
reset () override
{
{
std::lock_guard<std::mutex> l(maxSeqLock);
maxSeq = 0;
}
fullbelow_.reset();
treecache_.reset();
}
};
@@ -308,7 +330,9 @@ public:
// These are Stoppable-related
std::unique_ptr <JobQueue> m_jobQueue;
std::unique_ptr <NodeStore::Database> m_nodeStore;
std::unique_ptr <NodeStore::DatabaseShard> shardStore_;
detail::AppFamily family_;
std::unique_ptr <detail::AppFamily> sFamily_;
// VFALCO TODO Make OrderBookDB abstract
OrderBookDB m_orderBookDB;
std::unique_ptr <PathRequests> m_pathRequests;
@@ -381,9 +405,8 @@ public:
, m_nodeStoreScheduler (*this)
, m_shaMapStore (make_SHAMapStore (*this, setup_SHAMapStore (*config_),
*this, m_nodeStoreScheduler,
logs_->journal ("SHAMapStore"), logs_->journal ("NodeObject"),
m_txMaster, *config_))
*this, m_nodeStoreScheduler, logs_->journal("SHAMapStore"),
logs_->journal("NodeObject"), m_txMaster, *config_))
, accountIDCache_(128000)
@@ -411,6 +434,9 @@ public:
, m_nodeStore (
m_shaMapStore->makeDatabase ("NodeStore.main", 4, *m_jobQueue))
, shardStore_ (
m_shaMapStore->makeDatabaseShard ("ShardStore", 4, *m_jobQueue))
, family_ (*this, *m_nodeStore, *m_collectorManager)
, m_orderBookDB (*this, *m_jobQueue)
@@ -493,6 +519,9 @@ public:
, m_io_latency_sampler (m_collectorManager->collector()->make_event ("ios_latency"),
logs_->journal("Application"), std::chrono::milliseconds (100), get_io_service())
{
if (shardStore_)
sFamily_ = std::make_unique<detail::AppFamily>(
*this, *shardStore_, *m_collectorManager);
add (m_resourceManager.get ());
//
@@ -546,12 +575,16 @@ public:
return *m_collectorManager;
}
Family&
family() override
Family& family() override
{
return family_;
}
Family* shardFamily() override
{
return sFamily_.get();
}
TimeKeeper&
timeKeeper() override
{
@@ -632,6 +665,11 @@ public:
return *m_nodeStore;
}
NodeStore::DatabaseShard* getShardStore () override
{
return shardStore_.get();
}
Application::MutexType& getMasterMutex () override
{
return m_masterMutex;
@@ -988,15 +1026,21 @@ public:
// VFALCO TODO fix the dependency inversion using an observer,
// have listeners register for "onSweep ()" notification.
family().fullbelow().sweep ();
family().fullbelow().sweep();
if (sFamily_)
sFamily_->fullbelow().sweep();
getMasterTransaction().sweep();
getNodeStore().sweep();
if (shardStore_)
shardStore_->sweep();
getLedgerMaster().sweep();
getTempNodeCache().sweep();
getValidations().expire();
getInboundLedgers().sweep();
m_acceptedLedgerCache.sweep();
family().treecache().sweep();
if (sFamily_)
sFamily_->treecache().sweep();
cachedSLEs_.expire();
// Set timer to do another sweep later.
@@ -1017,6 +1061,7 @@ private:
void addTxnSeqField();
void addValidationSeqFields();
bool updateTables ();
bool validateShards ();
void startGenesisLedger ();
std::shared_ptr<Ledger>
@@ -1199,6 +1244,13 @@ bool ApplicationImp::setup()
m_ledgerMaster->tune (config_->getSize (siLedgerSize), config_->getSize (siLedgerAge));
family().treecache().setTargetSize (config_->getSize (siTreeCacheSize));
family().treecache().setTargetAge (config_->getSize (siTreeCacheAge));
if (shardStore_)
{
shardStore_->tune(config_->getSize(siNodeCacheSize),
config_->getSize(siNodeCacheAge));
sFamily_->treecache().setTargetSize(config_->getSize(siTreeCacheSize));
sFamily_->treecache().setTargetAge(config_->getSize(siTreeCacheAge));
}
//----------------------------------------------------------------------
//
@@ -1216,6 +1268,9 @@ bool ApplicationImp::setup()
*config_);
add (*m_overlay); // add to PropertyStream
if (config_->valShards && !validateShards())
return false;
validatorSites_->start ();
// start first consensus round
@@ -1624,7 +1679,7 @@ bool ApplicationImp::loadOldLedger (
{
// Try to build the ledger from the back end
auto il = std::make_shared <InboundLedger> (
*this, hash, 0, InboundLedger::fcGENERIC,
*this, hash, 0, InboundLedger::Reason::GENERIC,
stopwatch());
if (il->checkLocal ())
loadLedger = il->getLedger ();
@@ -1664,7 +1719,7 @@ bool ApplicationImp::loadOldLedger (
// Try to build the ledger from the back end
auto il = std::make_shared <InboundLedger> (
*this, replayLedger->info().parentHash,
0, InboundLedger::fcGENERIC, stopwatch());
0, InboundLedger::Reason::GENERIC, stopwatch());
if (il->checkLocal ())
loadLedger = il->getLedger ();
@@ -2008,6 +2063,32 @@ bool ApplicationImp::updateTables ()
return true;
}
bool ApplicationImp::validateShards()
{
if (!m_overlay)
Throw<std::runtime_error>("no overlay");
if(config_->standalone())
{
JLOG(m_journal.fatal()) <<
"Shard validation cannot be run in standalone";
return false;
}
if (config_->section(ConfigSection::shardDatabase()).empty())
{
JLOG (m_journal.fatal()) <<
"The [shard_db] configuration setting must be set";
return false;
}
if (!shardStore_)
{
JLOG(m_journal.fatal()) <<
"Invalid [shard_db] configuration";
return false;
}
shardStore_->validate();
return true;
}
void ApplicationImp::setMaxDisallowedLedger()
{
boost::optional <LedgerIndex> seq;

View File

@@ -35,7 +35,7 @@ namespace ripple {
namespace unl { class Manager; }
namespace Resource { class Manager; }
namespace NodeStore { class Database; }
namespace NodeStore { class Database; class DatabaseShard; }
// VFALCO TODO Fix forward declares required for header dependency loops
class AmendmentTable;
@@ -116,32 +116,41 @@ public:
// ---
//
virtual Logs& logs() = 0;
virtual Config& config() = 0;
virtual boost::asio::io_service& getIOService () = 0;
virtual CollectorManager& getCollectorManager () = 0;
virtual Family& family() = 0;
virtual TimeKeeper& timeKeeper() = 0;
virtual JobQueue& getJobQueue () = 0;
virtual NodeCache& getTempNodeCache () = 0;
virtual CachedSLEs& cachedSLEs() = 0;
virtual AmendmentTable& getAmendmentTable() = 0;
virtual HashRouter& getHashRouter () = 0;
virtual LoadFeeTrack& getFeeTrack () = 0;
virtual LoadManager& getLoadManager () = 0;
virtual Overlay& overlay () = 0;
virtual TxQ& getTxQ() = 0;
virtual ValidatorList& validators () = 0;
virtual ValidatorSite& validatorSites () = 0;
virtual ManifestCache& validatorManifests () = 0;
virtual ManifestCache& publisherManifests () = 0;
virtual Cluster& cluster () = 0;
virtual RCLValidations& getValidations () = 0;
virtual NodeStore::Database& getNodeStore () = 0;
virtual InboundLedgers& getInboundLedgers () = 0;
virtual InboundTransactions& getInboundTransactions () = 0;
virtual TaggedCache <uint256, AcceptedLedger>&
getAcceptedLedgerCache () = 0;
virtual Logs& logs() = 0;
virtual Config& config() = 0;
virtual
boost::asio::io_service&
getIOService () = 0;
virtual CollectorManager& getCollectorManager () = 0;
virtual Family& family() = 0;
virtual Family* shardFamily() = 0;
virtual TimeKeeper& timeKeeper() = 0;
virtual JobQueue& getJobQueue () = 0;
virtual NodeCache& getTempNodeCache () = 0;
virtual CachedSLEs& cachedSLEs() = 0;
virtual AmendmentTable& getAmendmentTable() = 0;
virtual HashRouter& getHashRouter () = 0;
virtual LoadFeeTrack& getFeeTrack () = 0;
virtual LoadManager& getLoadManager () = 0;
virtual Overlay& overlay () = 0;
virtual TxQ& getTxQ() = 0;
virtual ValidatorList& validators () = 0;
virtual ValidatorSite& validatorSites () = 0;
virtual ManifestCache& validatorManifests () = 0;
virtual ManifestCache& publisherManifests () = 0;
virtual Cluster& cluster () = 0;
virtual RCLValidations& getValidations () = 0;
virtual NodeStore::Database& getNodeStore () = 0;
virtual NodeStore::DatabaseShard* getShardStore() = 0;
virtual InboundLedgers& getInboundLedgers () = 0;
virtual InboundTransactions& getInboundTransactions () = 0;
virtual
TaggedCache <uint256, AcceptedLedger>&
getAcceptedLedgerCache () = 0;
virtual LedgerMaster& getLedgerMaster () = 0;
virtual NetworkOPs& getOPs () = 0;
virtual OrderBookDB& getOrderBookDB () = 0;
@@ -162,10 +171,12 @@ public:
virtual AccountIDCache const& accountIDCache() const = 0;
virtual OpenLedger& openLedger() = 0;
virtual OpenLedger const& openLedger() const = 0;
virtual DatabaseCon& getTxnDB () = 0;
virtual DatabaseCon& getLedgerDB () = 0;
virtual DatabaseCon& getTxnDB () = 0;
virtual DatabaseCon& getLedgerDB () = 0;
virtual std::chrono::milliseconds getIOLatency () = 0;
virtual
std::chrono::milliseconds
getIOLatency () = 0;
virtual bool serverOkay (std::string& reason) = 0;

View File

@@ -270,6 +270,12 @@ int run (int argc, char** argv)
importText += ConfigSection::nodeDatabase ();
importText += "] configuration file section).";
}
std::string shardsText;
{
shardsText += "Validate an existing shard database (specified in the [";
shardsText += ConfigSection::shardDatabase();
shardsText += "] configuration file section).";
}
// Set up option parsing.
//
@@ -303,6 +309,7 @@ int run (int argc, char** argv)
("debug", "Enable normally suppressed debug logging")
("fg", "Run in the foreground.")
("import", importText.c_str ())
("shards", shardsText.c_str ())
("version", "Display the build version.")
;
@@ -402,6 +409,9 @@ int run (int argc, char** argv)
if (vm.count ("import"))
config->doImport = true;
if (vm.count ("shards"))
config->valShards = true;
if (vm.count ("ledger"))
{
config->START_LEDGER = vm["ledger"].as<std::string> ();

View File

@@ -1368,7 +1368,7 @@ bool NetworkOPsImp::checkLastClosedLedger (
if (!consensus)
consensus = app_.getInboundLedgers().acquire (
closedLedger, 0, InboundLedger::fcCONSENSUS);
closedLedger, 0, InboundLedger::Reason::CONSENSUS);
if (consensus &&
! m_ledgerMaster.isCompatible (*consensus, m_journal.debug(),

View File

@@ -48,6 +48,7 @@ public:
std::uint32_t deleteBatch = 100;
std::uint32_t backOff = 100;
std::int32_t ageThreshold = 60;
Section shardDatabase;
};
SHAMapStore (Stoppable& parent) : Stoppable ("SHAMapStore", parent) {}
@@ -63,6 +64,10 @@ public:
std::string const& name,
std::int32_t readThreads, Stoppable& parent) = 0;
virtual std::unique_ptr <NodeStore::DatabaseShard> makeDatabaseShard(
std::string const& name, std::int32_t readThreads,
Stoppable& parent) = 0;
/** Highest ledger that may be deleted. */
virtual LedgerIndex setCanDelete (LedgerIndex canDelete) = 0;

View File

@@ -19,11 +19,13 @@
#include <BeastConfig.h>
#include <ripple/app/misc/SHAMapStoreImp.h>
#include <ripple/app/ledger/TransactionMaster.h>
#include <ripple/app/misc/NetworkOPs.h>
#include <ripple/core/ConfigSections.h>
#include <ripple/app/misc/SHAMapStoreImp.h>
#include <ripple/beast/core/CurrentThreadName.h>
#include <ripple/core/ConfigSections.h>
#include <ripple/nodestore/impl/DatabaseRotatingImp.h>
#include <ripple/nodestore/impl/DatabaseShardImp.h>
namespace ripple {
void SHAMapStoreImp::SavedStateDB::init (BasicConfig const& config,
@@ -199,6 +201,41 @@ SHAMapStoreImp::SHAMapStoreImp (
dbPaths();
}
if (! setup_.shardDatabase.empty())
{
boost::filesystem::path dbPath =
get<std::string>(setup_.shardDatabase, "path");
if (dbPath.empty())
Throw<std::runtime_error>("shard path missing");
if (boost::filesystem::exists(dbPath))
{
if (! boost::filesystem::is_directory(dbPath))
Throw<std::runtime_error>("shard db path must be a directory.");
}
else
boost::filesystem::create_directories(dbPath);
auto const maxDiskSpace = get<std::uint64_t>(
setup_.shardDatabase, "max_size_gb", 0);
// Must be large enough for one shard
if (maxDiskSpace < 3)
Throw<std::runtime_error>("max_size_gb too small");
if ((maxDiskSpace << 30) < maxDiskSpace)
Throw<std::runtime_error>("overflow max_size_gb");
std::uint32_t lps;
if (get_if_exists<std::uint32_t>(
setup_.shardDatabase, "ledgers_per_shard", lps))
{
// ledgers_per_shard to be set only in standalone for testing
if (! setup_.standalone)
Throw<std::runtime_error>(
"ledgers_per_shard only honored in stand alone");
if (lps == 0 || lps % 256 != 0)
Throw<std::runtime_error>(
"ledgers_per_shard must be a multiple of 256");
}
}
}
std::unique_ptr <NodeStore::Database>
@@ -206,39 +243,51 @@ SHAMapStoreImp::makeDatabase (std::string const& name,
std::int32_t readThreads, Stoppable& parent)
{
std::unique_ptr <NodeStore::Database> db;
if (setup_.deleteInterval)
{
SavedState state = state_db_.getState();
std::shared_ptr <NodeStore::Backend> writableBackend (
makeBackendRotating (state.writableDb));
std::shared_ptr <NodeStore::Backend> archiveBackend (
makeBackendRotating (state.archiveDb));
fdlimit_ = writableBackend->fdlimit() + archiveBackend->fdlimit();
std::unique_ptr <NodeStore::DatabaseRotating> dbr =
makeDatabaseRotating (name, readThreads, parent,
writableBackend, archiveBackend);
if (!state.writableDb.size())
auto writableBackend = makeBackendRotating(state.writableDb);
auto archiveBackend = makeBackendRotating(state.archiveDb);
if (! state.writableDb.size())
{
state.writableDb = writableBackend->getName();
state.archiveDb = archiveBackend->getName();
state_db_.setState (state);
}
database_ = dbr.get();
db.reset (dynamic_cast <NodeStore::Database*>(dbr.release()));
// Create NodeStore with two backends to allow online deletion of data
auto dbr = std::make_unique<NodeStore::DatabaseRotatingImp>(
"NodeStore.main", scheduler_, readThreads, parent,
std::move(writableBackend), std::move(archiveBackend),
nodeStoreJournal_);
fdlimit_ += dbr->fdlimit();
dbRotating_ = dbr.get();
db.reset(dynamic_cast<NodeStore::Database*>(dbr.release()));
}
else
{
db = NodeStore::Manager::instance().make_Database (name, scheduler_,
readThreads, parent, setup_.nodeDatabase, nodeStoreJournal_);
fdlimit_ = db->fdlimit();
fdlimit_ += db->fdlimit();
}
return db;
}
// Create the shard store if a [shard_db] section was configured.
// Returns nullptr when shards are not configured or when the shard
// database fails to initialize; callers must handle a null result.
std::unique_ptr<NodeStore::DatabaseShard>
SHAMapStoreImp::makeDatabaseShard(std::string const& name,
    std::int32_t readThreads, Stoppable& parent)
{
    std::unique_ptr<NodeStore::DatabaseShard> db;
    if(! setup_.shardDatabase.empty())
    {
        db = std::make_unique<NodeStore::DatabaseShardImp>(
            app_, name, parent, scheduler_, readThreads,
            setup_.shardDatabase, app_.journal("ShardStore"));
        if (db->init())
            // Account for the file descriptors the shard store will use.
            fdlimit_ += db->fdlimit();
        else
            // Initialization failed: discard the store and return nullptr.
            db.reset();
    }
    return db;
}
@@ -277,8 +326,8 @@ bool
SHAMapStoreImp::copyNode (std::uint64_t& nodeCount,
SHAMapAbstractNode const& node)
{
// Copy a single record from node to database_
database_->fetchNode (node.getNodeHash().as_uint256());
// Copy a single record from node to dbRotating_
dbRotating_->fetch(node.getNodeHash().as_uint256(), node.getSeq());
if (! (++nodeCount % checkHealthInterval_))
{
if (health())
@@ -399,11 +448,9 @@ SHAMapStoreImp::run()
;
}
std::shared_ptr <NodeStore::Backend> newBackend =
makeBackendRotating();
auto newBackend = makeBackendRotating();
JLOG(journal_.debug()) << validatedSeq << " new backend "
<< newBackend->getName();
std::shared_ptr <NodeStore::Backend> oldBackend;
clearCaches (validatedSeq);
switch (health())
@@ -419,15 +466,17 @@ SHAMapStoreImp::run()
}
std::string nextArchiveDir =
database_->getWritableBackend()->getName();
dbRotating_->getWritableBackend()->getName();
lastRotated = validatedSeq;
std::unique_ptr<NodeStore::Backend> oldBackend;
{
std::lock_guard <std::mutex> lock (database_->peekMutex());
std::lock_guard <std::mutex> lock (dbRotating_->peekMutex());
state_db_.setState (SavedState {newBackend->getName(),
nextArchiveDir, lastRotated});
clearCaches (validatedSeq);
oldBackend = database_->rotateBackends (newBackend);
oldBackend = dbRotating_->rotateBackends(
std::move(newBackend));
}
JLOG(journal_.debug()) << "finished rotation " << validatedSeq;
@@ -498,7 +547,7 @@ SHAMapStoreImp::dbPaths()
}
}
std::shared_ptr <NodeStore::Backend>
std::unique_ptr <NodeStore::Backend>
SHAMapStoreImp::makeBackendRotating (std::string path)
{
boost::filesystem::path newPath;
@@ -517,19 +566,10 @@ SHAMapStoreImp::makeBackendRotating (std::string path)
}
parameters.set("path", newPath.string());
return NodeStore::Manager::instance().make_Backend (parameters, scheduler_,
nodeStoreJournal_);
}
std::unique_ptr <NodeStore::DatabaseRotating>
SHAMapStoreImp::makeDatabaseRotating (std::string const& name,
std::int32_t readThreads, Stoppable& parent,
std::shared_ptr <NodeStore::Backend> writableBackend,
std::shared_ptr <NodeStore::Backend> archiveBackend) const
{
return NodeStore::Manager::instance().make_DatabaseRotating (
name, scheduler_, readThreads, parent,
writableBackend, archiveBackend, nodeStoreJournal_);
auto backend {NodeStore::Manager::instance().make_Backend(
parameters, scheduler_, nodeStoreJournal_)};
backend->open();
return backend;
}
bool
@@ -583,7 +623,7 @@ SHAMapStoreImp::clearCaches (LedgerIndex validatedSeq)
void
SHAMapStoreImp::freshenCaches()
{
if (freshenCache (database_->getPositiveCache()))
if (freshenCache (dbRotating_->getPositiveCache()))
return;
if (freshenCache (*treeNodeCache_))
return;
@@ -825,12 +865,13 @@ setup_SHAMapStore (Config const& c)
get_if_exists (setup.nodeDatabase, "backOff", setup.backOff);
get_if_exists (setup.nodeDatabase, "age_threshold", setup.ageThreshold);
setup.shardDatabase = c.section(ConfigSection::shardDatabase());
return setup;
}
std::unique_ptr<SHAMapStore>
make_SHAMapStore (Application& app,
SHAMapStore::Setup const& s,
SHAMapStore::Setup const& setup,
Stoppable& parent,
NodeStore::Scheduler& scheduler,
beast::Journal journal,
@@ -838,9 +879,8 @@ make_SHAMapStore (Application& app,
TransactionMaster& transactionMaster,
BasicConfig const& config)
{
return std::make_unique<SHAMapStoreImp>(app, s, parent, scheduler,
journal, nodeStoreJournal, transactionMaster,
config);
return std::make_unique<SHAMapStoreImp>(app, setup, parent, scheduler,
journal, nodeStoreJournal, transactionMaster, config);
}
}

View File

@@ -27,7 +27,6 @@
#include <condition_variable>
#include <thread>
namespace ripple {
class NetworkOPs;
@@ -87,7 +86,7 @@ private:
NodeStore::Scheduler& scheduler_;
beast::Journal journal_;
beast::Journal nodeStoreJournal_;
NodeStore::DatabaseRotating* database_ = nullptr;
NodeStore::DatabaseRotating* dbRotating_ = nullptr;
SavedStateDB state_db_;
std::thread thread_;
bool stop_ = false;
@@ -136,6 +135,10 @@ public:
std::string const&name,
std::int32_t readThreads, Stoppable& parent) override;
std::unique_ptr <NodeStore::DatabaseShard>
makeDatabaseShard(std::string const& name,
std::int32_t readThreads, Stoppable& parent) override;
LedgerIndex
setCanDelete (LedgerIndex seq) override
{
@@ -176,24 +179,9 @@ private:
bool copyNode (std::uint64_t& nodeCount, SHAMapAbstractNode const &node);
void run();
void dbPaths();
std::shared_ptr <NodeStore::Backend> makeBackendRotating (
std::string path = std::string());
/**
* Creates a NodeStore with two
* backends to allow online deletion of data.
*
* @param name A diagnostic label for the database.
* @param readThreads The number of async read threads to create
* @param writableBackend backend for writing
* @param archiveBackend backend for archiving
*
* @return The opened database.
*/
std::unique_ptr <NodeStore::DatabaseRotating>
makeDatabaseRotating (std::string const&name,
std::int32_t readThreads, Stoppable& parent,
std::shared_ptr <NodeStore::Backend> writableBackend,
std::shared_ptr <NodeStore::Backend> archiveBackend) const;
std::unique_ptr<NodeStore::Backend>
makeBackendRotating (std::string path = std::string());
template <class CacheInstance>
bool
@@ -203,7 +191,7 @@ private:
for (auto const& key: cache.getKeys())
{
database_->fetchNode (key);
dbRotating_->fetch(key, 0);
if (! (++check % checkHealthInterval_) && health())
return true;
}

View File

@@ -155,6 +155,14 @@ public:
m_map.clear ();
}
void reset ()
{
lock_guard lock(m_mutex);
m_map.clear();
m_stats.hits = 0;
m_stats.misses = 0;
}
void setTargetSize (size_type s)
{
lock_guard lock (m_mutex);

View File

@@ -146,13 +146,6 @@ public:
return m_hits * (100.0f / std::max (1.0f, total));
}
void clearStats ()
{
lock_guard lock (m_mutex);
m_hits = 0;
m_misses = 0;
}
void clear ()
{
lock_guard lock (m_mutex);
@@ -160,6 +153,15 @@ public:
m_cache_count = 0;
}
void reset ()
{
lock_guard lock (m_mutex);
m_cache.clear();
m_cache_count = 0;
m_hits = 0;
m_misses = 0;
}
void sweep ()
{
int cacheRemovals = 0;
@@ -476,7 +478,7 @@ public:
return m_mutex;
}
std::vector <key_type> getKeys ()
std::vector <key_type> getKeys () const
{
std::vector <key_type> v;

View File

@@ -110,6 +110,7 @@ private:
public:
bool doImport = false;
bool valShards = false;
bool ELB_SUPPORT = false;
std::vector<std::string> IPS; // Peer IPs from rippled.cfg.

View File

@@ -28,6 +28,7 @@ namespace ripple {
struct ConfigSection
{
static std::string nodeDatabase () { return "node_db"; }
static std::string shardDatabase () { return "shard_db"; }
static std::string importNodeDatabase () { return "import_db"; }
};

View File

@@ -50,6 +50,11 @@ public:
*/
virtual std::string getName() = 0;
/** Open the backend.
This allows the caller to catch exceptions.
*/
virtual void open() = 0;
/** Close the backend.
This allows the caller to catch exceptions.
*/

View File

@@ -1,7 +1,7 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2013 Ripple Labs Inc.
Copyright (c) 2012, 2017 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
@@ -21,11 +21,19 @@
#define RIPPLE_NODESTORE_DATABASE_H_INCLUDED
#include <ripple/basics/TaggedCache.h>
#include <ripple/basics/KeyCache.h>
#include <ripple/core/Stoppable.h>
#include <ripple/nodestore/NodeObject.h>
#include <ripple/nodestore/Backend.h>
#include <ripple/nodestore/impl/Tuning.h>
#include <ripple/nodestore/Scheduler.h>
#include <ripple/nodestore/NodeObject.h>
#include <thread>
namespace ripple {
class Ledger;
namespace NodeStore {
/** Persistency layer for NodeObject
@@ -50,22 +58,56 @@ public:
@param name The Stoppable name for this Database.
@param parent The parent Stoppable.
@param scheduler The scheduler to use for performing asynchronous tasks.
@param readThreads The number of async read threads to create.
@param journal Destination for logging output.
*/
Database (std::string name, Stoppable& parent)
: Stoppable (std::move (name), parent)
{ }
Database(std::string name, Stoppable& parent, Scheduler& scheduler,
int readThreads, beast::Journal j);
/** Destroy the node store.
All pending operations are completed, pending writes flushed,
and files closed before this returns.
*/
virtual ~Database() = default;
virtual
~Database();
/** Retrieve the name associated with this backend.
This is used for diagnostics and may not reflect the actual path
or paths used by the underlying backend.
*/
virtual std::string getName () const = 0;
virtual
std::string
getName() const = 0;
/** Import objects from another database. */
virtual
void
import(Database& source) = 0;
/** Retrieve the estimated number of pending write operations.
This is used for diagnostics.
*/
virtual
std::int32_t
getWriteLoad() const = 0;
/** Store the object.
The caller's Blob parameter is overwritten.
@param type The type of object.
@param data The payload of the object. The caller's
variable is overwritten.
@param hash The 256-bit hash of the payload data.
@param seq The sequence of the ledger the object belongs to.
*/
virtual
void
store(NodeObjectType type, Blob&& data,
uint256 const& hash, std::uint32_t seq) = 0;
/** Fetch an object.
If the object is known to be not in the database, isn't found in the
@@ -74,9 +116,12 @@ public:
@note This can be called concurrently.
@param hash The key of the object to retrieve.
@param seq The sequence of the ledger where the object is stored.
@return The object, or nullptr if it couldn't be retrieved.
*/
virtual std::shared_ptr<NodeObject> fetch (uint256 const& hash) = 0;
virtual
std::shared_ptr<NodeObject>
fetch(uint256 const& hash, std::uint32_t seq) = 0;
/** Fetch an object without waiting.
If I/O is required to determine whether or not the object is present,
@@ -86,35 +131,143 @@ public:
@note This can be called concurrently.
@param hash The key of the object to retrieve
@param seq The sequence of the ledger where the object is stored.
@param object The object retrieved
@return Whether the operation completed
*/
virtual bool asyncFetch (uint256 const& hash, std::shared_ptr<NodeObject>& object) = 0;
virtual
bool
asyncFetch(uint256 const& hash, std::uint32_t seq,
std::shared_ptr<NodeObject>& object) = 0;
/** Copies a ledger stored in a different database to this one.
@param ledger The ledger to copy.
@return true if the operation was successful
*/
virtual
bool
copyLedger(std::shared_ptr<Ledger const> const& ledger) = 0;
/** Wait for all currently pending async reads to complete.
*/
virtual void waitReads () = 0;
void
waitReads();
/** Get the maximum number of async reads the node store prefers.
@param seq A ledger sequence specifying a shard to query.
@return The number of async reads preferred.
@note The sequence is only used with the shard store.
*/
virtual int getDesiredAsyncReadCount () = 0;
virtual
int
getDesiredAsyncReadCount(std::uint32_t seq) = 0;
/** Store the object.
/** Get the positive cache hits to total attempts ratio. */
virtual
float
getCacheHitRate() = 0;
The caller's Blob parameter is overwritten.
/** Set the maximum number of entries and maximum cache age for both caches.
@param type The type of object.
@param ledgerIndex The ledger in which the object appears.
@param data The payload of the object. The caller's
variable is overwritten.
@param hash The 256-bit hash of the payload data.
@return `true` if the object was stored?
@param size Number of cache entries (0 = ignore)
@param age Maximum cache age in seconds
*/
virtual void store (NodeObjectType type,
Blob&& data,
uint256 const& hash) = 0;
virtual
void
tune(int size, int age) = 0;
/** Remove expired entries from the positive and negative caches. */
virtual
void
sweep() = 0;
/** Gather statistics pertaining to read and write activities.
@return The total read and written bytes.
*/
std::uint32_t
getStoreCount() const { return storeCount_; }
std::uint32_t
getFetchTotalCount() const { return fetchTotalCount_; }
std::uint32_t
getFetchHitCount() const { return fetchHitCount_; }
std::uint32_t
getStoreSize() const { return storeSz_; }
std::uint32_t
getFetchSize() const { return fetchSz_; }
/** Return the number of files needed by our backend(s) */
int
fdlimit() const { return fdLimit_; }
void
onStop();
protected:
beast::Journal j_;
Scheduler& scheduler_;
int fdLimit_ {0};
void
stopThreads();
void
storeStats(size_t sz)
{
++storeCount_;
storeSz_ += sz;
}
void
asyncFetch(uint256 const& hash, std::uint32_t seq,
std::shared_ptr<TaggedCache<uint256, NodeObject>> const& pCache,
std::shared_ptr<KeyCache<uint256>> const& nCache);
std::shared_ptr<NodeObject>
fetchInternal(uint256 const& hash, Backend& backend);
void
importInternal(Database& source, Backend& dest);
std::shared_ptr<NodeObject>
doFetch(uint256 const& hash, std::uint32_t seq,
std::shared_ptr<TaggedCache<uint256, NodeObject>> const& pCache,
std::shared_ptr<KeyCache<uint256>> const& nCache, bool isAsync);
private:
std::atomic<std::uint32_t> storeCount_ {0};
std::atomic<std::uint32_t> fetchTotalCount_ {0};
std::atomic<std::uint32_t> fetchHitCount_ {0};
std::atomic<std::uint32_t> storeSz_ {0};
std::atomic<std::uint32_t> fetchSz_ {0};
std::mutex readLock_;
std::condition_variable readCondVar_;
std::condition_variable readGenCondVar_;
// reads to do
std::map<uint256, std::tuple<std::uint32_t,
std::weak_ptr<TaggedCache<uint256, NodeObject>>,
std::weak_ptr<KeyCache<uint256>>>> read_;
// last read
uint256 readLastHash_;
std::vector<std::thread> readThreads_;
bool readShut_ {false};
// current read generation
uint64_t readGen_ {0};
virtual
std::shared_ptr<NodeObject>
fetchFrom(uint256 const& hash, std::uint32_t seq) = 0;
/** Visit every object in the database
This is usually called during import.
@@ -123,40 +276,12 @@ public:
or other methods.
@see import
*/
virtual void for_each(std::function <void(std::shared_ptr<NodeObject>)> f) = 0;
virtual
void
for_each(std::function <void(std::shared_ptr<NodeObject>)> f) = 0;
/** Import objects from another database. */
virtual void import (Database& source) = 0;
/** Retrieve the estimated number of pending write operations.
This is used for diagnostics.
*/
virtual std::int32_t getWriteLoad() const = 0;
/** Get the positive cache hits to total attempts ratio. */
virtual float getCacheHitRate () = 0;
/** Set the maximum number of entries and maximum cache age for both caches.
@param size Number of cache entries (0 = ignore)
@param age Maximum cache age in seconds
*/
virtual void tune (int size, int age) = 0;
/** Remove expired entries from the positive and negative caches. */
virtual void sweep () = 0;
/** Gather statistics pertaining to read and write activities.
Return the reads and writes, and total read and written bytes.
*/
virtual std::uint32_t getStoreCount () const = 0;
virtual std::uint32_t getFetchTotalCount () const = 0;
virtual std::uint32_t getFetchHitCount () const = 0;
virtual std::uint32_t getStoreSize () const = 0;
virtual std::uint32_t getFetchSize () const = 0;
/** Return the number of files needed by our backend */
virtual int fdlimit() const = 0;
void
threadEntry();
};
}

View File

@@ -30,24 +30,27 @@ namespace NodeStore {
* rotated in. Old ones are rotated out and deleted.
*/
class DatabaseRotating
class DatabaseRotating : public Database
{
public:
virtual ~DatabaseRotating() = default;
DatabaseRotating(std::string const& name, Stoppable& parent,
Scheduler& scheduler, int readThreads, beast::Journal journal)
: Database(name, parent, scheduler, readThreads, journal)
{}
virtual TaggedCache <uint256, NodeObject>& getPositiveCache() = 0;
virtual
TaggedCache<uint256, NodeObject> const&
getPositiveCache() = 0;
virtual std::mutex& peekMutex() const = 0;
virtual std::shared_ptr <Backend> const& getWritableBackend() const = 0;
virtual
std::unique_ptr<Backend> const&
getWritableBackend() const = 0;
virtual std::shared_ptr <Backend> const& getArchiveBackend () const = 0;
virtual std::shared_ptr <Backend> rotateBackends (
std::shared_ptr <Backend> const& newBackend) = 0;
/** Ensure that node is in writableBackend */
virtual std::shared_ptr<NodeObject> fetchNode (uint256 const& hash) = 0;
virtual
std::unique_ptr<Backend>
rotateBackends(std::unique_ptr<Backend> newBackend) = 0;
};
}

View File

@@ -0,0 +1,177 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2017 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL , DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
#ifndef RIPPLE_NODESTORE_DATABASESHARD_H_INCLUDED
#define RIPPLE_NODESTORE_DATABASESHARD_H_INCLUDED
#include <ripple/nodestore/Database.h>
#include <ripple/app/ledger/Ledger.h>
#include <ripple/nodestore/Types.h>
#include <boost/optional.hpp>
#include <memory>
namespace ripple {
namespace NodeStore {
/** A collection of historical shards
*/
class DatabaseShard : public Database
{
public:
    /** Construct a shard store

        @param name The Stoppable name for this Database
        @param parent The parent Stoppable
        @param scheduler The scheduler to use for performing asynchronous tasks
        @param readThreads The number of async read threads to create
        @param config The configuration for the database; may override the
               number of ledgers per shard via "ledgers_per_shard"
        @param journal Destination for logging output
    */
    DatabaseShard(std::string const& name, Stoppable& parent,
        Scheduler& scheduler, int readThreads,
        Section const& config, beast::Journal journal)
        : Database(name, parent, scheduler, readThreads, journal)
    {
        // Optional override of the shard size (testing/stand-alone use).
        get_if_exists<std::uint32_t>(config, "ledgers_per_shard", lps_);
    }

    /** Initialize the database

        @return `true` if the database initialized without error
    */
    virtual
    bool
    init() = 0;

    /** Prepare to store a new ledger in the shard

        @param validLedgerSeq the index of the maximum valid ledgers
        @return if a ledger should be fetched and stored, then returns the ledger
                index of the ledger to request. Otherwise returns boost::none.
                Some reasons this may return boost::none are: this database does
                not store shards, all shards are stored and full, max allowed
                disk space would be exceeded, or a ledger was recently requested
                and not enough time has passed between requests.
        @implNote adds a new writable shard if necessary
    */
    virtual
    boost::optional<std::uint32_t>
    prepare(std::uint32_t validLedgerSeq) = 0;

    /** Fetch a ledger from the shard store

        @param hash The key of the ledger to retrieve
        @param seq The sequence of the ledger
        @return The ledger if found, nullptr otherwise
    */
    virtual
    std::shared_ptr<Ledger>
    fetchLedger(uint256 const& hash, std::uint32_t seq) = 0;

    /** Notifies the database that the given ledger has been
        fully acquired and stored.

        @param ledger The stored ledger to be marked as complete
    */
    virtual
    void
    setStored(std::shared_ptr<Ledger const> const& ledger) = 0;

    /** Query if a ledger with the given sequence is stored

        @param seq The ledger sequence to check if stored
        @return `true` if the ledger is stored
    */
    virtual
    bool
    contains(std::uint32_t seq) = 0;

    /** Query which complete shards are stored

        @return the indexes of complete shards
    */
    virtual
    std::string
    getCompleteShards() = 0;

    /** Verifies shard store data is valid.
    */
    virtual
    void
    validate() = 0;

    /** @return The number of ledgers stored in a shard
    */
    static
    std::uint32_t
    ledgersPerShard()
    {
        return lps_;
    }

    /** Calculates the shard index for a given ledger sequence

        @param seq ledger sequence
        @return The shard index of the ledger sequence
    */
    static
    std::uint32_t
    seqToShardIndex(std::uint32_t seq)
    {
        // Sequences below the genesis ledger do not belong to any shard.
        assert(seq >= genesisSeq);
        return (seq - 1) / lps_;
    }

    /** Calculates the first ledger sequence for a given shard index

        @param shardIndex The shard index considered
        @return The first ledger sequence pertaining to the shard index
    */
    static
    std::uint32_t
    firstSeq(std::uint32_t shardIndex)
    {
        return 1 + (shardIndex * lps_);
    }

    /** Calculates the last ledger sequence for a given shard index

        @param shardIndex The shard index considered
        @return The last ledger sequence pertaining to the shard index
    */
    static
    std::uint32_t
    lastSeq(std::uint32_t shardIndex)
    {
        return (shardIndex + 1) * lps_;
    }

protected:
    // The number of ledgers stored in a shard, default is 16384
    static std::uint32_t lps_;
};
}
}
#endif

View File

@@ -22,6 +22,7 @@
#include <ripple/nodestore/Factory.h>
#include <ripple/nodestore/DatabaseRotating.h>
#include <ripple/nodestore/DatabaseShard.h>
namespace ripple {
namespace NodeStore {
@@ -49,7 +50,9 @@ public:
@param name The name to match, performed case-insensitive.
@return `nullptr` if a match was not found.
*/
//virtual Factory* find (std::string const& name) const = 0;
virtual
Factory*
find(std::string const& name) = 0;
/** Create a backend. */
virtual
@@ -87,15 +90,6 @@ public:
int readThreads, Stoppable& parent,
Section const& backendParameters,
beast::Journal journal) = 0;
virtual
std::unique_ptr <DatabaseRotating>
make_DatabaseRotating (std::string const& name,
Scheduler& scheduler, std::int32_t readThreads,
Stoppable& parent,
std::shared_ptr <Backend> writableBackend,
std::shared_ptr <Backend> archiveBackend,
beast::Journal journal) = 0;
};
//------------------------------------------------------------------------------

View File

@@ -48,6 +48,10 @@ enum Status
/** A batch of NodeObjects to write at once. */
using Batch = std::vector <std::shared_ptr<NodeObject>>;
// System constant/invariant
static constexpr std::uint32_t genesisSeq {32570u};
}
}

View File

@@ -80,7 +80,7 @@ private:
std::string name_;
beast::Journal journal_;
MemoryDB* db_;
MemoryDB* db_ {nullptr};
public:
MemoryBackend (size_t keyBytes, Section const& keyValues,
@@ -90,7 +90,6 @@ public:
{
if (name_.empty())
Throw<std::runtime_error> ("Missing path in Memory backend");
db_ = &memoryFactory.open(name_);
}
~MemoryBackend ()
@@ -104,6 +103,12 @@ public:
return name_;
}
void
open() override
{
db_ = &memoryFactory.open(name_);
}
void
close() override
{
@@ -115,6 +120,7 @@ public:
Status
fetch (void const* key, std::shared_ptr<NodeObject>* pObject) override
{
assert(db_);
uint256 const hash (uint256::fromVoid (key));
std::lock_guard<std::mutex> _(db_->mutex);
@@ -145,6 +151,7 @@ public:
void
store (std::shared_ptr<NodeObject> const& object) override
{
assert(db_);
std::lock_guard<std::mutex> _(db_->mutex);
db_->table.emplace (object->getHash(), object);
}
@@ -159,6 +166,7 @@ public:
void
for_each (std::function <void(std::shared_ptr<NodeObject>)> f) override
{
assert(db_);
for (auto const& e : db_->table)
f (e.second);
}

View File

@@ -50,7 +50,7 @@ public:
currentType = 1
};
beast::Journal journal_;
beast::Journal j_;
size_t const keyBytes_;
std::string const name_;
nudb::store db_;
@@ -59,7 +59,7 @@ public:
NuDBBackend (int keyBytes, Section const& keyValues,
Scheduler& scheduler, beast::Journal journal)
: journal_ (journal)
: j_(journal)
, keyBytes_ (keyBytes)
, name_ (get<std::string>(keyValues, "path"))
, deletePath_(false)
@@ -68,33 +68,6 @@ public:
if (name_.empty())
Throw<std::runtime_error> (
"nodestore: Missing path in NuDB backend");
auto const folder = boost::filesystem::path (name_);
boost::filesystem::create_directories (folder);
auto const dp = (folder / "nudb.dat").string();
auto const kp = (folder / "nudb.key").string ();
auto const lp = (folder / "nudb.log").string ();
try
{
nudb::error_code ec;
nudb::create<nudb::xxhasher>(dp, kp, lp,
currentType, nudb::make_salt(), keyBytes,
nudb::block_size(kp), 0.50, ec);
if(ec == nudb::errc::file_exists)
ec = {};
if(ec)
Throw<nudb::system_error>(ec);
db_.open (dp, kp, lp, ec);
if(ec)
Throw<nudb::system_error>(ec);
if (db_.appnum() != currentType)
Throw<std::runtime_error> ("nodestore: unknown appnum");
}
catch (std::exception const& e)
{
// log and terminate?
std::cerr << e.what();
std::terminate();
}
}
~NuDBBackend ()
@@ -108,6 +81,37 @@ public:
return name_;
}
void
open() override
{
if (db_.is_open())
{
assert(false);
JLOG(j_.error()) <<
"database is already open";
return;
}
auto const folder = boost::filesystem::path(name_);
boost::filesystem::create_directories (folder);
auto const dp = (folder / "nudb.dat").string();
auto const kp = (folder / "nudb.key").string();
auto const lp = (folder / "nudb.log").string();
nudb::error_code ec;
nudb::create<nudb::xxhasher>(dp, kp, lp,
currentType, nudb::make_salt(), keyBytes_,
nudb::block_size(kp), 0.50, ec);
if(ec == nudb::errc::file_exists)
ec = {};
if(ec)
Throw<nudb::system_error>(ec);
db_.open (dp, kp, lp, ec);
if(ec)
Throw<nudb::system_error>(ec);
if (db_.appnum() != currentType)
Throw<std::runtime_error>(
"nodestore: unknown appnum");
}
void
close() override
{
@@ -197,7 +201,6 @@ public:
storeBatch (Batch const& batch) override
{
BatchWriteReport report;
EncodedBlob encoded;
report.writeCount = batch.size();
auto const start =
std::chrono::steady_clock::now();

View File

@@ -43,6 +43,11 @@ public:
return std::string ();
}
void
open() override
{
}
void
close() override
{

View File

@@ -100,6 +100,7 @@ public:
std::string m_name;
std::unique_ptr <rocksdb::DB> m_db;
int fdlimit_ = 2048;
rocksdb::Options m_options;
RocksDBBackend (int keyBytes, Section const& keyValues,
Scheduler& scheduler, beast::Journal journal, RocksDBEnv* env)
@@ -112,10 +113,9 @@ public:
if (! get_if_exists(keyValues, "path", m_name))
Throw<std::runtime_error> ("Missing path in RocksDBFactory backend");
rocksdb::Options options;
rocksdb::BlockBasedTableOptions table_options;
options.create_if_missing = true;
options.env = env;
m_options.create_if_missing = true;
m_options.env = env;
if (keyValues.exists ("cache_mb"))
table_options.block_cache = rocksdb::NewLRUCache (
@@ -128,39 +128,39 @@ public:
table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (v, filter_blocks));
}
if (get_if_exists (keyValues, "open_files", options.max_open_files))
fdlimit_ = options.max_open_files;
if (get_if_exists (keyValues, "open_files", m_options.max_open_files))
fdlimit_ = m_options.max_open_files;
if (keyValues.exists ("file_size_mb"))
{
options.target_file_size_base = 1024 * 1024 * get<int>(keyValues,"file_size_mb");
options.max_bytes_for_level_base = 5 * options.target_file_size_base;
options.write_buffer_size = 2 * options.target_file_size_base;
m_options.target_file_size_base = 1024 * 1024 * get<int>(keyValues,"file_size_mb");
m_options.max_bytes_for_level_base = 5 * m_options.target_file_size_base;
m_options.write_buffer_size = 2 * m_options.target_file_size_base;
}
get_if_exists (keyValues, "file_size_mult", options.target_file_size_multiplier);
get_if_exists (keyValues, "file_size_mult", m_options.target_file_size_multiplier);
if (keyValues.exists ("bg_threads"))
{
options.env->SetBackgroundThreads
m_options.env->SetBackgroundThreads
(get<int>(keyValues, "bg_threads"), rocksdb::Env::LOW);
}
if (keyValues.exists ("high_threads"))
{
auto const highThreads = get<int>(keyValues, "high_threads");
options.env->SetBackgroundThreads (highThreads, rocksdb::Env::HIGH);
m_options.env->SetBackgroundThreads (highThreads, rocksdb::Env::HIGH);
// If we have high-priority threads, presumably we want to
// use them for background flushes
if (highThreads > 0)
options.max_background_flushes = highThreads;
m_options.max_background_flushes = highThreads;
}
if (keyValues.exists ("compression") &&
(get<int>(keyValues, "compression") == 0))
{
options.compression = rocksdb::kNoCompression;
m_options.compression = rocksdb::kNoCompression;
}
get_if_exists (keyValues, "block_size", table_options.block_size);
@@ -168,10 +168,10 @@ public:
if (keyValues.exists ("universal_compaction") &&
(get<int>(keyValues, "universal_compaction") != 0))
{
options.compaction_style = rocksdb::kCompactionStyleUniversal;
options.min_write_buffer_number_to_merge = 2;
options.max_write_buffer_number = 6;
options.write_buffer_size = 6 * options.target_file_size_base;
m_options.compaction_style = rocksdb::kCompactionStyleUniversal;
m_options.min_write_buffer_number_to_merge = 2;
m_options.max_write_buffer_number = 6;
m_options.write_buffer_size = 6 * m_options.target_file_size_base;
}
if (keyValues.exists("bbt_options"))
@@ -185,28 +185,20 @@ public:
std::string("Unable to set RocksDB bbt_options: ") + s.ToString());
}
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
m_options.table_factory.reset(NewBlockBasedTableFactory(table_options));
if (keyValues.exists("options"))
{
auto const s = rocksdb::GetOptionsFromString(
options, get<std::string>(keyValues, "options"), &options);
m_options, get<std::string>(keyValues, "options"), &m_options);
if (! s.ok())
Throw<std::runtime_error> (
std::string("Unable to set RocksDB options: ") + s.ToString());
}
rocksdb::DB* db = nullptr;
rocksdb::Status status = rocksdb::DB::Open (options, m_name, &db);
if (! status.ok () || ! db)
Throw<std::runtime_error> (
std::string("Unable to open/create RocksDB: ") + status.ToString());
m_db.reset (db);
std::string s1, s2;
rocksdb::GetStringFromDBOptions(&s1, options, "; ");
rocksdb::GetStringFromColumnFamilyOptions(&s2, options, "; ");
rocksdb::GetStringFromDBOptions(&s1, m_options, "; ");
rocksdb::GetStringFromColumnFamilyOptions(&s2, m_options, "; ");
JLOG(m_journal.debug()) << "RocksDB DBOptions: " << s1;
JLOG(m_journal.debug()) << "RocksDB CFOptions: " << s2;
}
@@ -216,6 +208,25 @@ public:
close();
}
void
open() override
{
if (m_db)
{
assert(false);
JLOG(m_journal.error()) <<
"database is already open";
return;
}
rocksdb::DB* db = nullptr;
rocksdb::Status status = rocksdb::DB::Open(m_options, m_name, &db);
if (!status.ok() || !db)
Throw<std::runtime_error>(
std::string("Unable to open/create RocksDB: ") +
status.ToString());
m_db.reset(db);
}
void
close() override
{
@@ -241,6 +252,7 @@ public:
Status
fetch (void const* key, std::shared_ptr<NodeObject>* pObject) override
{
assert(m_db);
pObject->reset ();
Status status (ok);
@@ -310,6 +322,7 @@ public:
void
storeBatch (Batch const& batch) override
{
assert(m_db);
rocksdb::WriteBatch wb;
EncodedBlob encoded;
@@ -336,6 +349,7 @@ public:
void
for_each (std::function <void(std::shared_ptr<NodeObject>)> f) override
{
assert(m_db);
rocksdb::ReadOptions const options;
std::unique_ptr <rocksdb::Iterator> it (m_db->NewIterator (options));

View File

@@ -96,6 +96,7 @@ public:
std::string m_name;
std::unique_ptr <rocksdb::DB> m_db;
int fdlimit_ = 2048;
rocksdb::Options m_options;
RocksDBQuickBackend (int keyBytes, Section const& keyValues,
Scheduler& scheduler, beast::Journal journal, RocksDBQuickEnv* env)
@@ -118,26 +119,25 @@ public:
get_if_exists (keyValues, "threads", threads);
// Set options
rocksdb::Options options;
options.create_if_missing = true;
options.env = env;
m_options.create_if_missing = true;
m_options.env = env;
if (style == "level")
options.OptimizeLevelStyleCompaction(budget);
m_options.OptimizeLevelStyleCompaction(budget);
if (style == "universal")
options.OptimizeUniversalStyleCompaction(budget);
m_options.OptimizeUniversalStyleCompaction(budget);
if (style == "point")
options.OptimizeForPointLookup(budget / 1024 / 1024); // In MB
m_options.OptimizeForPointLookup(budget / 1024 / 1024); // In MB
options.IncreaseParallelism(threads);
m_options.IncreaseParallelism(threads);
// Allows hash indexes in blocks
options.prefix_extractor.reset(rocksdb::NewNoopTransform());
m_options.prefix_extractor.reset(rocksdb::NewNoopTransform());
// overrride OptimizeLevelStyleCompaction
options.min_write_buffer_number_to_merge = 1;
m_options.min_write_buffer_number_to_merge = 1;
rocksdb::BlockBasedTableOptions table_options;
// Use hash index
@@ -145,7 +145,7 @@ public:
rocksdb::BlockBasedTableOptions::kHashSearch;
table_options.filter_policy.reset(
rocksdb::NewBloomFilterPolicy(10));
options.table_factory.reset(
m_options.table_factory.reset(
NewBlockBasedTableFactory(table_options));
// Higher values make reads slower
@@ -155,27 +155,17 @@ public:
// table_options.block_cache =
// rocksdb::NewLRUCache(64 * 1024 * 1024);
options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory());
m_options.memtable_factory.reset(rocksdb::NewHashSkipListRepFactory());
// Alternative:
// options.memtable_factory.reset(
// rocksdb::NewHashCuckooRepFactory(options.write_buffer_size));
// m_options.memtable_factory.reset(
// rocksdb::NewHashCuckooRepFactory(m_options.write_buffer_size));
if (get_if_exists (keyValues, "open_files", options.max_open_files))
fdlimit_ = options.max_open_files;
if (get_if_exists (keyValues, "open_files", m_options.max_open_files))
fdlimit_ = m_options.max_open_files;
if (keyValues.exists ("compression") &&
(get<int>(keyValues, "compression") == 0))
options.compression = rocksdb::kNoCompression;
rocksdb::DB* db = nullptr;
rocksdb::Status status = rocksdb::DB::Open (options, m_name, &db);
if (! status.ok () || ! db)
Throw<std::runtime_error> (
std::string("Unable to open/create RocksDBQuick: ") +
status.ToString());
m_db.reset (db);
m_options.compression = rocksdb::kNoCompression;
}
~RocksDBQuickBackend ()
@@ -189,6 +179,25 @@ public:
return m_name;
}
void
open() override
{
if (m_db)
{
assert(false);
JLOG(m_journal.error()) <<
"database is already open";
return;
}
rocksdb::DB* db = nullptr;
rocksdb::Status status = rocksdb::DB::Open(m_options, m_name, &db);
if (!status.ok() || !db)
Throw<std::runtime_error>(
std::string("Unable to open/create RocksDB: ") +
status.ToString());
m_db.reset(db);
}
void
close() override
{
@@ -208,6 +217,7 @@ public:
Status
fetch (void const* key, std::shared_ptr<NodeObject>* pObject) override
{
assert(m_db);
pObject->reset ();
Status status (ok);
@@ -277,6 +287,7 @@ public:
void
storeBatch (Batch const& batch) override
{
assert(m_db);
rocksdb::WriteBatch wb;
EncodedBlob encoded;
@@ -306,6 +317,7 @@ public:
void
for_each (std::function <void(std::shared_ptr<NodeObject>)> f) override
{
assert(m_db);
rocksdb::ReadOptions const options;
std::unique_ptr <rocksdb::Iterator> it (m_db->NewIterator (options));

View File

@@ -0,0 +1,261 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2013 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
#include <ripple/nodestore/Database.h>
#include <ripple/basics/chrono.h>
#include <ripple/beast/core/CurrentThreadName.h>
#include <ripple/protocol/HashPrefix.h>
namespace ripple {
namespace NodeStore {
// Construct the NodeStore base: registers this object as a Stoppable
// child of `parent` and launches `readThreads` background threads that
// service asynchronous fetch requests (see threadEntry()).
Database::Database(std::string name, Stoppable& parent,
    Scheduler& scheduler, int readThreads, beast::Journal journal)
    : Stoppable(name, parent)
    , j_(journal)
    , scheduler_(scheduler)
{
    // Each thread runs threadEntry() until stopThreads() shuts it down.
    while (readThreads-- > 0)
        readThreads_.emplace_back(&Database::threadEntry, this);
}
Database::~Database()
{
    // NOTE!
    // Any derived class should call the stopThreads() method in its
    // destructor. Otherwise, occasionally, the derived class may
    // crash during shutdown when its members are accessed by one of
    // these threads after the derived class is destroyed but before
    // this base class is destroyed.
    // This call is a safety net: stopThreads() is idempotent, so a
    // derived class that already stopped the threads is unaffected.
    stopThreads();
}
// Block the caller until every async read that was queued before this
// call has been serviced (or until shutdown begins).
void
Database::waitReads()
{
    std::unique_lock<std::mutex> l(readLock_);
    // Wake in two generations.
    // Each generation is a full pass over the space.
    // If we're in generation N and you issue a request,
    // that request will only be done during generation N
    // if it happens to land after where the pass currently is.
    // But, if not, it will definitely be done during generation
    // N+1 since the request was in the table before that pass
    // even started. So when you reach generation N+2,
    // you know the request is done.
    std::uint64_t const wakeGen = readGen_ + 2;
    // Also wake early if the read table drains or shutdown is requested.
    while (! readShut_ && ! read_.empty() && (readGen_ < wakeGen))
        readGenCondVar_.wait(l);
}
// Stoppable interface: called when the application begins shutdown.
void
Database::onStop()
{
    // After stop time we can no longer use the JobQueue for background
    // reads. Join the background read threads.
    stopThreads();
    stopped();
}
// Signal all background read threads to exit and join them.
// Idempotent: only the first call does the work.
void
Database::stopThreads()
{
    {
        std::lock_guard <std::mutex> l(readLock_);
        if (readShut_) // Only stop threads once.
            return;
        readShut_ = true;
        readCondVar_.notify_all();
        readGenCondVar_.notify_all();
    }
    // Join outside the lock so the threads can acquire it while exiting.
    for (auto& e : readThreads_)
        e.join();
}
// Queue an asynchronous fetch of `hash` (for ledger sequence `seq`) to be
// performed by a background read thread, using the supplied positive and
// negative caches. The read table appears to hold the caches weakly (see
// the lock() calls in threadEntry) — confirm against read_'s declaration.
void
Database::asyncFetch(uint256 const& hash, std::uint32_t seq,
    std::shared_ptr<TaggedCache<uint256, NodeObject>> const& pCache,
    std::shared_ptr<KeyCache<uint256>> const& nCache)
{
    // Post a read; notify a worker only if this hash wasn't already queued.
    std::lock_guard <std::mutex> l(readLock_);
    if (read_.emplace(hash, std::make_tuple(seq, pCache, nCache)).second)
        readCondVar_.notify_one();
}
// Fetch `hash` directly from `backend`, updating hit statistics.
// Returns nullptr when the object is not found (or the data is corrupt);
// rethrows any exception the backend raises.
std::shared_ptr<NodeObject>
Database::fetchInternal(uint256 const& hash, Backend& backend)
{
    std::shared_ptr<NodeObject> nObj;
    Status status;
    try
    {
        status = backend.fetch(hash.begin(), &nObj);
    }
    catch (std::exception const& e)
    {
        // A backend exception is fatal-level news; log then rethrow.
        JLOG(j_.fatal()) <<
            "Exception, " << e.what();
        Rethrow();
    }
    switch(status)
    {
    case ok:
        // Count the hit and the bytes fetched.
        ++fetchHitCount_;
        if (nObj)
            fetchSz_ += nObj->getData().size();
        break;
    case notFound:
        break;
    case dataCorrupt:
        // VFALCO TODO Deal with encountering corrupt data!
        JLOG(j_.fatal()) <<
            "Corrupt NodeObject #" << hash;
        break;
    default:
        JLOG(j_.warn()) <<
            "Unknown status=" << status;
        break;
    }
    return nObj;
}
// Copy every object from `source` into `dest`, writing in batches of
// batchWritePreallocationSize to limit memory use and round trips.
void
Database::importInternal(Database& source, Backend& dest)
{
    Batch b;
    b.reserve(batchWritePreallocationSize);
    source.for_each(
        [&](std::shared_ptr<NodeObject> nObj)
        {
            assert(nObj);
            if (! nObj) // This should never happen
                return;
            // Track store statistics as objects stream through.
            ++storeCount_;
            storeSz_ += nObj->getData().size();
            b.push_back(nObj);
            if (b.size() >= batchWritePreallocationSize)
            {
                dest.storeBatch(b);
                b.clear();
                b.reserve(batchWritePreallocationSize);
            }
        });
    // Flush the final partial batch, if any.
    if (! b.empty())
        dest.storeBatch(b);
}
// Perform a fetch and report the time it took.
// Lookup order: positive cache, negative cache, then the backing
// database(s) via fetchFrom(). A miss is recorded in the negative cache;
// a hit is canonicalized into the positive cache so all threads share
// one instance. The FetchReport is handed to the scheduler either way.
std::shared_ptr<NodeObject>
Database::doFetch(uint256 const& hash, std::uint32_t seq,
    std::shared_ptr<TaggedCache<uint256, NodeObject>> const& pCache,
    std::shared_ptr<KeyCache<uint256>> const& nCache, bool isAsync)
{
    FetchReport report;
    report.isAsync = isAsync;
    report.wentToDisk = false;
    using namespace std::chrono;
    auto const before = steady_clock::now();
    // See if the object already exists in the cache
    auto nObj = pCache->fetch(hash);
    if (! nObj && ! nCache->touch_if_exists(hash))
    {
        // Try the database(s)
        report.wentToDisk = true;
        nObj = fetchFrom(hash, seq);
        ++fetchTotalCount_;
        if (! nObj)
        {
            // Just in case a write occurred
            nObj = pCache->fetch(hash);
            if (! nObj)
                // We give up
                nCache->insert(hash);
        }
        else
        {
            // Ensure all threads get the same object
            pCache->canonicalize(hash, nObj);
            // Since this was a 'hard' fetch, we will log it.
            JLOG(j_.trace()) <<
                "HOS: " << hash << " fetch: in db";
        }
    }
    report.wasFound = static_cast<bool>(nObj);
    report.elapsed = duration_cast<milliseconds>(
        steady_clock::now() - before);
    scheduler_.onFetch(report);
    return nObj;
}
// Entry point for async read threads.
// Each worker loops: wait for queued reads, pick the next hash in key
// order (round-robin by generation), then perform the fetch outside the
// lock. Exits when stopThreads() sets readShut_.
void
Database::threadEntry()
{
    beast::setCurrentThreadName("prefetch");
    while (true)
    {
        uint256 lastHash;
        std::uint32_t lastSeq;
        std::shared_ptr<TaggedCache<uint256, NodeObject>> lastPcache;
        std::shared_ptr<KeyCache<uint256>> lastNcache;
        {
            std::unique_lock<std::mutex> l(readLock_);
            while (! readShut_ && read_.empty())
            {
                // All work is done
                readGenCondVar_.notify_all();
                readCondVar_.wait(l);
            }
            if (readShut_)
                break;
            // Read in key order to make the back end more efficient
            auto it = read_.lower_bound(readLastHash_);
            if (it == read_.end())
            {
                it = read_.begin();
                // A generation has completed
                ++readGen_;
                readGenCondVar_.notify_all();
            }
            lastHash = it->first;
            lastSeq = std::get<0>(it->second);
            // Promote the weak cache references; either may have expired
            // if the owning database has gone away.
            lastPcache = std::get<1>(it->second).lock();
            lastNcache = std::get<2>(it->second).lock();
            read_.erase(it);
            readLastHash_ = lastHash;
        }
        // Perform the read only if BOTH caches are still alive: doFetch()
        // dereferences each unconditionally. (Bug fix: the original
        // tested lastPcache twice and never checked lastNcache, allowing
        // a null dereference when the negative cache had expired.)
        if (lastPcache && lastNcache)
            doFetch(lastHash, lastSeq, lastPcache, lastNcache, true);
    }
}
} // NodeStore
} // ripple

View File

@@ -1,465 +0,0 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2013 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL , DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
#ifndef RIPPLE_NODESTORE_DATABASEIMP_H_INCLUDED
#define RIPPLE_NODESTORE_DATABASEIMP_H_INCLUDED
#include <ripple/nodestore/Database.h>
#include <ripple/nodestore/Scheduler.h>
#include <ripple/nodestore/impl/Tuning.h>
#include <ripple/basics/KeyCache.h>
#include <ripple/basics/chrono.h>
#include <ripple/beast/core/CurrentThreadName.h>
namespace ripple {
namespace NodeStore {
class DatabaseImp
: public Database
{
private:
beast::Journal m_journal;
Scheduler& m_scheduler;
// Persistent key/value storage.
std::unique_ptr <Backend> m_backend;
protected:
// Positive cache
TaggedCache <uint256, NodeObject> m_cache;
// Negative cache
KeyCache <uint256> m_negCache;
private:
std::mutex m_readLock;
std::condition_variable m_readCondVar;
std::condition_variable m_readGenCondVar;
std::set <uint256> m_readSet; // set of reads to do
uint256 m_readLast; // last hash read
std::vector <std::thread> m_readThreads;
bool m_readShut;
uint64_t m_readGen; // current read generation
int fdlimit_;
std::atomic <std::uint32_t> m_storeCount;
std::atomic <std::uint32_t> m_fetchTotalCount;
std::atomic <std::uint32_t> m_fetchHitCount;
std::atomic <std::uint32_t> m_storeSize;
std::atomic <std::uint32_t> m_fetchSize;
public:
DatabaseImp (std::string const& name,
Scheduler& scheduler,
int readThreads,
Stoppable& parent,
std::unique_ptr <Backend> backend,
beast::Journal journal)
: Database (name, parent)
, m_journal (journal)
, m_scheduler (scheduler)
, m_backend (std::move (backend))
, m_cache ("NodeStore", cacheTargetSize, cacheTargetSeconds,
stopwatch(), journal)
, m_negCache ("NodeStore", stopwatch(),
cacheTargetSize, cacheTargetSeconds)
, m_readShut (false)
, m_readGen (0)
, fdlimit_ (0)
, m_storeCount (0)
, m_fetchTotalCount (0)
, m_fetchHitCount (0)
, m_storeSize (0)
, m_fetchSize (0)
{
for (int i = 0; i < readThreads; ++i)
m_readThreads.emplace_back (&DatabaseImp::threadEntry, this);
if (m_backend)
fdlimit_ = m_backend->fdlimit();
}
~DatabaseImp () override
{
// NOTE!
// Any derived class should call the stopThreads() method in its
// destructor. Otherwise, occasionally, the derived class may
// crash during shutdown when its members are accessed by one of
// these threads after the derived class is destroyed but before
// this base class is destroyed.
stopThreads();
}
std::string
getName () const override
{
return m_backend->getName ();
}
//------------------------------------------------------------------------------
bool asyncFetch (uint256 const& hash, std::shared_ptr<NodeObject>& object) override
{
// See if the object is in cache
object = m_cache.fetch (hash);
if (object || m_negCache.touch_if_exists (hash))
return true;
{
// No. Post a read
std::lock_guard <std::mutex> lock (m_readLock);
if (m_readSet.insert (hash).second)
m_readCondVar.notify_one ();
}
return false;
}
void waitReads() override
{
{
std::unique_lock <std::mutex> lock (m_readLock);
// Wake in two generations
std::uint64_t const wakeGeneration = m_readGen + 2;
while (!m_readShut && !m_readSet.empty () && (m_readGen < wakeGeneration))
m_readGenCondVar.wait (lock);
}
}
int getDesiredAsyncReadCount () override
{
// We prefer a client not fill our cache
// We don't want to push data out of the cache
// before it's retrieved
return m_cache.getTargetSize() / asyncDivider;
}
std::shared_ptr<NodeObject> fetch (uint256 const& hash) override
{
return doTimedFetch (hash, false);
}
/** Perform a fetch and report the time it took */
std::shared_ptr<NodeObject> doTimedFetch (uint256 const& hash, bool isAsync)
{
FetchReport report;
report.isAsync = isAsync;
report.wentToDisk = false;
auto const before = std::chrono::steady_clock::now();
std::shared_ptr<NodeObject> ret = doFetch (hash, report);
report.elapsed = std::chrono::duration_cast <std::chrono::milliseconds>
(std::chrono::steady_clock::now() - before);
report.wasFound = (ret != nullptr);
m_scheduler.onFetch (report);
return ret;
}
std::shared_ptr<NodeObject> doFetch (uint256 const& hash, FetchReport &report)
{
// See if the object already exists in the cache
//
std::shared_ptr<NodeObject> obj = m_cache.fetch (hash);
if (obj != nullptr)
return obj;
if (m_negCache.touch_if_exists (hash))
return obj;
// Check the database(s).
report.wentToDisk = true;
// Are we still without an object?
//
if (obj == nullptr)
{
// Yes so at last we will try the main database.
//
obj = fetchFrom (hash);
++m_fetchTotalCount;
}
if (obj == nullptr)
{
// Just in case a write occurred
obj = m_cache.fetch (hash);
if (obj == nullptr)
{
// We give up
m_negCache.insert (hash);
}
}
else
{
// Ensure all threads get the same object
//
m_cache.canonicalize (hash, obj);
// Since this was a 'hard' fetch, we will log it.
//
JLOG(m_journal.trace()) <<
"HOS: " << hash << " fetch: in db";
}
return obj;
}
virtual std::shared_ptr<NodeObject> fetchFrom (uint256 const& hash)
{
return fetchInternal (*m_backend, hash);
}
std::shared_ptr<NodeObject> fetchInternal (Backend& backend,
uint256 const& hash)
{
std::shared_ptr<NodeObject> object;
Status const status = backend.fetch (hash.begin (), &object);
switch (status)
{
case ok:
++m_fetchHitCount;
if (object)
m_fetchSize += object->getData().size();
case notFound:
break;
case dataCorrupt:
// VFALCO TODO Deal with encountering corrupt data!
//
JLOG(m_journal.fatal()) <<
"Corrupt NodeObject #" << hash;
break;
default:
JLOG(m_journal.warn()) <<
"Unknown status=" << status;
break;
}
return object;
}
//------------------------------------------------------------------------------
void store (NodeObjectType type,
Blob&& data,
uint256 const& hash) override
{
storeInternal (type, std::move(data), hash, *m_backend.get());
}
void storeInternal (NodeObjectType type,
Blob&& data,
uint256 const& hash,
Backend& backend)
{
#if RIPPLE_VERIFY_NODEOBJECT_KEYS
assert (hash == sha512Hash(makeSlice(data)));
#endif
std::shared_ptr<NodeObject> object = NodeObject::createObject(
type, std::move(data), hash);
m_cache.canonicalize (hash, object, true);
backend.store (object);
++m_storeCount;
if (object)
m_storeSize += object->getData().size();
m_negCache.erase (hash);
}
//------------------------------------------------------------------------------
float getCacheHitRate () override
{
return m_cache.getHitRate ();
}
void tune (int size, int age) override
{
m_cache.setTargetSize (size);
m_cache.setTargetAge (age);
m_negCache.setTargetSize (size);
m_negCache.setTargetAge (age);
}
void sweep () override
{
m_cache.sweep ();
m_negCache.sweep ();
}
std::int32_t getWriteLoad() const override
{
return m_backend->getWriteLoad();
}
//------------------------------------------------------------------------------
// Entry point for async read threads
void threadEntry ()
{
beast::setCurrentThreadName ("prefetch");
while (1)
{
uint256 hash;
{
std::unique_lock <std::mutex> lock (m_readLock);
while (!m_readShut && m_readSet.empty ())
{
// all work is done
m_readGenCondVar.notify_all ();
m_readCondVar.wait (lock);
}
if (m_readShut)
break;
// Read in key order to make the back end more efficient
std::set <uint256>::iterator it = m_readSet.lower_bound (m_readLast);
if (it == m_readSet.end ())
{
it = m_readSet.begin ();
// A generation has completed
++m_readGen;
m_readGenCondVar.notify_all ();
}
hash = *it;
m_readSet.erase (it);
m_readLast = hash;
}
// Perform the read
doTimedFetch (hash, true);
}
}
//------------------------------------------------------------------------------
void for_each (std::function <void(std::shared_ptr<NodeObject>)> f) override
{
m_backend->for_each (f);
}
void import (Database& source) override
{
importInternal (source, *m_backend.get());
}
void importInternal (Database& source, Backend& dest)
{
Batch b;
b.reserve (batchWritePreallocationSize);
source.for_each ([&](std::shared_ptr<NodeObject> object)
{
if (b.size() >= batchWritePreallocationSize)
{
dest.storeBatch (b);
b.clear();
b.reserve (batchWritePreallocationSize);
}
b.push_back (object);
++m_storeCount;
if (object)
m_storeSize += object->getData().size();
});
if (! b.empty())
dest.storeBatch (b);
}
std::uint32_t getStoreCount () const override
{
return m_storeCount;
}
std::uint32_t getFetchTotalCount () const override
{
return m_fetchTotalCount;
}
std::uint32_t getFetchHitCount () const override
{
return m_fetchHitCount;
}
std::uint32_t getStoreSize () const override
{
return m_storeSize;
}
std::uint32_t getFetchSize () const override
{
return m_fetchSize;
}
int fdlimit() const override
{
return fdlimit_;
}
//--------------------------------------------------------------------------
//
// Stoppable.
void onStop () override
{
// After stop time we can no longer use the JobQueue for background
// reads. Join the background read threads.
DatabaseImp::stopThreads();
stopped();
}
protected:
void stopThreads ()
{
{
std::lock_guard <std::mutex> lock (m_readLock);
if (m_readShut) // Only stop threads once.
return;
m_readShut = true;
m_readCondVar.notify_all ();
m_readGenCondVar.notify_all ();
}
for (auto& e : m_readThreads)
e.join();
}
};
}
}
#endif

View File

@@ -0,0 +1,151 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2013 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
#include <BeastConfig.h>
#include <ripple/nodestore/impl/DatabaseNodeImp.h>
#include <ripple/app/ledger/Ledger.h>
#include <ripple/protocol/HashPrefix.h>
namespace ripple {
namespace NodeStore {
// Store a node object: canonicalize it into the positive cache, persist
// it to the backend, drop any stale negative-cache entry, and record
// store statistics. `seq` is accepted for interface parity but unused here.
void
DatabaseNodeImp::store(NodeObjectType type, Blob&& data,
    uint256 const& hash, std::uint32_t seq)
{
#if RIPPLE_VERIFY_NODEOBJECT_KEYS
    // Optional (compile-time) integrity check: key must be the data's hash.
    assert(hash == sha512Hash(makeSlice(data)));
#endif
    auto nObj = NodeObject::createObject(type, std::move(data), hash);
    pCache_->canonicalize(hash, nObj, true);
    backend_->store(nObj);
    // The object now exists, so it must not stay in the negative cache.
    nCache_->erase(hash);
    storeStats(nObj->getData().size());
}
// Try to satisfy a fetch from the caches; otherwise queue a background
// read. Returns true when the caller has an answer now (object found, or
// known-missing via the negative cache), false when a read was posted.
bool
DatabaseNodeImp::asyncFetch(uint256 const& hash,
    std::uint32_t seq, std::shared_ptr<NodeObject>& object)
{
    // See if the object is in cache
    object = pCache_->fetch(hash);
    if (object || nCache_->touch_if_exists(hash))
        return true;
    // Otherwise post a read
    Database::asyncFetch(hash, seq, pCache_, nCache_);
    return false;
}
// Copy an entire ledger (header, state map, transaction map) from its
// source database into this one. All node objects are first collected
// into a single batch; only if every node is found is the batch cached
// and written to the backend. Returns false on any validation or fetch
// failure, leaving this database unmodified by the batch write.
bool
DatabaseNodeImp::copyLedger(
    std::shared_ptr<Ledger const> const& ledger)
{
    if (ledger->info().hash.isZero() ||
        ledger->info().accountHash.isZero())
    {
        assert(false);
        JLOG(j_.error()) <<
            "Invalid ledger";
        return false;
    }
    // The source is the database backing the ledger's state map family.
    auto& srcDB = const_cast<Database&>(
        ledger->stateMap().family().db());
    if (&srcDB == this)
    {
        assert(false);
        JLOG(j_.error()) <<
            "Source and destination are the same";
        return false;
    }
    Batch batch;
    bool error = false;
    // Visitor: fetch each map node from the source; abort traversal
    // (return false) on the first miss.
    auto f = [&](SHAMapAbstractNode& node) {
        if (auto nObj = srcDB.fetch(
            node.getNodeHash().as_uint256(), ledger->info().seq))
            batch.emplace_back(std::move(nObj));
        else
            error = true;
        return !error;
    };
    // Batch the ledger header
    {
        Serializer s(1024);
        s.add32(HashPrefix::ledgerMaster);
        addRaw(ledger->info(), s);
        batch.emplace_back(NodeObject::createObject(hotLEDGER,
            std::move(s.modData()), ledger->info().hash));
    }
    // Batch the state map
    if (ledger->stateMap().getHash().isNonZero())
    {
        if (! ledger->stateMap().isValid())
        {
            JLOG(j_.error()) <<
                "invalid state map";
            return false;
        }
        ledger->stateMap().snapShot(false)->visitNodes(f);
        if (error)
            return false;
    }
    // Batch the transaction map
    if (ledger->info().txHash.isNonZero())
    {
        if (! ledger->txMap().isValid())
        {
            JLOG(j_.error()) <<
                "invalid transaction map";
            return false;
        }
        ledger->txMap().snapShot(false)->visitNodes(f);
        if (error)
            return false;
    }
    // Store batch: prime the caches, then persist everything in one write.
    for (auto& nObj : batch)
    {
#if RIPPLE_VERIFY_NODEOBJECT_KEYS
        assert(nObj->getHash() == sha512Hash(makeSlice(nObj->getData())));
#endif
        pCache_->canonicalize(nObj->getHash(), nObj, true);
        nCache_->erase(nObj->getHash());
        storeStats(nObj->getData().size());
    }
    backend_->storeBatch(batch);
    return true;
}
// Apply cache tuning: target size and target age (seconds) for both the
// positive and the negative cache.
void
DatabaseNodeImp::tune(int size, int age)
{
    pCache_->setTargetSize(size);
    pCache_->setTargetAge(age);
    nCache_->setTargetSize(size);
    nCache_->setTargetAge(age);
}
// Evict expired entries from both caches.
void
DatabaseNodeImp::sweep()
{
    pCache_->sweep();
    nCache_->sweep();
}
} // NodeStore
} // ripple

View File

@@ -0,0 +1,134 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2013 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
#ifndef RIPPLE_NODESTORE_DATABASENODEIMP_H_INCLUDED
#define RIPPLE_NODESTORE_DATABASENODEIMP_H_INCLUDED
#include <ripple/nodestore/Database.h>
#include <ripple/basics/chrono.h>
namespace ripple {
namespace NodeStore {
/** NodeStore database backed by a single persistent Backend.

    Pairs the backend with a positive (TaggedCache) and negative
    (KeyCache) cache. Non-copyable; the backend is owned exclusively.
*/
class DatabaseNodeImp : public Database
{
public:
    DatabaseNodeImp() = delete;
    DatabaseNodeImp(DatabaseNodeImp const&) = delete;
    DatabaseNodeImp& operator=(DatabaseNodeImp const&) = delete;
    // Takes ownership of `backend`; starts `readThreads` async readers
    // via the Database base class.
    DatabaseNodeImp(std::string const& name,
        Scheduler& scheduler, int readThreads, Stoppable& parent,
        std::unique_ptr<Backend> backend, beast::Journal j)
        : Database(name, parent, scheduler, readThreads, j)
        , pCache_(std::make_shared<TaggedCache<uint256, NodeObject>>(
            name, cacheTargetSize, cacheTargetSeconds, stopwatch(), j))
        , nCache_(std::make_shared<KeyCache<uint256>>(
            name, stopwatch(), cacheTargetSize, cacheTargetSeconds))
        , backend_(std::move(backend))
    {
        assert(backend_);
    }
    ~DatabaseNodeImp() override
    {
        // Stop threads before data members are destroyed.
        stopThreads();
    }
    std::string
    getName() const override
    {
        return backend_->getName();
    }
    std::int32_t
    getWriteLoad() const override
    {
        return backend_->getWriteLoad();
    }
    // Bulk-copy every object from `source` into this database's backend.
    void
    import(Database& source) override
    {
        importInternal(source, *backend_.get());
    }
    void
    store(NodeObjectType type, Blob&& data,
        uint256 const& hash, std::uint32_t seq) override;
    // Synchronous fetch through the shared cache-then-backend path.
    std::shared_ptr<NodeObject>
    fetch(uint256 const& hash, std::uint32_t seq) override
    {
        return doFetch(hash, seq, pCache_, nCache_, false);
    }
    bool
    asyncFetch(uint256 const& hash, std::uint32_t seq,
        std::shared_ptr<NodeObject>& object) override;
    bool
    copyLedger(std::shared_ptr<Ledger const> const& ledger) override;
    int
    getDesiredAsyncReadCount(std::uint32_t seq) override
    {
        // We prefer a client not fill our cache
        // We don't want to push data out of the cache
        // before it's retrieved
        return pCache_->getTargetSize() / asyncDivider;
    }
    float
    getCacheHitRate() override {return pCache_->getHitRate();}
    void
    tune(int size, int age) override;
    void
    sweep() override;
private:
    // Positive cache
    std::shared_ptr<TaggedCache<uint256, NodeObject>> pCache_;
    // Negative cache
    std::shared_ptr<KeyCache<uint256>> nCache_;
    // Persistent key/value storage
    std::unique_ptr<Backend> backend_;
    // Hook used by Database::doFetch on a cache miss.
    std::shared_ptr<NodeObject>
    fetchFrom(uint256 const& hash, std::uint32_t seq) override
    {
        return fetchInternal(hash, *backend_);
    }
    void
    for_each(std::function<void(std::shared_ptr<NodeObject>)> f) override
    {
        backend_->for_each(f);
    }
};
}
}
#endif

View File

@@ -19,37 +19,179 @@
#include <BeastConfig.h>
#include <ripple/nodestore/impl/DatabaseRotatingImp.h>
#include <ripple/app/ledger/Ledger.h>
#include <ripple/protocol/HashPrefix.h>
namespace ripple {
namespace NodeStore {
// Make sure to call it already locked!
std::shared_ptr <Backend> DatabaseRotatingImp::rotateBackends (
std::shared_ptr <Backend> const& newBackend)
DatabaseRotatingImp::DatabaseRotatingImp(
std::string const& name, Scheduler& scheduler, int readThreads,
Stoppable& parent, std::unique_ptr<Backend> writableBackend,
std::unique_ptr<Backend> archiveBackend, beast::Journal j)
: DatabaseRotating(name, parent, scheduler, readThreads, j)
, pCache_(std::make_shared<TaggedCache<uint256, NodeObject>>(
name, cacheTargetSize, cacheTargetSeconds, stopwatch(), j))
, nCache_(std::make_shared<KeyCache<uint256>>(
name, stopwatch(), cacheTargetSize, cacheTargetSeconds))
, writableBackend_(std::move(writableBackend))
, archiveBackend_(std::move(archiveBackend))
{
std::shared_ptr <Backend> oldBackend = archiveBackend_;
archiveBackend_ = writableBackend_;
writableBackend_ = newBackend;
if (writableBackend_)
fdLimit_ += writableBackend_->fdlimit();
if (archiveBackend_)
fdLimit_ += archiveBackend_->fdlimit();
}
// Make sure to call it already locked!
std::unique_ptr<Backend>
DatabaseRotatingImp::rotateBackends(
std::unique_ptr<Backend> newBackend)
{
auto oldBackend {std::move(archiveBackend_)};
archiveBackend_ = std::move(writableBackend_);
writableBackend_ = std::move(newBackend);
return oldBackend;
}
std::shared_ptr<NodeObject> DatabaseRotatingImp::fetchFrom (uint256 const& hash)
void
DatabaseRotatingImp::store(NodeObjectType type, Blob&& data,
uint256 const& hash, std::uint32_t seq)
{
#if RIPPLE_VERIFY_NODEOBJECT_KEYS
assert(hash == sha512Hash(makeSlice(data)));
#endif
auto nObj = NodeObject::createObject(type, std::move(data), hash);
pCache_->canonicalize(hash, nObj, true);
getWritableBackend()->store(nObj);
nCache_->erase(hash);
storeStats(nObj->getData().size());
}
bool
DatabaseRotatingImp::asyncFetch(uint256 const& hash,
std::uint32_t seq, std::shared_ptr<NodeObject>& object)
{
// See if the object is in cache
object = pCache_->fetch(hash);
if (object || nCache_->touch_if_exists(hash))
return true;
// Otherwise post a read
Database::asyncFetch(hash, seq, pCache_, nCache_);
return false;
}
// Copy a ledger (header, state map, transaction map) from its source
// database into this store as a single batch.
// Returns false if the ledger is invalid, the source database is this
// database, or any node cannot be fetched from the source.
bool
DatabaseRotatingImp::copyLedger(
    std::shared_ptr<Ledger const> const& ledger)
{
    if (ledger->info().hash.isZero() ||
        ledger->info().accountHash.isZero())
    {
        assert(false);
        JLOG(j_.error()) <<
            "Invalid ledger";
        return false;
    }
    auto& srcDB = const_cast<Database&>(
        ledger->stateMap().family().db());
    if (&srcDB == this)
    {
        assert(false);
        JLOG(j_.error()) <<
            "Source and destination are the same";
        return false;
    }
    Batch batch;
    bool error = false;
    // Collects each visited SHAMap node from the source database into
    // the batch; aborts the walk on the first fetch failure.
    auto f = [&](SHAMapAbstractNode& node) {
        if (auto nObj = srcDB.fetch(
            node.getNodeHash().as_uint256(), ledger->info().seq))
            batch.emplace_back(std::move(nObj));
        else
            error = true;
        return !error;
    };
    // Batch the ledger header
    {
        Serializer s(1024);
        s.add32(HashPrefix::ledgerMaster);
        addRaw(ledger->info(), s);
        batch.emplace_back(NodeObject::createObject(hotLEDGER,
            std::move(s.modData()), ledger->info().hash));
    }
    // Batch the state map
    if (ledger->stateMap().getHash().isNonZero())
    {
        if (! ledger->stateMap().isValid())
        {
            JLOG(j_.error()) <<
                "invalid state map";
            return false;
        }
        ledger->stateMap().snapShot(false)->visitNodes(f);
        if (error)
            return false;
    }
    // Batch the transaction map
    if (ledger->info().txHash.isNonZero())
    {
        if (! ledger->txMap().isValid())
        {
            JLOG(j_.error()) <<
                "invalid transaction map";
            return false;
        }
        ledger->txMap().snapShot(false)->visitNodes(f);
        if (error)
            return false;
    }
    // Store batch: canonicalize every object into the positive cache
    // and clear negative-cache entries before the backend write.
    for (auto& nObj : batch)
    {
#if RIPPLE_VERIFY_NODEOBJECT_KEYS
        assert(nObj->getHash() == sha512Hash(makeSlice(nObj->getData())));
#endif
        pCache_->canonicalize(nObj->getHash(), nObj, true);
        nCache_->erase(nObj->getHash());
        storeStats(nObj->getData().size());
    }
    getWritableBackend()->storeBatch(batch);
    return true;
}
// Apply cache tuning parameters to both the positive (object) cache
// and the negative (known-missing) cache.
void
DatabaseRotatingImp::tune(int size, int age)
{
    pCache_->setTargetSize(size);
    nCache_->setTargetSize(size);
    pCache_->setTargetAge(age);
    nCache_->setTargetAge(age);
}
// Remove expired entries from both caches (periodic maintenance).
void
DatabaseRotatingImp::sweep()
{
    pCache_->sweep();
    nCache_->sweep();
}
// Backend fetch used on cache miss. Tries the writable backend first,
// then the archive; an object found only in the archive is re-stored
// in the writable backend so it survives the next rotation, and its
// negative-cache entry (if any) is cleared.
//
// NOTE(review): this span was a diff-merge garble mixing the removed
// implementation (which used `object` and `m_negCache`) with the new
// one; reconstructed here to the new-version code.
std::shared_ptr<NodeObject>
DatabaseRotatingImp::fetchFrom(uint256 const& hash, std::uint32_t seq)
{
    Backends b = getBackends();
    auto nObj = fetchInternal(hash, *b.writableBackend);
    if (! nObj)
    {
        nObj = fetchInternal(hash, *b.archiveBackend);
        if (nObj)
        {
            // Promote to the writable backend and clear negative cache
            getWritableBackend()->store(nObj);
            nCache_->erase(hash);
        }
    }
    return nObj;
}
} // NodeStore
} // ripple

View File

@@ -20,71 +20,40 @@
#ifndef RIPPLE_NODESTORE_DATABASEROTATINGIMP_H_INCLUDED
#define RIPPLE_NODESTORE_DATABASEROTATINGIMP_H_INCLUDED
#include <ripple/nodestore/impl/DatabaseImp.h>
#include <ripple/nodestore/DatabaseRotating.h>
namespace ripple {
namespace NodeStore {
class DatabaseRotatingImp
: public DatabaseImp
, public DatabaseRotating
class DatabaseRotatingImp : public DatabaseRotating
{
private:
std::shared_ptr <Backend> writableBackend_;
std::shared_ptr <Backend> archiveBackend_;
mutable std::mutex rotateMutex_;
struct Backends {
std::shared_ptr <Backend> const& writableBackend;
std::shared_ptr <Backend> const& archiveBackend;
};
Backends getBackends() const
{
std::lock_guard <std::mutex> lock (rotateMutex_);
return Backends {writableBackend_, archiveBackend_};
}
public:
DatabaseRotatingImp (std::string const& name,
Scheduler& scheduler,
int readThreads,
Stoppable& parent,
std::shared_ptr <Backend> writableBackend,
std::shared_ptr <Backend> archiveBackend,
beast::Journal journal)
: DatabaseImp (
name,
scheduler,
readThreads,
parent,
std::unique_ptr <Backend>(),
journal)
, writableBackend_ (writableBackend)
, archiveBackend_ (archiveBackend)
{}
DatabaseRotatingImp() = delete;
DatabaseRotatingImp(DatabaseRotatingImp const&) = delete;
DatabaseRotatingImp& operator=(DatabaseRotatingImp const&) = delete;
~DatabaseRotatingImp () override
DatabaseRotatingImp(std::string const& name,
Scheduler& scheduler, int readThreads, Stoppable& parent,
std::unique_ptr<Backend> writableBackend,
std::unique_ptr<Backend> archiveBackend,
beast::Journal j);
~DatabaseRotatingImp() override
{
// Stop threads before data members are destroyed.
DatabaseImp::stopThreads ();
stopThreads();
}
std::shared_ptr <Backend> const& getWritableBackend() const override
std::unique_ptr<Backend> const&
getWritableBackend() const override
{
std::lock_guard <std::mutex> lock (rotateMutex_);
return writableBackend_;
}
std::shared_ptr <Backend> const& getArchiveBackend() const override
{
std::lock_guard <std::mutex> lock (rotateMutex_);
return archiveBackend_;
}
std::unique_ptr<Backend>
rotateBackends(std::unique_ptr<Backend> newBackend) override;
std::shared_ptr <Backend> rotateBackends (
std::shared_ptr <Backend> const& newBackend) override;
std::mutex& peekMutex() const override
{
return rotateMutex_;
@@ -100,35 +69,79 @@ public:
return getWritableBackend()->getWriteLoad();
}
void for_each (std::function <void(std::shared_ptr<NodeObject>)> f) override
{
Backends b = getBackends();
b.archiveBackend->for_each (f);
b.writableBackend->for_each (f);
}
void import (Database& source) override
{
importInternal (source, *getWritableBackend());
}
void store (NodeObjectType type,
Blob&& data,
uint256 const& hash) override
void store(NodeObjectType type, Blob&& data,
uint256 const& hash, std::uint32_t seq) override;
std::shared_ptr<NodeObject>
fetch(uint256 const& hash, std::uint32_t seq) override
{
storeInternal (type, std::move(data), hash,
*getWritableBackend());
return doFetch(hash, seq, pCache_, nCache_, false);
}
std::shared_ptr<NodeObject> fetchNode (uint256 const& hash) override
bool
asyncFetch(uint256 const& hash, std::uint32_t seq,
std::shared_ptr<NodeObject>& object) override;
bool
copyLedger(std::shared_ptr<Ledger const> const& ledger) override;
int
getDesiredAsyncReadCount(std::uint32_t seq) override
{
return fetchFrom (hash);
// We prefer a client not fill our cache
// We don't want to push data out of the cache
// before it's retrieved
return pCache_->getTargetSize() / asyncDivider;
}
std::shared_ptr<NodeObject> fetchFrom (uint256 const& hash) override;
TaggedCache <uint256, NodeObject>& getPositiveCache() override
float
getCacheHitRate() override {return pCache_->getHitRate();}
void
tune(int size, int age) override;
void
sweep() override;
TaggedCache<uint256, NodeObject> const&
getPositiveCache() override {return *pCache_;}
private:
// Positive cache
std::shared_ptr<TaggedCache<uint256, NodeObject>> pCache_;
// Negative cache
std::shared_ptr<KeyCache<uint256>> nCache_;
std::unique_ptr<Backend> writableBackend_;
std::unique_ptr<Backend> archiveBackend_;
mutable std::mutex rotateMutex_;
struct Backends {
std::unique_ptr<Backend> const& writableBackend;
std::unique_ptr<Backend> const& archiveBackend;
};
Backends getBackends() const
{
return m_cache;
std::lock_guard <std::mutex> lock (rotateMutex_);
return Backends {writableBackend_, archiveBackend_};
}
std::shared_ptr<NodeObject> fetchFrom(
uint256 const& hash, std::uint32_t seq) override;
void
for_each(std::function <void(std::shared_ptr<NodeObject>)> f) override
{
Backends b = getBackends();
b.archiveBackend->for_each(f);
b.writableBackend->for_each(f);
}
};

View File

@@ -0,0 +1,745 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2017 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL , DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
#include <BeastConfig.h>
#include <ripple/nodestore/impl/DatabaseShardImp.h>
#include <ripple/app/ledger/InboundLedgers.h>
#include <ripple/app/ledger/Ledger.h>
#include <ripple/basics/chrono.h>
#include <ripple/basics/random.h>
#include <ripple/nodestore/Manager.h>
#include <ripple/protocol/HashPrefix.h>
namespace ripple {
namespace NodeStore {
std::uint32_t DatabaseShard::lps_ {16384u};
// Construct the shard database.
// Configuration comes from the supplied section ([shard_db]):
//   "path"        - root directory holding one subdirectory per shard
//   "max_size_gb" - total disk budget, converted to bytes via << 30
// avgShardSz_ starts as an estimate (ledgers-per-shard * 192 KiB per
// ledger); it is refined from real shard file sizes in updateStats().
DatabaseShardImp::DatabaseShardImp(Application& app,
    std::string const& name, Stoppable& parent, Scheduler& scheduler,
    int readThreads, Section const& config, beast::Journal j)
    : DatabaseShard(name, parent, scheduler, readThreads, config, j)
    , app_(app)
    , config_(config)
    , dir_(get<std::string>(config, "path"))
    , maxDiskSpace_(get<std::uint64_t>(config, "max_size_gb") << 30)
    , avgShardSz_(lps_ * (192 * 1024))
{
}
// Destructor: shut down the background read threads first so no
// worker touches members while they are being destroyed.
DatabaseShardImp::~DatabaseShardImp()
{
    // Stop threads before data members are destroyed
    stopThreads();
}
// One-time initialization of the shard store.
// Scans the shard root directory for existing shard subdirectories
// (purely numeric names only), opens each, and classifies it as either
// complete or the single in-progress (incomplete) shard.
// Returns false on any configuration or consistency error.
bool
DatabaseShardImp::init()
{
    std::lock_guard<std::mutex> l(m_);
    if (init_)
    {
        JLOG(j_.error()) <<
            "Already initialized";
        return false;
    }
    using namespace boost::filesystem;

    // Find backend type and file handle requirement
    try
    {
        fdLimit_ = Manager::instance().make_Backend(
            config_, scheduler_, j_)->fdlimit();
    }
    catch (std::exception const&)
    {
        JLOG(j_.error()) <<
            "Invalid or missing shard store "
            "type specified in [shard_db]";
        return false;
    }
    // A zero fd limit marks the backend as not using permanent storage
    // (see backed_); no on-disk scanning is needed in that case.
    backed_ = static_cast<bool>(fdLimit_);
    if (!backed_)
    {
        init_ = true;
        return true;
    }
    auto const genesisShardIndex {seqToShardIndex(genesisSeq)};
    // Find shards
    for (auto const& d : directory_iterator(dir_))
    {
        if (!is_directory(d))
            continue;
        // Only subdirectories with purely numeric names are shards
        auto dirName = d.path().stem().string();
        if (!std::all_of(dirName.begin(), dirName.end(), ::isdigit))
            continue;
        auto const shardIndex {std::stoul(dirName)};
        // Indexes before the genesis ledger's shard are meaningless
        if (shardIndex < genesisShardIndex)
            continue;
        auto shard = std::make_unique<Shard>(
            shardIndex, cacheSz_, cacheAge_, j_);
        if (!shard->open(config_, scheduler_, dir_))
            return false;
        usedDiskSpace_ += shard->fileSize();
        if (shard->complete())
            complete_.emplace(shard->index(), std::move(shard));
        else
        {
            // At most one shard may be in the acquiring state
            if (incomplete_)
            {
                JLOG(j_.error()) <<
                    "More than one control file found";
                return false;
            }
            incomplete_ = std::move(shard);
        }
    }
    if (!incomplete_ && complete_.empty())
    {
        // New Shard Store, calculate file descriptor requirements
        if (maxDiskSpace_ > space(dir_).free)
        {
            JLOG(j_.warn()) <<
                "Insufficient disk space";
        }
        fdLimit_ = 1 + (fdLimit_ *
            std::max<std::uint64_t>(1, maxDiskSpace_ / avgShardSz_));
    }
    else
        updateStats(l);
    init_ = true;
    return true;
}
// Select (or continue) a shard to acquire and return the sequence of a
// ledger that shard still needs, or boost::none when nothing can be
// added (store full, disk low, or every shard already held).
boost::optional<std::uint32_t>
DatabaseShardImp::prepare(std::uint32_t validLedgerSeq)
{
    std::lock_guard<std::mutex> l(m_);
    assert(init_);
    // Keep filling the shard already being acquired, if any
    if (incomplete_)
        return incomplete_->prepare();
    if (!canAdd_)
        return boost::none;
    if (backed_)
    {
        // Create a new shard to acquire
        if (usedDiskSpace_ + avgShardSz_ > maxDiskSpace_)
        {
            JLOG(j_.debug()) <<
                "Maximum size reached";
            canAdd_ = false;
            return boost::none;
        }
        if (avgShardSz_ > boost::filesystem::space(dir_).free)
        {
            JLOG(j_.warn()) <<
                "Insufficient disk space";
            canAdd_ = false;
            return boost::none;
        }
    }
    auto const shardIndex {findShardIndexToAdd(validLedgerSeq, l)};
    if (!shardIndex)
    {
        JLOG(j_.debug()) <<
            "No new shards to add";
        canAdd_ = false;
        return boost::none;
    }
    // With every new shard, clear family caches
    app_.shardFamily()->reset();
    // Size the new shard's cache as an even split of the tuned total
    int const sz {std::max(shardCacheSz, cacheSz_ / std::max(
        1, static_cast<int>(complete_.size() + 1)))};
    incomplete_ = std::make_unique<Shard>(
        *shardIndex, sz, cacheAge_, j_);
    if (!incomplete_->open(config_, scheduler_, dir_))
    {
        // Open failed: discard the partially created shard directory
        incomplete_.reset();
        remove_all(dir_ / std::to_string(*shardIndex));
        return boost::none;
    }
    return incomplete_->prepare();
}
// Fetch and reconstruct a ledger from the shard store.
// Returns an empty pointer if the sequence is not held here, the header
// cannot be fetched, or basic integrity checks fail. Only the header
// and the state/transaction map roots are loaded eagerly; inner map
// nodes are fetched on demand through the shard family.
std::shared_ptr<Ledger>
DatabaseShardImp::fetchLedger(uint256 const& hash, std::uint32_t seq)
{
    if (!contains(seq))
        return {};
    auto nObj = fetch(hash, seq);
    if (!nObj)
        return {};
    auto ledger = std::make_shared<Ledger>(
        InboundLedger::deserializeHeader(makeSlice(nObj->getData()), true),
        app_.config(), *app_.shardFamily());
    // The stored header must agree with the requested hash/sequence
    if (ledger->info().hash != hash || ledger->info().seq != seq)
    {
        JLOG(j_.error()) <<
            "shard " << seqToShardIndex(seq) <<
            " ledger seq " << seq <<
            " hash " << hash <<
            " has corrupt data";
        return {};
    }
    ledger->setFull();
    if (!ledger->stateMap().fetchRoot(
        SHAMapHash {ledger->info().accountHash}, nullptr))
    {
        JLOG(j_.error()) <<
            "shard " << seqToShardIndex(seq) <<
            " ledger seq " << seq <<
            " missing Account State root";
        return {};
    }
    if (ledger->info().txHash.isNonZero())
    {
        if (!ledger->txMap().fetchRoot(
            SHAMapHash {ledger->info().txHash}, nullptr))
        {
            JLOG(j_.error()) <<
                "shard " << seqToShardIndex(seq) <<
                " ledger seq " << seq <<
                " missing TX root";
            return {};
        }
    }
    return ledger;
}
// Mark a ledger as fully stored in the shard being acquired and update
// the disk-usage accounting. If this completes the shard, move it into
// the complete set and refresh stats/status.
void
DatabaseShardImp::setStored(std::shared_ptr<Ledger const> const& ledger)
{
    if (ledger->info().hash.isZero() ||
        ledger->info().accountHash.isZero())
    {
        assert(false);
        JLOG(j_.error()) <<
            "Invalid ledger";
        return;
    }
    auto const shardIndex {seqToShardIndex(ledger->info().seq)};
    std::lock_guard<std::mutex> l(m_);
    assert(init_);
    // Only ledgers belonging to the acquiring shard are accepted
    if (!incomplete_ || shardIndex != incomplete_->index())
    {
        JLOG(j_.warn()) <<
            "ledger seq " << ledger->info().seq <<
            " is not being acquired";
        return;
    }
    // Track the change in the shard's on-disk footprint
    auto const before {incomplete_->fileSize()};
    if (!incomplete_->setStored(ledger))
        return;
    auto const after {incomplete_->fileSize()};
    if(after > before)
        usedDiskSpace_ += (after - before);
    else if(after < before)
        usedDiskSpace_ -= std::min(before - after, usedDiskSpace_);
    if (incomplete_->complete())
    {
        complete_.emplace(incomplete_->index(), std::move(incomplete_));
        incomplete_.reset();
        updateStats(l);
    }
}
// Report whether the store holds the ledger with the given sequence.
bool
DatabaseShardImp::contains(std::uint32_t seq)
{
    auto const shardIndex {seqToShardIndex(seq)};
    std::lock_guard<std::mutex> l(m_);
    assert(init_);
    // Any sequence within a complete shard's range is present
    if (complete_.count(shardIndex) != 0)
        return true;
    // The acquiring shard holds only the ledgers stored so far
    return incomplete_ && incomplete_->index() == shardIndex &&
        incomplete_->contains(seq);
}
// Return the cached textual list of complete shard indexes
// (e.g. "1-3,5"); the string is maintained by updateStats().
std::string
DatabaseShardImp::getCompleteShards()
{
    std::lock_guard<std::mutex> l(m_);
    assert(init_);
    return status_;
}
// Verify the contents of every shard. Each shard is validated with a
// freshly reset shard family so caches cannot mask missing data.
// Progress is reported at fatal severity so it is visible regardless
// of the configured log level.
void
DatabaseShardImp::validate()
{
    {
        std::lock_guard<std::mutex> l(m_);
        assert(init_);
        if (complete_.empty() && !incomplete_)
        {
            JLOG(j_.fatal()) <<
                "No shards to validate";
            return;
        }
        std::string s {"Found shards "};
        for (auto& e : complete_)
            s += std::to_string(e.second->index()) + ",";
        if (incomplete_)
            s += std::to_string(incomplete_->index());
        else
            s.pop_back();  // drop the trailing comma
        JLOG(j_.fatal()) << s;
    }
    // NOTE(review): the lock is not held during validation; the shard
    // set is presumably stable for this maintenance operation — confirm.
    for (auto& e : complete_)
    {
        app_.shardFamily()->reset();
        e.second->validate(app_);
    }
    if (incomplete_)
    {
        app_.shardFamily()->reset();
        incomplete_->validate(app_);
    }
    app_.shardFamily()->reset();
}
// Sum the pending write load reported by every shard backend
// (complete shards plus the one being acquired).
std::int32_t
DatabaseShardImp::getWriteLoad() const
{
    std::lock_guard<std::mutex> l(m_);
    assert(init_);
    std::int32_t total {0};
    for (auto const& c : complete_)
        total += c.second->getBackend()->getWriteLoad();
    if (incomplete_)
        total += incomplete_->getBackend()->getWriteLoad();
    return total;
}
// Store a node object belonging to the shard currently being acquired.
// Objects for any other shard are rejected with a warning.
void
DatabaseShardImp::store(NodeObjectType type,
    Blob&& data, uint256 const& hash, std::uint32_t seq)
{
#if RIPPLE_VERIFY_NODEOBJECT_KEYS
    assert(hash == sha512Hash(makeSlice(data)));
#endif
    std::shared_ptr<NodeObject> nObj;
    auto const shardIndex {seqToShardIndex(seq)};
    {
        std::lock_guard<std::mutex> l(m_);
        assert(init_);
        if (!incomplete_ || shardIndex != incomplete_->index())
        {
            JLOG(j_.warn()) <<
                "ledger seq " << seq <<
                " is not being acquired";
            return;
        }
        // Canonicalize into the shard's positive cache, write to its
        // backend, and clear any negative-cache entry for the hash
        nObj = NodeObject::createObject(
            type, std::move(data), hash);
        incomplete_->pCache()->canonicalize(hash, nObj, true);
        incomplete_->getBackend()->store(nObj);
        incomplete_->nCache()->erase(hash);
    }
    // Stats update happens outside the lock
    storeStats(nObj->getData().size());
}
// Fetch a node object by hash from the shard covering the given ledger
// sequence. Returns an empty pointer when no shard covers it.
std::shared_ptr<NodeObject>
DatabaseShardImp::fetch(uint256 const& hash, std::uint32_t seq)
{
    auto const caches {selectCache(seq)};
    if (!caches.first)
        return {};
    return doFetch(hash, seq, caches.first, caches.second, false);
}
// Non-blocking fetch against the shard covering seq. Returns true when
// the answer is known now (cache hit, or known-missing via the negative
// cache); otherwise queues a background read and returns false. When no
// shard covers seq, returns false without queuing anything.
bool
DatabaseShardImp::asyncFetch(uint256 const& hash,
    std::uint32_t seq, std::shared_ptr<NodeObject>& object)
{
    auto const caches {selectCache(seq)};
    if (!caches.first)
        return false;
    object = caches.first->fetch(hash);
    if (object)
        return true;
    if (caches.second->touch_if_exists(hash))
        return true;
    // Schedule a background read into this shard's caches
    Database::asyncFetch(hash, seq, caches.first, caches.second);
    return false;
}
// Copy a ledger directly into the shard being acquired.
// If the previously stored ledger (lastStored) is the child of this
// one, only the differing state-map nodes are copied; otherwise the
// full maps are walked. Returns false on any validation or fetch
// failure.
bool
DatabaseShardImp::copyLedger(std::shared_ptr<Ledger const> const& ledger)
{
    if (ledger->info().hash.isZero() ||
        ledger->info().accountHash.isZero())
    {
        assert(false);
        JLOG(j_.error()) <<
            "source ledger seq " << ledger->info().seq <<
            " is invalid";
        return false;
    }
    auto& srcDB = const_cast<Database&>(
        ledger->stateMap().family().db());
    if (&srcDB == this)
    {
        assert(false);
        JLOG(j_.error()) <<
            "same source and destination databases";
        return false;
    }
    auto const shardIndex {seqToShardIndex(ledger->info().seq)};
    std::lock_guard<std::mutex> l(m_);
    assert(init_);
    if (!incomplete_ || shardIndex != incomplete_->index())
    {
        JLOG(j_.warn()) <<
            "source ledger seq " << ledger->info().seq <<
            " is not being acquired";
        return false;
    }
    // Store the ledger header
    {
        Serializer s(1024);
        s.add32(HashPrefix::ledgerMaster);
        addRaw(ledger->info(), s);
        auto nObj = NodeObject::createObject(hotLEDGER,
            std::move(s.modData()), ledger->info().hash);
#if RIPPLE_VERIFY_NODEOBJECT_KEYS
        assert(nObj->getHash() == sha512Hash(makeSlice(nObj->getData())));
#endif
        incomplete_->pCache()->canonicalize(
            nObj->getHash(), nObj, true);
        incomplete_->getBackend()->store(nObj);
        incomplete_->nCache()->erase(nObj->getHash());
        storeStats(nObj->getData().size());
    }
    // Last ledger stored in this shard; enables the delta copy below
    auto next = incomplete_->lastStored();
    bool error = false;
    // Fetches each visited node from the source and writes it to this
    // shard; stops the walk on the first failure.
    auto f = [&](SHAMapAbstractNode& node) {
        if (auto nObj = srcDB.fetch(
            node.getNodeHash().as_uint256(), ledger->info().seq))
        {
#if RIPPLE_VERIFY_NODEOBJECT_KEYS
            assert(nObj->getHash() == sha512Hash(makeSlice(nObj->getData())));
#endif
            incomplete_->pCache()->canonicalize(
                nObj->getHash(), nObj, true);
            incomplete_->getBackend()->store(nObj);
            incomplete_->nCache()->erase(nObj->getHash());
            storeStats(nObj->getData().size());
        }
        else
            error = true;
        return !error;
    };
    // Store the state map
    if (ledger->stateMap().getHash().isNonZero())
    {
        if (!ledger->stateMap().isValid())
        {
            JLOG(j_.error()) <<
                "source ledger seq " << ledger->info().seq <<
                " state map invalid";
            return false;
        }
        // When this ledger is the parent of the last stored ledger,
        // only the state nodes that differ need to be copied
        if (next && next->info().parentHash == ledger->info().hash)
        {
            auto have = next->stateMap().snapShot(false);
            ledger->stateMap().snapShot(false)->visitDifferences(&(*have), f);
        }
        else
            ledger->stateMap().snapShot(false)->visitNodes(f);
        if (error)
            return false;
    }
    // Store the transaction map
    if (ledger->info().txHash.isNonZero())
    {
        if (!ledger->txMap().isValid())
        {
            JLOG(j_.error()) <<
                "source ledger seq " << ledger->info().seq <<
                " transaction map invalid";
            return false;
        }
        ledger->txMap().snapShot(false)->visitNodes(f);
        if (error)
            return false;
    }
    // Account for the change in the shard's on-disk footprint
    auto const before {incomplete_->fileSize()};
    if (!incomplete_->setStored(ledger))
        return false;
    auto const after {incomplete_->fileSize()};
    if(after > before)
        usedDiskSpace_ += (after - before);
    else if(after < before)
        usedDiskSpace_ -= std::min(before - after, usedDiskSpace_);
    if (incomplete_->complete())
    {
        complete_.emplace(incomplete_->index(), std::move(incomplete_));
        incomplete_.reset();
        updateStats(l);
    }
    return true;
}
// Number of async reads to allow for the shard covering seq, sized
// from that shard's positive-cache target so read-ahead does not push
// recently fetched data out of the cache before it is used.
int
DatabaseShardImp::getDesiredAsyncReadCount(std::uint32_t seq)
{
    auto const shardIndex {seqToShardIndex(seq)};
    {
        std::lock_guard<std::mutex> l(m_);
        assert(init_);
        auto it = complete_.find(shardIndex);
        if (it != complete_.end())
            return it->second->pCache()->getTargetSize() / asyncDivider;
        if (incomplete_ && incomplete_->index() == shardIndex)
            return incomplete_->pCache()->getTargetSize() / asyncDivider;
    }
    // No shard covers this sequence; use the default cache target
    return cacheTargetSize / asyncDivider;
}
// Average positive-cache hit rate across every shard (complete shards
// plus the one being acquired). Returns 0 when there are no shards.
float
DatabaseShardImp::getCacheHitRate()
{
    float total {0};
    float shards {0};
    {
        std::lock_guard<std::mutex> l(m_);
        assert(init_);
        shards = static_cast<float>(complete_.size());
        for (auto const& c : complete_)
            total += c.second->pCache()->getHitRate();
        if (incomplete_)
        {
            total += incomplete_->pCache()->getHitRate();
            ++shards;
        }
    }
    return total / std::max(1.0f, shards);
}
void
DatabaseShardImp::tune(int size, int age)
{
std::lock_guard<std::mutex> l(m_);
assert(init_);
cacheSz_ = size;
cacheAge_ = age;
int const sz {calcTargetCacheSz(l)};
for (auto const& c : complete_)
{
c.second->pCache()->setTargetSize(sz);
c.second->pCache()->setTargetAge(cacheAge_);
c.second->nCache()->setTargetSize(sz);
c.second->nCache()->setTargetAge(cacheAge_);
}
if (incomplete_)
{
incomplete_->pCache()->setTargetSize(sz);
incomplete_->pCache()->setTargetAge(cacheAge_);
incomplete_->nCache()->setTargetSize(sz);
incomplete_->nCache()->setTargetAge(cacheAge_);
}
}
void
DatabaseShardImp::sweep()
{
std::lock_guard<std::mutex> l(m_);
assert(init_);
int const sz {calcTargetCacheSz(l)};
for (auto const& c : complete_)
{
c.second->pCache()->sweep();
c.second->nCache()->sweep();
if (c.second->pCache()->getTargetSize() > sz)
c.second->pCache()->setTargetSize(sz);
}
if (incomplete_)
{
incomplete_->pCache()->sweep();
incomplete_->nCache()->sweep();
if (incomplete_->pCache()->getTargetSize() > sz)
incomplete_->pCache()->setTargetSize(sz);
}
}
// Backend fetch used on cache miss: locate the backend of the shard
// covering seq and read the object directly from it. Returns an empty
// pointer when no shard covers the sequence.
std::shared_ptr<NodeObject>
DatabaseShardImp::fetchFrom(uint256 const& hash, std::uint32_t seq)
{
    std::shared_ptr<Backend> backend;
    auto const shardIndex {seqToShardIndex(seq)};
    {
        std::unique_lock<std::mutex> l(m_);
        assert(init_);
        auto it = complete_.find(shardIndex);
        if (it != complete_.end())
            backend = it->second->getBackend();
        else if (incomplete_ && incomplete_->index() == shardIndex)
            backend = incomplete_->getBackend();
        else
            return {};
    }
    // Read outside the lock; the shared_ptr keeps the backend alive
    return fetchInternal(hash, *backend);
}
// Choose the next shard index to acquire, picked at random from the
// indexes not already held (only shards whose full ledger range has
// been validated are candidates). Lock must be held.
boost::optional<std::uint32_t>
DatabaseShardImp::findShardIndexToAdd(
    std::uint32_t validLedgerSeq, std::lock_guard<std::mutex>&)
{
    auto maxShardIndex {seqToShardIndex(validLedgerSeq)};
    // Exclude the newest shard unless its entire range is validated
    if (validLedgerSeq != lastSeq(maxShardIndex))
        --maxShardIndex;
    auto const numShards {complete_.size() + (incomplete_ ? 1 : 0)};
    // If equal, have all the shards
    if (numShards >= maxShardIndex + 1)
        return boost::none;
    auto const genesisShardIndex {seqToShardIndex(genesisSeq)};
    if (maxShardIndex < 1024 || float(numShards) / maxShardIndex > 0.5f)
    {
        // Small or mostly full index space to sample
        // Find the available indexes and select one at random
        std::vector<std::uint32_t> available;
        available.reserve(maxShardIndex - numShards + 1);
        for (std::uint32_t i = genesisShardIndex; i <= maxShardIndex; ++i)
        {
            if (complete_.find(i) == complete_.end() &&
                (!incomplete_ || incomplete_->index() != i))
                available.push_back(i);
        }
        if (!available.empty())
            return available[rand_int(0u,
                static_cast<std::uint32_t>(available.size() - 1))];
    }
    // Large, sparse index space to sample.
    // Keep choosing indexes at random until an available one is found.
    // With at least half the index space free, the odds of needing more
    // than 30 of these 40 attempts are under 1 in a billion.
    for (int i = 0; i < 40; ++i)
    {
        auto const r = rand_int(genesisShardIndex, maxShardIndex);
        if (complete_.find(r) == complete_.end() &&
            (!incomplete_ || incomplete_->index() != r))
            return r;
    }
    assert(0);
    return boost::none;
}
// Recompute cached statistics after the shard set changes:
// - status_: compact textual list of complete shard indexes, with
//   consecutive runs collapsed to "first-last" form
// - avgShardSz_: average on-disk size of a complete shard
// - fdLimit_: projected file descriptor requirement
// - canAdd_: cleared once the configured maximum size is reached
// Lock must be held.
void
DatabaseShardImp::updateStats(std::lock_guard<std::mutex>&)
{
    // Calculate shard file sizes and update status string
    std::uint32_t filesPerShard {0};
    if (!complete_.empty())
    {
        status_.clear();
        filesPerShard = complete_.begin()->second->fdlimit();
        std::uint64_t avgShardSz {0};
        for (auto it = complete_.begin(); it != complete_.end(); ++it)
        {
            if (it == complete_.begin())
                status_ = std::to_string(it->first);
            else
            {
                if (it->first - std::prev(it)->first > 1)
                {
                    // Gap: close any open range, then start a new entry
                    if (status_.back() == '-')
                        status_ += std::to_string(std::prev(it)->first);
                    status_ += "," + std::to_string(it->first);
                }
                else
                {
                    // Consecutive: open (or extend) a "first-last" range
                    if (status_.back() != '-')
                        status_ += "-";
                    if (std::next(it) == complete_.end())
                        status_ += std::to_string(it->first);
                }
            }
            avgShardSz += it->second->fileSize();
        }
        if (backed_)
            avgShardSz_ = avgShardSz / complete_.size();
    }
    else if(incomplete_)
        filesPerShard = incomplete_->fdlimit();
    // Non-backed (ephemeral) stores need no disk or fd accounting
    if (!backed_)
        return;
    fdLimit_ = 1 + (filesPerShard *
        (complete_.size() + (incomplete_ ? 1 : 0)));
    if (usedDiskSpace_ >= maxDiskSpace_)
    {
        JLOG(j_.warn()) <<
            "Maximum size reached";
        canAdd_ = false;
    }
    else
    {
        // Project fd needs for the shards that could still be added
        auto const sz = maxDiskSpace_ - usedDiskSpace_;
        if (sz > space(dir_).free)
        {
            JLOG(j_.warn()) <<
                "Max Shard Store size exceeds "
                "remaining free disk space";
        }
        fdLimit_ += (filesPerShard * (sz / avgShardSz_));
    }
}
std::pair<std::shared_ptr<PCache>, std::shared_ptr<NCache>>
DatabaseShardImp::selectCache(std::uint32_t seq)
{
std::pair<std::shared_ptr<PCache>,
std::shared_ptr<NCache>> cache;
auto const shardIndex {seqToShardIndex(seq)};
{
std::lock_guard<std::mutex> l(m_);
assert(init_);
auto it = complete_.find(shardIndex);
if (it != complete_.end())
cache = std::make_pair(it->second->pCache(),
it->second->nCache());
else if (incomplete_ && incomplete_->index() == shardIndex)
cache = std::make_pair(incomplete_->pCache(),
incomplete_->nCache());
}
return cache;
}
} // NodeStore
} // ripple

View File

@@ -0,0 +1,171 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2017 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
ANY SPECIAL , DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
#ifndef RIPPLE_NODESTORE_DATABASESHARDIMP_H_INCLUDED
#define RIPPLE_NODESTORE_DATABASESHARDIMP_H_INCLUDED
#include <ripple/nodestore/DatabaseShard.h>
#include <ripple/nodestore/impl/Shard.h>
namespace ripple {
namespace NodeStore {
class DatabaseShardImp : public DatabaseShard
{
public:
DatabaseShardImp() = delete;
DatabaseShardImp(DatabaseShardImp const&) = delete;
DatabaseShardImp& operator=(DatabaseShardImp const&) = delete;
DatabaseShardImp(Application& app, std::string const& name,
Stoppable& parent, Scheduler& scheduler, int readThreads,
Section const& config, beast::Journal j);
~DatabaseShardImp() override;
bool
init() override;
boost::optional<std::uint32_t>
prepare(std::uint32_t validLedgerSeq) override;
std::shared_ptr<Ledger>
fetchLedger(uint256 const& hash, std::uint32_t seq) override;
void
setStored(std::shared_ptr<Ledger const> const& ledger) override;
bool
contains(std::uint32_t seq) override;
std::string
getCompleteShards() override;
void
validate() override;
std::string
getName() const override
{
return "shardstore";
}
void
import(Database& source) override
{
Throw<std::runtime_error>("Shard store import not supported");
}
std::int32_t
getWriteLoad() const override;
void
store(NodeObjectType type, Blob&& data,
uint256 const& hash, std::uint32_t seq) override;
std::shared_ptr<NodeObject>
fetch(uint256 const& hash, std::uint32_t seq) override;
bool
asyncFetch(uint256 const& hash, std::uint32_t seq,
std::shared_ptr<NodeObject>& object) override;
bool
copyLedger(std::shared_ptr<Ledger const> const& ledger) override;
int
getDesiredAsyncReadCount(std::uint32_t seq) override;
float
getCacheHitRate() override;
void
tune(int size, int age) override;
void
sweep() override;
private:
Application& app_;
mutable std::mutex m_;
bool init_ {false};
std::map<std::uint32_t, std::unique_ptr<Shard>> complete_;
std::unique_ptr<Shard> incomplete_;
Section const config_;
boost::filesystem::path dir_;
// If new shards can be stored
bool canAdd_ {true};
// Complete shard indexes
std::string status_;
// If backend type uses permanent storage
bool backed_;
// Maximum disk space the DB can use (in bytes)
std::uint64_t const maxDiskSpace_;
// Disk space used to store the shards (in bytes)
std::uint64_t usedDiskSpace_ {0};
// Average disk space a shard requires (in bytes)
std::uint64_t avgShardSz_;
// Shard cache tuning
int cacheSz_ {shardCacheSz};
PCache::clock_type::rep cacheAge_ {shardCacheSeconds};
std::shared_ptr<NodeObject>
fetchFrom(uint256 const& hash, std::uint32_t seq) override;
void
for_each(std::function <void(std::shared_ptr<NodeObject>)> f) override
{
Throw<std::runtime_error>("Shard store import not supported");
}
// Finds a random shard index that is not stored
// Lock must be held
boost::optional<std::uint32_t>
findShardIndexToAdd(std::uint32_t validLedgerSeq,
std::lock_guard<std::mutex>&);
// Updates stats
// Lock must be held
void
updateStats(std::lock_guard<std::mutex>&);
std::pair<std::shared_ptr<PCache>, std::shared_ptr<NCache>>
selectCache(std::uint32_t seq);
// Returns the tune cache size divided by the number of shards
// Lock must be held
int
calcTargetCacheSz(std::lock_guard<std::mutex>&) const
{
return std::max(shardCacheSz, cacheSz_ / std::max(
1, static_cast<int>(complete_.size() + (incomplete_ ? 1 : 0))));
}
};
} // NodeStore
} // ripple
#endif

View File

@@ -19,7 +19,7 @@
#include <BeastConfig.h>
#include <ripple/nodestore/impl/ManagerImp.h>
#include <ripple/nodestore/impl/DatabaseRotatingImp.h>
#include <ripple/nodestore/impl/DatabaseNodeImp.h>
namespace ripple {
namespace NodeStore {
@@ -54,30 +54,16 @@ ManagerImp::make_Backend (
Scheduler& scheduler,
beast::Journal journal)
{
std::unique_ptr <Backend> backend;
std::string const type {get<std::string>(parameters, "type")};
if (type.empty())
missing_backend();
std::string const type (get<std::string>(parameters, "type"));
auto factory {find(type)};
if(!factory)
missing_backend();
if (! type.empty ())
{
Factory* const factory (find (type));
if (factory != nullptr)
{
backend = factory->createInstance (
NodeObject::keyBytes, parameters, scheduler, journal);
}
else
{
missing_backend ();
}
}
else
{
missing_backend ();
}
return backend;
return factory->createInstance(
NodeObject::keyBytes, parameters, scheduler, journal);
}
std::unique_ptr <Database>
@@ -89,53 +75,18 @@ ManagerImp::make_Database (
Section const& backendParameters,
beast::Journal journal)
{
return std::make_unique <DatabaseImp> (
auto backend {make_Backend(
backendParameters, scheduler, journal)};
backend->open();
return std::make_unique <DatabaseNodeImp> (
name,
scheduler,
readThreads,
parent,
make_Backend (
backendParameters,
scheduler,
journal),
std::move(backend),
journal);
}
std::unique_ptr <DatabaseRotating>
ManagerImp::make_DatabaseRotating (
std::string const& name,
Scheduler& scheduler,
std::int32_t readThreads,
Stoppable& parent,
std::shared_ptr <Backend> writableBackend,
std::shared_ptr <Backend> archiveBackend,
beast::Journal journal)
{
return std::make_unique <DatabaseRotatingImp> (
name,
scheduler,
readThreads,
parent,
writableBackend,
archiveBackend,
journal);
}
// Locate a registered backend factory by case-insensitive name.
// Returns nullptr when no factory matches.
Factory*
ManagerImp::find (std::string const& name)
{
    std::lock_guard<std::mutex> _(mutex_);
    for (auto* factory : list_)
    {
        if (beast::detail::iequals(name, factory->getName()))
            return factory;
    }
    return nullptr;
}
void
ManagerImp::insert (Factory& factory)
{
@@ -153,6 +104,20 @@ ManagerImp::erase (Factory& factory)
list_.erase(iter);
}
Factory*
ManagerImp::find (std::string const& name)
{
std::lock_guard<std::mutex> _(mutex_);
auto const iter = std::find_if(list_.begin(), list_.end(),
[&name](Factory* other)
{
return beast::detail::iequals(name, other->getName());
} );
if (iter == list_.end())
return nullptr;
return *iter;
}
//------------------------------------------------------------------------------
Manager&

View File

@@ -45,7 +45,7 @@ public:
~ManagerImp();
Factory*
find (std::string const& name);
find (std::string const& name) override;
void
insert (Factory& factory) override;
@@ -67,16 +67,6 @@ public:
Stoppable& parent,
Section const& backendParameters,
beast::Journal journal) override;
std::unique_ptr <DatabaseRotating>
make_DatabaseRotating (
std::string const& name,
Scheduler& scheduler,
std::int32_t readThreads,
Stoppable& parent,
std::shared_ptr <Backend> writableBackend,
std::shared_ptr <Backend> archiveBackend,
beast::Journal journal) override;
};
}

View File

@@ -0,0 +1,454 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2017 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
#include <BeastConfig.h>
#include <ripple/nodestore/impl/Shard.h>
#include <ripple/app/ledger/InboundLedger.h>
#include <ripple/nodestore/impl/DatabaseShardImp.h>
#include <ripple/nodestore/Manager.h>
namespace ripple {
namespace NodeStore {
// Construct a shard covering a fixed range of ledger sequences.
// The shard owns a positive (node object) cache and a negative
// (known-missing key) cache, both keyed by node hash.
//   index    Shard index; must not precede the shard holding genesisSeq.
//   cacheSz  Target size for both caches.
//   cacheAge Target age for both caches.
//   j        Journal used for logging; stored by value.
Shard::Shard(std::uint32_t index, int cacheSz,
    PCache::clock_type::rep cacheAge,
    beast::Journal& j)
    : index_(index)
    // Clamp so the genesis shard begins at genesisSeq rather than
    // at its nominal first sequence.
    , firstSeq_(std::max(genesisSeq,
        DatabaseShard::firstSeq(index)))
    , lastSeq_(std::max(firstSeq_,
        DatabaseShard::lastSeq(index)))
    , pCache_(std::make_shared<PCache>(
        "shard " + std::to_string(index_),
        cacheSz, cacheAge, stopwatch(), j))
    , nCache_(std::make_shared<NCache>(
        "shard " + std::to_string(index_),
        stopwatch(), cacheSz, cacheAge))
    , j_(j)
{
    // Shards before the one containing the genesis ledger cannot
    // exist.
    if (index_ < DatabaseShard::seqToShardIndex(genesisSeq))
        Throw<std::runtime_error>("Shard: Invalid index");
}
// Open (or create) this shard's backend under `dir`/<index>.
// Returns false on any error. For backends that consume file
// descriptors, an incomplete shard is tracked by a "control" file
// holding the serialized set of stored ledger sequences; an
// existing shard directory with no control file is treated as
// complete.
bool
Shard::open(Section config, Scheduler& scheduler,
    boost::filesystem::path dir)
{
    assert(!backend_);
    using namespace boost::filesystem;
    dir_ = dir / std::to_string(index_);
    config.set("path", dir_.string());
    // A shard is new if its directory is missing or empty.
    auto newShard {!is_directory(dir_) || is_empty(dir_)};
    try
    {
        backend_ = Manager::instance().make_Backend(
            config, scheduler, j_);
        backend_->open();
    }
    catch (std::exception const& e)
    {
        JLOG(j_.error()) <<
            "shard " << index_ <<
            " exception: " << e.what();
        return false;
    }

    // Backends with no file descriptor footprint (e.g. in-memory)
    // keep no on-disk control file.
    if (backend_->fdlimit() == 0)
        return true;

    control_ = dir_ / controlFileName;
    if (newShard)
    {
        // Persist the (initially empty) set of stored sequences.
        if (!saveControl())
            return false;
    }
    else if (is_regular_file(control_))
    {
        // An existing control file marks an incomplete shard;
        // restore the set of sequences already stored.
        std::ifstream ifs(control_.string());
        if (!ifs.is_open())
        {
            JLOG(j_.error()) <<
                "shard " << index_ <<
                " unable to open control file";
            return false;
        }
        boost::archive::text_iarchive ar(ifs);
        ar & storedSeqs_;
        if (!storedSeqs_.empty())
        {
            // Sanity check: every recorded sequence must lie in
            // this shard's range.
            if (boost::icl::first(storedSeqs_) < firstSeq_ ||
                boost::icl::last(storedSeqs_) > lastSeq_)
            {
                JLOG(j_.error()) <<
                    "shard " << index_ <<
                    " invalid control file";
                return false;
            }
            // The genesis shard holds fewer ledgers since it
            // starts at genesisSeq, not at its nominal first
            // sequence.
            auto const genesisShardIndex {
                DatabaseShard::seqToShardIndex(genesisSeq)};
            auto const genesisNumLedgers {
                DatabaseShard::ledgersPerShard() - (
                    genesisSeq - DatabaseShardImp::firstSeq(
                        genesisShardIndex))};
            if (boost::icl::length(storedSeqs_) ==
                (index_ == genesisShardIndex ? genesisNumLedgers :
                    DatabaseShard::ledgersPerShard()))
            {
                // The shard is actually complete; a complete shard
                // should have no control file. Drop it and mark
                // the shard complete.
                JLOG(j_.error()) <<
                    "shard " << index_ <<
                    " found control file for complete shard";
                storedSeqs_.clear();
                remove(control_);
                complete_ = true;
            }
        }
    }
    else
        // Existing shard without a control file: it is complete.
        complete_ = true;
    updateFileSize();
    return true;
}
// Record that ledger `l` has been stored in this shard. Returns
// false if the ledger was already recorded or the control file
// could not be updated. Marks the shard complete when `l` is the
// final missing ledger in its range.
bool
Shard::setStored(std::shared_ptr<Ledger const> const& l)
{
    assert(backend_&& !complete_);
    if (boost::icl::contains(storedSeqs_, l->info().seq))
    {
        JLOG(j_.debug()) <<
            "shard " << index_ <<
            " ledger seq " << l->info().seq <<
            " already stored";
        return false;
    }
    // The genesis shard holds fewer ledgers because it starts at
    // genesisSeq rather than at its nominal first sequence.
    auto const genesisShardIndex {
        DatabaseShard::seqToShardIndex(genesisSeq)};
    auto const genesisNumLedgers {
        DatabaseShard::ledgersPerShard() - (
            genesisSeq - DatabaseShardImp::firstSeq(
                genesisShardIndex))};
    // If every other ledger in range is already recorded, this
    // ledger completes the shard.
    if (boost::icl::length(storedSeqs_) >=
        (index_ == genesisShardIndex ? genesisNumLedgers :
            DatabaseShard::ledgersPerShard()) - 1)
    {
        if (backend_->fdlimit() != 0)
        {
            // A complete shard keeps no control file.
            remove(control_);
            updateFileSize();
        }
        complete_ = true;
        storedSeqs_.clear();

        JLOG(j_.debug()) <<
            "shard " << index_ << " complete";
    }
    else
    {
        storedSeqs_.insert(l->info().seq);
        // Remember the most recently stored ledger; used as an
        // optimization for visitDifferences.
        lastStored_ = l;
        // Persist the updated sequence set for crash recovery.
        if (backend_->fdlimit() != 0 && !saveControl())
            return false;
    }
    JLOG(j_.debug()) <<
        "shard " << index_ <<
        " ledger seq " << l->info().seq <<
        " stored";
    return true;
}
boost::optional<std::uint32_t>
Shard::prepare()
{
    // Acquire ledgers newest-first: with something already stored,
    // report the previous missing sequence; otherwise start from
    // the end of the shard's range.
    if (! storedSeqs_.empty())
        return prevMissing(storedSeqs_, 1 + lastSeq_, firstSeq_);
    return lastSeq_;
}
bool
Shard::contains(std::uint32_t seq) const
{
    // Sequences outside the shard's range can never be present.
    bool const inRange {seq >= firstSeq_ && seq <= lastSeq_};
    if (! inRange)
        return false;
    // A complete shard holds every ledger in its range; otherwise
    // consult the set of individually stored sequences.
    return complete_ || boost::icl::contains(storedSeqs_, seq);
}
// Walk every ledger stored in this shard, newest to oldest,
// verifying the parent-hash chain and the integrity of each
// ledger's SHAMaps. Results are logged; shard state is unchanged.
void
Shard::validate(Application& app)
{
    uint256 hash;
    std::uint32_t seq;
    std::shared_ptr<Ledger> l;
    // Find the hash of the last ledger in this shard
    {
        std::tie(l, seq, hash) = loadLedgerHelper(
            "WHERE LedgerSeq >= " + std::to_string(lastSeq_) +
            " order by LedgerSeq desc limit 1", app);
        if (!l)
        {
            JLOG(j_.fatal()) <<
                "shard " << index_ <<
                " unable to validate. No lookup data";
            return;
        }
        if (seq != lastSeq_)
        {
            // The lookup returned a later ledger; derive the hash
            // of this shard's last ledger from that ledger's
            // skip list.
            l->setImmutable(app.config());
            boost::optional<uint256> h;
            try
            {
                h = hashOfSeq(*l, lastSeq_, j_);
            }
            catch (std::exception const& e)
            {
                JLOG(j_.fatal()) <<
                    "exception: " << e.what();
                return;
            }
            if (!h)
            {
                JLOG(j_.fatal()) <<
                    "shard " << index_ <<
                    " No hash for last ledger seq " << lastSeq_;
                return;
            }
            hash = *h;
            seq = lastSeq_;
        }
    }

    JLOG(j_.fatal()) <<
        "Validating shard " << index_ <<
        " ledgers " << firstSeq_ <<
        "-" << lastSeq_;

    // Use a short age to keep memory consumption low
    PCache::clock_type::rep const savedAge {pCache_->getTargetAge()};
    pCache_->setTargetAge(1);

    // Validate every ledger stored in this shard, walking
    // backwards through each ledger's parent hash.
    std::shared_ptr<Ledger const> next;
    while (seq >= firstSeq_)
    {
        auto nObj = valFetch(hash);
        if (!nObj)
            break;
        l = std::make_shared<Ledger>(
            InboundLedger::deserializeHeader(makeSlice(nObj->getData()),
                true), app.config(), *app.shardFamily());
        if (l->info().hash != hash || l->info().seq != seq)
        {
            JLOG(j_.fatal()) <<
                "ledger seq " << seq <<
                " hash " << hash <<
                " cannot be a ledger";
            break;
        }
        l->stateMap().setLedgerSeq(seq);
        l->txMap().setLedgerSeq(seq);
        l->setImmutable(app.config());
        // Each map's root must be fetchable before it is walked.
        if (!l->stateMap().fetchRoot(
            SHAMapHash {l->info().accountHash}, nullptr))
        {
            JLOG(j_.fatal()) <<
                "ledger seq " << seq <<
                " missing Account State root";
            break;
        }
        if (l->info().txHash.isNonZero())
        {
            if (!l->txMap().fetchRoot(
                SHAMapHash {l->info().txHash}, nullptr))
            {
                JLOG(j_.fatal()) <<
                    "ledger seq " << seq <<
                    " missing TX root";
                break;
            }
        }
        if (!valLedger(l, next))
            break;
        hash = l->info().parentHash;
        --seq;
        next = l;
        // Periodically sweep the cache to bound memory use.
        if (seq % 128 == 0)
            pCache_->sweep();
    }
    // Report: the whole range was verified, or the walk stopped
    // early in a shard that claimed completeness (invalid) or
    // is simply incomplete.
    if (seq < firstSeq_)
    {
        JLOG(j_.fatal()) <<
            "shard " << index_ <<
            " is complete.";
    }
    else if (complete_)
    {
        JLOG(j_.fatal()) <<
            "shard " << index_ <<
            " is invalid, failed on seq " << seq <<
            " hash " << hash;
    }
    else
    {
        JLOG(j_.fatal()) <<
            "shard " << index_ <<
            " is incomplete, stopped at seq " << seq <<
            " hash " << hash;
    }
    // Drop validation residue and restore cache configuration.
    pCache_->reset();
    nCache_->reset();
    pCache_->setTargetAge(savedAge);
}
// Validate a single ledger by walking its SHAMaps and verifying
// that every merkle tree node can be fetched from the backend.
//   l     The ledger to validate.
//   next  The sequentially next ledger, if already validated; when
//         it directly follows `l`, only state map differences are
//         visited.
// Returns true if the ledger's state and transaction maps are
// intact.
bool
Shard::valLedger(std::shared_ptr<Ledger const> const& l,
    std::shared_ptr<Ledger const> const& next)
{
    if (l->info().hash.isZero() || l->info().accountHash.isZero())
    {
        JLOG(j_.fatal()) <<
            "invalid ledger";
        return false;
    }
    bool error {false};
    // Fetch every visited node; any miss aborts the walk.
    auto f = [&, this](SHAMapAbstractNode& node) {
        if (!valFetch(node.getNodeHash().as_uint256()))
            error = true;
        return !error;
    };
    // Validate the state map
    if (l->stateMap().getHash().isNonZero())
    {
        if (!l->stateMap().isValid())
        {
            JLOG(j_.error()) <<
                "invalid state map";
            return false;
        }
        try
        {
            // When `next` follows directly, only the nodes that
            // differ between the two state maps need visiting.
            if (next && next->info().parentHash == l->info().hash)
                l->stateMap().visitDifferences(&next->stateMap(), f);
            else
                l->stateMap().visitNodes(f);
        }
        catch (std::exception const& e)
        {
            JLOG(j_.fatal()) <<
                "exception: " << e.what();
            return false;
        }
        if (error)
            return false;
    }
    // Validate the tx map
    if (l->info().txHash.isNonZero())
    {
        if (!l->txMap().isValid())
        {
            JLOG(j_.error()) <<
                "invalid transaction map";
            return false;
        }
        try
        {
            l->txMap().visitNodes(f);
        }
        catch (std::exception const& e)
        {
            JLOG(j_.fatal()) <<
                "exception: " << e.what();
            return false;
        }
        if (error)
            return false;
    }
    return true;
}
// Fetch a node object from the backend, logging any non-success
// status. Returns a null pointer when the object is unavailable.
std::shared_ptr<NodeObject>
Shard::valFetch(uint256 const& hash)
{
    assert(backend_);
    std::shared_ptr<NodeObject> fetched;
    try
    {
        auto const status = backend_->fetch(hash.begin(), &fetched);
        if (status == notFound)
        {
            JLOG(j_.fatal()) <<
                "NodeObject not found. hash " << hash;
        }
        else if (status == dataCorrupt)
        {
            JLOG(j_.fatal()) <<
                "NodeObject is corrupt. hash " << hash;
        }
        else if (status != ok)
        {
            JLOG(j_.fatal()) <<
                "unknown error. hash " << hash;
        }
    }
    catch (std::exception const& e)
    {
        JLOG(j_.fatal()) <<
            "exception: " << e.what();
    }
    return fetched;
}
void
Shard::updateFileSize()
{
fileSize_ = 0;
using namespace boost::filesystem;
for (auto const& d : directory_iterator(dir_))
if (is_regular_file(d))
fileSize_ += file_size(d);
}
bool
Shard::saveControl()
{
std::ofstream ofs {control_.string(), std::ios::trunc};
if (!ofs.is_open())
{
JLOG(j_.fatal()) <<
"shard " << index_ <<
" unable to save control file";
return false;
}
boost::archive::text_oarchive ar(ofs);
ar & storedSeqs_;
return true;
}
} // NodeStore
} // ripple

View File

@@ -0,0 +1,168 @@
//------------------------------------------------------------------------------
/*
This file is part of rippled: https://github.com/ripple/rippled
Copyright (c) 2012, 2017 Ripple Labs Inc.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted, provided that the above
copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
#ifndef RIPPLE_NODESTORE_SHARD_H_INCLUDED
#define RIPPLE_NODESTORE_SHARD_H_INCLUDED
#include <ripple/app/ledger/Ledger.h>
#include <ripple/basics/BasicConfig.h>
#include <ripple/basics/RangeSet.h>
#include <ripple/nodestore/NodeObject.h>
#include <ripple/nodestore/Scheduler.h>
#include <boost/filesystem.hpp>
#include <boost/serialization/map.hpp>
#include <boost/archive/text_oarchive.hpp>
#include <boost/archive/text_iarchive.hpp>
namespace ripple {
namespace NodeStore {
using PCache = TaggedCache<uint256, NodeObject>;
using NCache = KeyCache<uint256>;
/* A range of historical ledgers backed by a node store.
   Each shard is identified by its index and holds `ledgersPerShard`
   ledgers: shard `i` stores ledgers with sequences from
   `1 + (i * ledgersPerShard)` through `(i + 1) * ledgersPerShard`,
   inclusive. Once a shard holds all of its ledgers, it is never
   written to again.
*/
class Shard
{
public:
    // Construct a shard for the given index; both caches are
    // sized and aged per the supplied parameters.
    Shard(std::uint32_t index, int cacheSz,
        PCache::clock_type::rep cacheAge,
        beast::Journal& j);

    // Open or create the shard's backend under `dir`.
    // Returns false on failure.
    bool
    open(Section config, Scheduler& scheduler,
        boost::filesystem::path dir);

    // Record that ledger `l` has been stored in this shard.
    // Returns false if already recorded or on persistence failure.
    bool
    setStored(std::shared_ptr<Ledger const> const& l);

    // Return the sequence of a ledger this shard still needs,
    // or boost::none.
    boost::optional<std::uint32_t>
    prepare();

    // True if the given sequence is within range and stored
    // (always true within range once the shard is complete).
    bool
    contains(std::uint32_t seq) const;

    // Walk and verify every ledger stored in this shard,
    // logging the outcome.
    void
    validate(Application& app);

    std::uint32_t
    index() const {return index_;}

    bool
    complete() const {return complete_;}

    std::shared_ptr<PCache>&
    pCache() {return pCache_;}

    std::shared_ptr<NCache>&
    nCache() {return nCache_;}

    std::uint64_t
    fileSize() const {return fileSize_;}

    std::shared_ptr<Backend> const&
    getBackend() const
    {
        assert(backend_);
        return backend_;
    }

    std::uint32_t
    fdlimit() const
    {
        assert(backend_);
        return backend_->fdlimit();
    }

    std::shared_ptr<Ledger const>
    lastStored() {return lastStored_;}

private:
    // Boost serialization persists storedSeqs_ in the control file.
    friend class boost::serialization::access;
    template<class Archive>
    void serialize(Archive & ar, const unsigned int version)
    {
        ar & storedSeqs_;
    }

    static constexpr auto controlFileName = "control.txt";

    // Shard Index
    std::uint32_t const index_;

    // First ledger sequence in this shard
    std::uint32_t const firstSeq_;

    // Last ledger sequence in this shard
    std::uint32_t const lastSeq_;

    // Database positive cache
    std::shared_ptr<PCache> pCache_;

    // Database negative cache
    std::shared_ptr<NCache> nCache_;

    // Total on-disk footprint of the backend's files
    std::uint64_t fileSize_ {0};
    std::shared_ptr<Backend> backend_;
    beast::Journal j_;

    // Path to database files
    boost::filesystem::path dir_;

    // True if shard has its entire ledger range stored
    bool complete_ {false};

    // Sequences of ledgers stored with an incomplete shard
    RangeSet<std::uint32_t> storedSeqs_;

    // Path to control file
    boost::filesystem::path control_;

    // Used as an optimization for visitDifferences
    std::shared_ptr<Ledger const> lastStored_;

    // Validate this ledger by walking its SHAMaps
    // and verifying each merkle tree
    bool
    valLedger(std::shared_ptr<Ledger const> const& l,
        std::shared_ptr<Ledger const> const& next);

    // Fetches from the backend and will log
    // errors based on status codes
    std::shared_ptr<NodeObject>
    valFetch(uint256 const& hash);

    // Calculate the file foot print of the backend files
    void
    updateFileSize();

    // Save the control file for an incomplete shard
    bool
    saveControl();
};
} // NodeStore
} // ripple
#endif

View File

@@ -35,6 +35,9 @@ enum
,asyncDivider = 8
};
auto constexpr shardCacheSz = 16384;
auto constexpr shardCacheSeconds = 60;
}
}

View File

@@ -99,6 +99,8 @@ public:
virtual uint256 const& getClosedLedgerHash () const = 0;
virtual bool hasLedger (uint256 const& hash, std::uint32_t seq) const = 0;
virtual void ledgerRange (std::uint32_t& minSeq, std::uint32_t& maxSeq) const = 0;
virtual bool hasShard (std::uint32_t shardIndex) const = 0;
virtual std::string getShards() const = 0;
virtual bool hasTxSet (uint256 const& hash) const = 0;
virtual void cycleStatus () = 0;
virtual bool supportsVersion (int version) = 0;

View File

@@ -18,12 +18,14 @@
//==============================================================================
#include <BeastConfig.h>
#include <ripple/app/ledger/LedgerMaster.h>
#include <ripple/app/misc/HashRouter.h>
#include <ripple/app/misc/NetworkOPs.h>
#include <ripple/app/misc/ValidatorList.h>
#include <ripple/basics/make_SSLContext.h>
#include <ripple/beast/core/LexicalCast.h>
#include <ripple/core/DatabaseCon.h>
#include <ripple/nodestore/DatabaseShard.h>
#include <ripple/overlay/Cluster.h>
#include <ripple/overlay/predicates.h>
#include <ripple/overlay/impl/ConnectAttempt.h>
@@ -784,8 +786,19 @@ OverlayImpl::crawl()
}
}
auto version = sp->getVersion ();
if (!version.empty ())
pv["version"] = version;
if (! version.empty ())
pv[jss::version] = version;
std::uint32_t minSeq, maxSeq;
sp->ledgerRange(minSeq, maxSeq);
if (minSeq != 0 || maxSeq != 0)
pv[jss::complete_ledgers] =
std::to_string(minSeq) + "-" +
std::to_string(maxSeq);
auto shards = sp->getShards();
if (! shards.empty())
pv[jss::complete_shards] = shards;
});
return jv;

View File

@@ -32,11 +32,14 @@
#include <ripple/app/tx/apply.h>
#include <ripple/basics/random.h>
#include <ripple/basics/UptimeTimer.h>
#include <ripple/beast/core/LexicalCast.h>
#include <ripple/beast/core/SemanticVersion.h>
#include <ripple/nodestore/DatabaseShard.h>
#include <ripple/overlay/Cluster.h>
#include <ripple/protocol/digest.h>
#include <boost/algorithm/string/predicate.hpp>
#include <boost/algorithm/string.hpp>
#include <algorithm>
#include <memory>
#include <sstream>
@@ -298,8 +301,8 @@ PeerImp::json()
ledgerRange(minSeq, maxSeq);
if ((minSeq != 0) || (maxSeq != 0))
ret[jss::complete_ledgers] = boost::lexical_cast<std::string>(minSeq) +
" - " + boost::lexical_cast<std::string>(maxSeq);
ret[jss::complete_ledgers] = std::to_string(minSeq) +
" - " + std::to_string(maxSeq);
if (closedLedgerHash_ != zero)
ret[jss::ledger] = to_string (closedLedgerHash_);
@@ -362,8 +365,11 @@ PeerImp::hasLedger (uint256 const& hash, std::uint32_t seq) const
if ((seq != 0) && (seq >= minLedger_) && (seq <= maxLedger_) &&
(sanity_.load() == Sanity::sane))
return true;
return std::find (recentLedgers_.begin(),
recentLedgers_.end(), hash) != recentLedgers_.end();
if (std::find(recentLedgers_.begin(),
recentLedgers_.end(), hash) != recentLedgers_.end())
return true;
return seq != 0 && boost::icl::contains(
shards_, NodeStore::DatabaseShard::seqToShardIndex(seq));
}
void
@@ -376,6 +382,24 @@ PeerImp::ledgerRange (std::uint32_t& minSeq,
maxSeq = maxLedger_;
}
// Returns true if the peer has advertised possession of the given
// shard index.
bool
PeerImp::hasShard (std::uint32_t shardIndex) const
{
    // shards_ is guarded by recentLock_.
    std::lock_guard<std::mutex> sl(recentLock_);
    return boost::icl::contains(shards_, shardIndex);
}
// Returns the peer's advertised shard indexes formatted as a
// range string (e.g. "1-3,5"), or an empty string if none.
std::string
PeerImp::getShards () const
{
    std::lock_guard<std::mutex> sl(recentLock_);
    if (shards_.empty())
        return {};
    return to_string(shards_);
}
bool
PeerImp::hasTxSet (uint256 const& hash) const
{
@@ -1355,6 +1379,25 @@ PeerImp::onMessage (std::shared_ptr <protocol::TMStatusChange> const& m)
minLedger_ = 0;
}
if (m->has_shardseqs())
{
shards_.clear();
std::vector<std::string> tokens;
boost::split(tokens, m->shardseqs(), boost::algorithm::is_any_of(","));
for (auto const& t : tokens)
{
std::vector<std::string> seqs;
boost::split(seqs, t, boost::algorithm::is_any_of("-"));
if (seqs.size() == 1)
shards_.insert(
beast::lexicalCastThrow<std::uint32_t>(seqs.front()));
else if (seqs.size() == 2)
shards_.insert(range(
beast::lexicalCastThrow<std::uint32_t>(seqs.front()),
beast::lexicalCastThrow<std::uint32_t>(seqs.back())));
}
}
if (m->has_ledgerseq() &&
app_.getLedgerMaster().getValidatedLedgerAge() < 2min)
{
@@ -1430,6 +1473,9 @@ PeerImp::onMessage (std::shared_ptr <protocol::TMStatusChange> const& m)
Json::UInt (m->lastseq ());
}
if (m->has_shardseqs())
j[jss::complete_shards] = m->shardseqs();
return j;
});
}
@@ -1638,7 +1684,6 @@ PeerImp::onMessage (std::shared_ptr <protocol::TMGetObjectByHash> const& m)
return;
}
if (packet.type () == protocol::TMGetObjectByHash::otFETCH_PACK)
{
doFetchPack (m);
@@ -1651,8 +1696,8 @@ PeerImp::onMessage (std::shared_ptr <protocol::TMGetObjectByHash> const& m)
reply.set_query (false);
if (packet.has_seq ())
reply.set_seq (packet.seq ());
if (packet.has_seq())
reply.set_seq(packet.seq());
reply.set_type (packet.type ());
@@ -1662,17 +1707,20 @@ PeerImp::onMessage (std::shared_ptr <protocol::TMGetObjectByHash> const& m)
// This is a very minimal implementation
for (int i = 0; i < packet.objects_size (); ++i)
{
uint256 hash;
const protocol::TMIndexedObject& obj = packet.objects (i);
auto const& obj = packet.objects (i);
if (obj.has_hash () && (obj.hash ().size () == (256 / 8)))
{
uint256 hash;
memcpy (hash.begin (), obj.hash ().data (), 256 / 8);
// VFALCO TODO Move this someplace more sensible so we dont
// need to inject the NodeStore interfaces.
std::shared_ptr<NodeObject> hObj =
app_.getNodeStore ().fetch (hash);
std::uint32_t seq {obj.has_ledgerseq() ? obj.ledgerseq() : 0};
auto hObj {app_.getNodeStore ().fetch (hash, seq)};
if (!hObj && seq >= NodeStore::genesisSeq)
{
if (auto shardStore = app_.getShardStore())
hObj = shardStore->fetch(hash, seq);
}
if (hObj)
{
protocol::TMIndexedObject& newObj = *reply.add_objects ();
@@ -1682,6 +1730,8 @@ PeerImp::onMessage (std::shared_ptr <protocol::TMGetObjectByHash> const& m)
if (obj.has_nodeid ())
newObj.set_index (obj.nodeid ());
if (obj.has_ledgerseq())
newObj.set_ledgerseq(obj.ledgerseq());
// VFALCO NOTE "seq" in the message is obsolete
}
@@ -2019,6 +2069,7 @@ PeerImp::getLedger (std::shared_ptr<protocol::TMGetLedger> const& m)
SHAMap const* map = nullptr;
protocol::TMLedgerData reply;
bool fatLeaves = true;
std::shared_ptr<Ledger const> ledger;
if (packet.has_requestcookie ())
reply.set_requestcookie (packet.requestcookie ());
@@ -2089,7 +2140,6 @@ PeerImp::getLedger (std::shared_ptr<protocol::TMGetLedger> const& m)
// Figure out what ledger they want
JLOG(p_journal_.trace()) << "GetLedger: Received";
std::shared_ptr<Ledger const> ledger;
if (packet.has_ledgerhash ())
{
@@ -2116,23 +2166,31 @@ PeerImp::getLedger (std::shared_ptr<protocol::TMGetLedger> const& m)
!packet.has_requestcookie ()))
{
std::uint32_t seq = 0;
if (packet.has_ledgerseq ())
seq = packet.ledgerseq ();
auto const v = getPeerWithLedger(
overlay_, ledgerhash, seq, this);
if (! v)
if (packet.has_ledgerseq())
{
JLOG(p_journal_.trace()) << "GetLedger: Cannot route";
seq = packet.ledgerseq();
if (seq >= NodeStore::genesisSeq)
{
if (auto shardStore = app_.getShardStore())
ledger = shardStore->fetchLedger(ledgerhash, seq);
}
}
if (! ledger)
{
auto const v = getPeerWithLedger(
overlay_, ledgerhash, seq, this);
if (! v)
{
JLOG(p_journal_.trace()) << "GetLedger: Cannot route";
return;
}
packet.set_requestcookie (id ());
v->send (std::make_shared<Message>(
packet, protocol::mtGET_LEDGER));
JLOG(p_journal_.debug()) << "GetLedger: Request routed";
return;
}
packet.set_requestcookie (id ());
v->send (std::make_shared<Message>(
packet, protocol::mtGET_LEDGER));
JLOG(p_journal_.debug()) << "GetLedger: Request routed";
return;
}
}
else if (packet.has_ledgerseq ())
@@ -2284,9 +2342,7 @@ PeerImp::getLedger (std::shared_ptr<protocol::TMGetLedger> const& m)
try
{
// We are guaranteed that map is non-null, but we need to check
// to keep the compiler happy.
if (map && map->getNodeFat (mn, nodeIDs, rawNodes, fatLeaves, depth))
if (map->getNodeFat(mn, nodeIDs, rawNodes, fatLeaves, depth))
{
assert (nodeIDs.size () == rawNodes.size ());
JLOG(p_journal_.trace()) <<
@@ -2308,8 +2364,10 @@ PeerImp::getLedger (std::shared_ptr<protocol::TMGetLedger> const& m)
}
}
else
{
JLOG(p_journal_.warn()) <<
"GetLedger: getNodeFat returns false";
}
}
catch (std::exception&)
{

View File

@@ -24,6 +24,7 @@
#include <ripple/basics/Log.h>
#include <ripple/beast/core/ByteOrder.h>
#include <ripple/beast/utility/WrappedSink.h>
#include <ripple/basics/RangeSet.h>
#include <ripple/overlay/impl/ProtocolMessage.h>
#include <ripple/overlay/impl/OverlayImpl.h>
#include <ripple/protocol/Protocol.h>
@@ -125,6 +126,7 @@ private:
//
LedgerIndex minLedger_ = 0;
LedgerIndex maxLedger_ = 0;
RangeSet<std::uint32_t> shards_;
uint256 closedLedgerHash_;
uint256 previousLedgerHash_;
std::deque<uint256> recentLedgers_;
@@ -296,6 +298,12 @@ public:
void
ledgerRange (std::uint32_t& minSeq, std::uint32_t& maxSeq) const override;
bool
hasShard (std::uint32_t shardIndex) const override;
std::string
getShards () const override;
bool
hasTxSet (uint256 const& hash) const override;

View File

@@ -178,6 +178,7 @@ message TMStatusChange
optional uint64 networkTime = 6;
optional uint32 firstSeq = 7;
optional uint32 lastSeq = 8;
optional string shardSeqs = 9;
}

View File

@@ -122,6 +122,7 @@ JSS ( code ); // out: errors
JSS ( command ); // in: RPCHandler
JSS ( complete ); // out: NetworkOPs, InboundLedger
JSS ( complete_ledgers ); // out: NetworkOPs, PeerImp
JSS ( complete_shards ); // out: OverlayImpl, PeerImp
JSS ( consensus ); // out: NetworkOPs, LedgerConsensus
JSS ( converge_time ); // out: NetworkOPs
JSS ( converge_time_s ); // out: NetworkOPs
@@ -384,6 +385,7 @@ JSS ( server_state ); // out: NetworkOPs
JSS ( server_status ); // out: NetworkOPs
JSS ( settle_delay ); // out: AccountChannels
JSS ( severity ); // in: LogLevel
JSS ( shards ); // out: GetCounts
JSS ( signature ); // out: NetworkOPs, ChannelAuthorize
JSS ( signature_verified ); // out: ChannelVerify
JSS ( signing_key ); // out: NetworkOPs

View File

@@ -29,6 +29,7 @@
#include <ripple/ledger/CachedSLEs.h>
#include <ripple/net/RPCErr.h>
#include <ripple/nodestore/Database.h>
#include <ripple/nodestore/DatabaseShard.h>
#include <ripple/protocol/ErrorCodes.h>
#include <ripple/protocol/JsonFields.h>
#include <ripple/rpc/Context.h>
@@ -125,6 +126,24 @@ Json::Value doGetCounts (RPC::Context& context)
ret[jss::node_written_bytes] = context.app.getNodeStore().getStoreSize();
ret[jss::node_read_bytes] = context.app.getNodeStore().getFetchSize();
if (auto shardStore = context.app.getShardStore())
{
Json::Value& jv = (ret[jss::shards] = Json::objectValue);
jv[jss::fullbelow_size] =
static_cast<int>(context.app.shardFamily()->fullbelow().size());
jv[jss::treenode_cache_size] =
context.app.shardFamily()->treecache().getCacheSize();
jv[jss::treenode_track_size] =
context.app.shardFamily()->treecache().getTrackSize();
ret[jss::write_load] = shardStore->getWriteLoad();
ret[jss::node_hit_rate] = shardStore->getCacheHitRate();
jv[jss::node_writes] = shardStore->getStoreCount();
jv[jss::node_reads_total] = shardStore->getFetchTotalCount();
jv[jss::node_reads_hit] = shardStore->getFetchHitCount();
jv[jss::node_written_bytes] = shardStore->getStoreSize();
jv[jss::node_read_bytes] = shardStore->getFetchSize();
}
return ret;
}

View File

@@ -94,7 +94,7 @@ Json::Value doLedgerRequest (RPC::Context& context)
// they want. Try to get it.
if (auto il = context.app.getInboundLedgers().acquire (
*refHash, refIndex, InboundLedger::fcGENERIC))
*refHash, refIndex, InboundLedger::Reason::GENERIC))
{
Json::Value jvResult = RPC::make_error(
rpcLGR_NOT_FOUND,
@@ -125,7 +125,7 @@ Json::Value doLedgerRequest (RPC::Context& context)
// Try to get the desired ledger
// Verify all nodes even if we think we have it
auto ledger = context.app.getInboundLedgers().acquire (
ledgerHash, ledgerIndex, InboundLedger::fcGENERIC);
ledgerHash, ledgerIndex, InboundLedger::Reason::GENERIC);
// In standalone mode, accept the ledger from the ledger cache
if (! ledger && context.app.config().standalone())

View File

@@ -62,13 +62,21 @@ public:
NodeStore::Database const&
db() const = 0;
virtual
bool
isShardBacked() const = 0;
virtual
void
missing_node (std::uint32_t refNum) = 0;
virtual
void
missing_node (uint256 const& refHash) = 0;
missing_node (uint256 const& refHash, std::uint32_t refNum) = 0;
virtual
void
reset () = 0;
};
} // ripple

View File

@@ -127,6 +127,12 @@ public:
++m_gen;
}
void reset ()
{
m_cache.clear();
m_gen = 1;
}
private:
KeyCache <Key> m_cache;
std::atomic <std::uint32_t> m_gen;

View File

@@ -87,6 +87,7 @@ private:
mutable SHAMapState state_;
SHAMapType type_;
bool backed_ = true; // Map is backed by the database
bool full_ = false; // Map is believed complete in database
public:
class version
@@ -122,6 +123,12 @@ public:
Family& f,
version v);
Family const&
family() const
{
return f_;
}
Family&
family()
{
@@ -145,11 +152,12 @@ public:
// Handles copy on write for mutable snapshots.
std::shared_ptr<SHAMap> snapShot (bool isMutable) const;
/* Sets metadata associated with the SHAMap
Marked `const` because the data is not part of
the map contents.
/* Mark this SHAMap as "should be full", indicating
that the local server wants all the corresponding nodes
in durable storage.
*/
void setFull ();
void setLedgerSeq (std::uint32_t lseq);
bool fetchRoot (SHAMapHash const& hash, SHAMapSyncFilter * filter);
@@ -399,6 +407,13 @@ private:
void gmn_ProcessDeferredReads (MissingNodes&);
};
inline
void
SHAMap::setFull ()
{
full_ = true;
}
inline
void
SHAMap::setLedgerSeq (std::uint32_t lseq)

View File

@@ -22,6 +22,7 @@
#include <ripple/shamap/SHAMapNodeID.h>
#include <ripple/shamap/SHAMapTreeNode.h>
#include <boost/optional.hpp>
/** Callback for filtering SHAMap during sync. */
namespace ripple {
@@ -35,10 +36,11 @@ public:
SHAMapSyncFilter& operator=(SHAMapSyncFilter const&) = delete;
// Note that the nodeData is overwritten by this call
virtual void gotNode (bool fromFilter,
SHAMapHash const& nodeHash,
Blob&& nodeData,
SHAMapTreeNode::TNType type) const = 0;
virtual
void
gotNode(bool fromFilter, SHAMapHash const& nodeHash,
std::uint32_t ledgerSeq, Blob&& nodeData,
SHAMapTreeNode::TNType type) const = 0;
virtual
boost::optional<Blob>

View File

@@ -71,6 +71,7 @@ SHAMap::snapShot (bool isMutable) const
newMap.state_ = SHAMapState::Immutable;
newMap.seq_ = seq_ + 1;
newMap.ledgerSeq_ = ledgerSeq_;
newMap.root_ = root_;
newMap.backed_ = backed_;
@@ -247,8 +248,7 @@ SHAMap::fetchNodeFromDB (SHAMapHash const& hash) const
if (backed_)
{
std::shared_ptr<NodeObject> obj = f_.db().fetch (hash.as_uint256());
if (obj)
if (auto obj = f_.db().fetch(hash.as_uint256(), ledgerSeq_))
{
try
{
@@ -284,10 +284,10 @@ SHAMap::fetchNodeFromDB (SHAMapHash const& hash) const
return std::shared_ptr<SHAMapTreeNode> ();
}
}
else if (ledgerSeq_ != 0)
else if (full_)
{
f_.missing_node(ledgerSeq_);
const_cast<std::uint32_t&>(ledgerSeq_) = 0;
const_cast<bool&>(full_) = false;
}
}
@@ -306,7 +306,7 @@ SHAMap::checkFilter(SHAMapHash const& hash,
makeSlice(*nodeData), 0, snfPREFIX, hash, true, f_.journal ());
if (node)
{
filter->gotNode (true, hash,
filter->gotNode (true, hash, ledgerSeq_,
std::move(*nodeData), node->getType ());
if (backed_)
canonicalize (hash, node);
@@ -482,7 +482,7 @@ SHAMap::descendAsync (SHAMapInnerNode* parent, int branch,
if (!ptr && backed_)
{
std::shared_ptr<NodeObject> obj;
if (! f_.db().asyncFetch (hash.as_uint256(), obj))
if (! f_.db().asyncFetch (hash.as_uint256(), ledgerSeq_, obj))
{
pending = true;
return nullptr;
@@ -1117,8 +1117,8 @@ SHAMap::writeNode (
Serializer s;
node->addRaw (s, snfPREFIX);
f_.db().store (t,
std::move (s.modData ()), node->getNodeHash ().as_uint256());
f_.db().store (t, std::move (s.modData ()),
node->getNodeHash ().as_uint256(), ledgerSeq_);
return node;
}

View File

@@ -44,12 +44,12 @@ SHAMap::visitNodes(std::function<bool (
// Visit every node in a SHAMap
assert (root_->isValid ());
if (!root_)
if (! root_)
return;
function (*root_);
if (!root_->isInner ())
if (! root_->isInner ())
return;
using StackEntry = std::pair <int, std::shared_ptr<SHAMapInnerNode>>;
@@ -63,7 +63,7 @@ SHAMap::visitNodes(std::function<bool (
while (pos < 16)
{
uint256 childHash;
if (!node->isEmptyBranch (pos))
if (! node->isEmptyBranch (pos))
{
std::shared_ptr<SHAMapAbstractNode> child = descendNoStore (node, pos);
if (! function (*child))
@@ -123,8 +123,7 @@ SHAMap::visitDifferences(SHAMap const* have,
{
auto leaf = std::static_pointer_cast<SHAMapTreeNode>(root_);
if (! have || ! have->hasLeafNode(leaf->peekItem()->key(), leaf->getNodeHash()))
function(*root_);
function (*root_);
return;
}
// contains unexplored non-matching inner node entries
@@ -141,7 +140,7 @@ SHAMap::visitDifferences(SHAMap const* have,
stack.pop ();
// 1) Add this node to the pack
if (! function(*node))
if (! function (*node))
return;
// 2) push non-matching child inner nodes
@@ -162,7 +161,7 @@ SHAMap::visitDifferences(SHAMap const* have,
static_cast<SHAMapTreeNode*>(next)->peekItem()->key(),
childHash))
{
if (! function(*next))
if (! function (*next))
return;
}
}
@@ -318,7 +317,7 @@ SHAMap::getMissingNodes(int max, SHAMapSyncFilter* filter)
assert (max > 0);
MissingNodes mn (max, filter,
f_.db().getDesiredAsyncReadCount(),
f_.db().getDesiredAsyncReadCount(ledgerSeq_),
f_.fullbelow().getGeneration());
if (! root_->isInner () ||
@@ -568,7 +567,7 @@ SHAMapAddNode SHAMap::addRootNode (SHAMapHash const& hash, Slice const& rootNode
{
Serializer s;
root_->addRaw (s, snfPREFIX);
filter->gotNode (false, root_->getNodeHash (),
filter->gotNode (false, root_->getNodeHash (), ledgerSeq_,
std::move(s.modData ()), root_->getType ());
}
@@ -656,7 +655,7 @@ SHAMap::addKnownNode (const SHAMapNodeID& node, Slice const& rawNode,
{
Serializer s;
newNode->addRaw (s, snfPREFIX);
filter->gotNode (false, childHash,
filter->gotNode (false, childHash, ledgerSeq_,
std::move(s.modData ()), newNode->getType ());
}

View File

@@ -26,11 +26,13 @@
#include <ripple/nodestore/backend/RocksDBQuickFactory.cpp>
#include <ripple/nodestore/impl/BatchWriter.cpp>
#include <ripple/nodestore/impl/DatabaseImp.h>
#include <ripple/nodestore/impl/Database.cpp>
#include <ripple/nodestore/impl/DatabaseNodeImp.cpp>
#include <ripple/nodestore/impl/DatabaseRotatingImp.cpp>
#include <ripple/nodestore/impl/DatabaseShardImp.cpp>
#include <ripple/nodestore/impl/DummyScheduler.cpp>
#include <ripple/nodestore/impl/DecodedBlob.cpp>
#include <ripple/nodestore/impl/EncodedBlob.cpp>
#include <ripple/nodestore/impl/ManagerImp.cpp>
#include <ripple/nodestore/impl/NodeObject.cpp>
#include <ripple/nodestore/impl/Shard.cpp>

View File

@@ -59,6 +59,7 @@ public:
// Open the backend
std::unique_ptr <Backend> backend =
Manager::instance().make_Backend (params, scheduler, j);
backend->open();
// Write the batch
storeBatch (*backend, batch);
@@ -86,6 +87,7 @@ public:
// Re-open the backend
std::unique_ptr <Backend> backend = Manager::instance().make_Backend (
params, scheduler, j);
backend->open();
// Read it back in
Batch copy;

View File

@@ -26,6 +26,8 @@
#include <ripple/beast/unit_test.h>
#include <ripple/beast/utility/rngfill.h>
#include <ripple/beast/xor_shift_engine.h>
#include <ripple/nodestore/Backend.h>
#include <ripple/nodestore/Types.h>
#include <boost/algorithm/string.hpp>
#include <iomanip>
@@ -192,7 +194,8 @@ public:
db.store (object->getType (),
std::move (data),
object->getHash ());
object->getHash (),
NodeStore::genesisSeq);
}
}
@@ -206,7 +209,8 @@ public:
for (int i = 0; i < batch.size (); ++i)
{
std::shared_ptr<NodeObject> object = db.fetch (batch [i]->getHash ());
std::shared_ptr<NodeObject> object = db.fetch (
batch [i]->getHash (), 0);
if (object != nullptr)
pCopy->push_back (object);

View File

@@ -276,6 +276,7 @@ public:
DummyScheduler scheduler;
auto backend = make_Backend (config, scheduler, journal);
BEAST_EXPECT(backend != nullptr);
backend->open();
class Body
{
@@ -330,6 +331,7 @@ public:
DummyScheduler scheduler;
auto backend = make_Backend (config, scheduler, journal);
BEAST_EXPECT(backend != nullptr);
backend->open();
class Body
{
@@ -391,6 +393,7 @@ public:
DummyScheduler scheduler;
auto backend = make_Backend (config, scheduler, journal);
BEAST_EXPECT(backend != nullptr);
backend->open();
class Body
{
@@ -454,6 +457,7 @@ public:
DummyScheduler scheduler;
auto backend = make_Backend (config, scheduler, journal);
BEAST_EXPECT(backend != nullptr);
backend->open();
class Body
{
@@ -535,8 +539,9 @@ public:
beast::Journal journal;
DummyScheduler scheduler;
auto backend = make_Backend (config, scheduler, journal);
backend->setDeletePath();
BEAST_EXPECT(backend != nullptr);
backend->setDeletePath();
backend->open();
class Body
{

View File

@@ -60,9 +60,10 @@ public:
{
}
void gotNode (bool fromFilter,
SHAMapHash const& nodeHash,
Blob&& nodeData, SHAMapTreeNode::TNType type) const override
void
gotNode(bool fromFilter, SHAMapHash const& nodeHash,
std::uint32_t ledgerSeq, Blob&& nodeData,
SHAMapTreeNode::TNType type) const override
{
}

View File

@@ -22,6 +22,7 @@
#include <BeastConfig.h>
#include <ripple/basics/chrono.h>
#include <ripple/nodestore/DatabaseShard.h>
#include <ripple/nodestore/DummyScheduler.h>
#include <ripple/nodestore/Manager.h>
#include <ripple/shamap/Family.h>
@@ -38,6 +39,7 @@ private:
FullBelowCache fullbelow_;
RootStoppable parent_;
std::unique_ptr<NodeStore::Database> db_;
bool shardBacked_;
beast::Journal j_;
public:
@@ -52,6 +54,8 @@ public:
testSection.set("Path", "SHAMap_test");
db_ = NodeStore::Manager::instance ().make_Database (
"test", scheduler_, 1, parent_, testSection, j);
shardBacked_ =
dynamic_cast<NodeStore::DatabaseShard*>(db_.get()) != nullptr;
}
beast::manual_clock <std::chrono::steady_clock>
@@ -102,6 +106,12 @@ public:
return *db_;
}
bool
isShardBacked() const override
{
return shardBacked_;
}
void
missing_node (std::uint32_t refNum) override
{
@@ -109,10 +119,17 @@ public:
}
void
missing_node (uint256 const& refHash) override
missing_node (uint256 const& refHash, std::uint32_t refNum) override
{
Throw<std::runtime_error> ("missing node");
}
void
reset() override
{
fullbelow_.reset();
treecache_.reset();
}
};
} // tests