rippled
import_test.cpp
//------------------------------------------------------------------------------
/*
    This file is part of rippled: https://github.com/ripple/rippled
    Copyright (c) 2012, 2013 Ripple Labs Inc.

    Permission to use, copy, modify, and/or distribute this software for any
    purpose with or without fee is hereby granted, provided that the above
    copyright notice and this permission notice appear in all copies.

    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================

#include <ripple/basics/contract.h>
#include <ripple/nodestore/impl/codec.h>
#include <ripple/beast/clock/basic_seconds_clock.h>
#include <ripple/beast/rfc2616.h>
#include <ripple/beast/core/LexicalCast.h>
#include <ripple/beast/unit_test.h>
#include <nudb/create.hpp>
#include <nudb/detail/format.hpp>
#include <nudb/xxhasher.hpp>
#include <boost/beast/core/string.hpp>
#include <boost/regex.hpp>
#include <algorithm>
#include <chrono>
#include <cmath>    // std::ceil
#include <cstring>  // std::memcpy, std::memcmp, std::memset
#include <iomanip>
#include <map>
#include <memory>   // std::unique_ptr
#include <sstream>

#include <ripple/unity/rocksdb.h>

/*

Math:

 1,000 GB dat file
   170 GB key file
capacity 113 keys/bucket

normal:
 1,000 GB data file read
19,210 GB key file read (113 * 170)
19,210 GB key file write

multi (32 GB buffer):
6 passes (170/32)
6,000 GB data file read
  170 GB key file write

*/

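// Worked form of the arithmetic above (a sketch; the 1,000 GB and 170 GB
// figures are the example sizes from the comment, not measured values):
//
//   passes    = ceil(key_file_size / buffer_size) = ceil(170 / 32) = 6
//   data read = passes * dat_file_size = 6 * 1,000 GB = 6,000 GB
//   key write = key_file_size = 170 GB, written sequentially
//
// versus the single-pass ("normal") strategy, which revisits each bucket
// once per inserted key: 113 keys/bucket * 170 GB of buckets is roughly
// 19,210 GB of random key-file reads, and the same again in writes.
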
namespace ripple {
namespace NodeStore {

namespace detail {

class save_stream_state
{
    std::ostream& os_;
    std::streamsize precision_;
    std::ios::fmtflags flags_;
    std::ios::char_type fill_;
public:
    ~save_stream_state()
    {
        os_.precision(precision_);
        os_.flags(flags_);
        os_.fill(fill_);
    }
    save_stream_state(save_stream_state const&) = delete;
    save_stream_state& operator=(save_stream_state const&) = delete;
    explicit
    save_stream_state(std::ostream& os)
        : os_(os)
        , precision_(os.precision())
        , flags_(os.flags())
        , fill_(os.fill())
    {
    }
};

template <class Rep, class Period>
std::ostream&
pretty_time(std::ostream& os, std::chrono::duration<Rep, Period> d)
{
    save_stream_state _(os);
    using namespace std::chrono;
    if (d < microseconds{1})
    {
        // use nanoseconds
        if (d < nanoseconds{100})
        {
            // use floating
            using ns = duration<float, std::nano>;
            os << std::fixed << std::setprecision(1) << ns(d).count();
        }
        else
        {
            // use integral
            os << date::round<nanoseconds>(d).count();
        }
        os << "ns";
    }
    else if (d < milliseconds{1})
    {
        // use microseconds
        if (d < microseconds{100})
        {
            // use floating
            using ms = duration<float, std::micro>;
            os << std::fixed << std::setprecision(1) << ms(d).count();
        }
        else
        {
            // use integral
            os << date::round<microseconds>(d).count();
        }
        os << "us";
    }
    else if (d < seconds{1})
    {
        // use milliseconds
        if (d < milliseconds{100})
        {
            // use floating
            using ms = duration<float, std::milli>;
            os << std::fixed << std::setprecision(1) << ms(d).count();
        }
        else
        {
            // use integral
            os << date::round<milliseconds>(d).count();
        }
        os << "ms";
    }
    else if (d < minutes{1})
    {
        // use seconds
        if (d < seconds{100})
        {
            // use floating
            using s = duration<float>;
            os << std::fixed << std::setprecision(1) << s(d).count();
        }
        else
        {
            // use integral
            os << date::round<seconds>(d).count();
        }
        os << "s";
    }
    else
    {
        // use minutes
        if (d < minutes{100})
        {
            // use floating
            using m = duration<float, std::ratio<60>>;
            os << std::fixed << std::setprecision(1) << m(d).count();
        }
        else
        {
            // use integral
            os << date::round<minutes>(d).count();
        }
        os << "min";
    }
    return os;
}

template <class Period, class Rep>
inline
std::string
fmtdur(std::chrono::duration<Period, Rep> const& d)
{
    std::stringstream ss;
    pretty_time(ss, d);
    return ss.str();
}
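
// Illustrative behavior of the helpers above (example values chosen here,
// not taken from the original source): fmtdur(std::chrono::milliseconds(1500))
// yields "1.5s" (floating, one decimal, since 1.5s < 100s), while
// fmtdur(std::chrono::microseconds(250)) yields "250us" (integral, since
// 250us >= 100us).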

} // detail

//------------------------------------------------------------------------------

class progress
{
private:
    using clock_type =
        beast::basic_seconds_clock<
            std::chrono::steady_clock>;

    std::size_t const work_;
    clock_type::time_point start_ = clock_type::now();
    clock_type::time_point now_ = clock_type::now();
    clock_type::time_point report_ = clock_type::now();
    std::size_t prev_ = 0;
    bool estimate_ = false;

public:
    explicit
    progress(std::size_t work)
        : work_(work)
    {
    }

    template <class Log>
    void
    operator()(Log& log, std::size_t work)
    {
        using namespace std::chrono;
        auto const now = clock_type::now();
        if (now == now_)
            return;
        now_ = now;
        auto const elapsed = now - start_;
        if (! estimate_)
        {
            if (elapsed < seconds(15))
                return;
            estimate_ = true;
        }
        else if (now - report_ <
            std::chrono::seconds(60))
        {
            return;
        }
        auto const rate =
            elapsed.count() / double(work);
        clock_type::duration const remain(
            static_cast<clock_type::duration::rep>(
                (work_ - work) * rate));
        log <<
            "Remaining: " << detail::fmtdur(remain) <<
            " (" << work << " of " << work_ <<
            " in " << detail::fmtdur(elapsed) <<
            ", " << (work - prev_) <<
            " in " << detail::fmtdur(now - report_) <<
            ")";
        report_ = now;
        prev_ = work;
    }

    template <class Log>
    void
    finish(Log& log)
    {
        log <<
            "Total time: " << detail::fmtdur(
                clock_type::now() - start_);
    }
};
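
// Typical use of progress, as in run() below: construct it with the total
// amount of work, invoke operator() periodically with the amount completed
// (it throttles its own log output), then call finish() for the total time:
//
//     progress p(total_bytes);
//     while (...) { ...; p(log, bytes_done); }
//     p.finish(log);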

std::map<std::string, std::string, boost::beast::iless>
parse_args(std::string const& s)
{
    // <key> '=' <value>
    static boost::regex const re1 (
        "^"                         // start of line
        "(?:\\s*)"                  // whitespace (optional)
        "([a-zA-Z][_a-zA-Z0-9]*)"   // <key>
        "(?:\\s*)"                  // whitespace (optional)
        "(?:=)"                     // '='
        "(?:\\s*)"                  // whitespace (optional)
        "(.*\\S+)"                  // <value>
        "(?:\\s*)"                  // whitespace (optional)
        , boost::regex_constants::optimize
    );
    std::map<std::string, std::string, boost::beast::iless> map;
    auto const v = beast::rfc2616::split(
        s.begin(), s.end(), ',');
    for (auto const& kv : v)
    {
        boost::smatch m;
        if (! boost::regex_match (kv, m, re1))
            Throw<std::runtime_error> (
                "invalid parameter " + kv);
        auto const result =
            map.emplace(m[1], m[2]);
        if (! result.second)
            Throw<std::runtime_error> (
                "duplicate parameter " + m[1]);
    }
    return map;
}
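
// For example (hypothetical argument string):
//
//   parse_args("from=/db/rocksdb,to=/db/nudb,buffer=32000000000")
//
// produces {"from" -> "/db/rocksdb", "to" -> "/db/nudb",
// "buffer" -> "32000000000"}. Keys compare case-insensitively via
// boost::beast::iless, and the regex tolerates whitespace around '='.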

//------------------------------------------------------------------------------

#if RIPPLE_ROCKSDB_AVAILABLE

class import_test : public beast::unit_test::suite
{
public:
    void
    run() override
    {
        testcase(beast::unit_test::abort_on_fail) << arg();

        using namespace nudb;
        using namespace nudb::detail;

        pass();
        auto const args = parse_args(arg());
        bool usage = args.empty();

        if (! usage &&
            args.find("from") == args.end())
        {
            log <<
                "Missing parameter: from";
            usage = true;
        }
        if (! usage &&
            args.find("to") == args.end())
        {
            log <<
                "Missing parameter: to";
            usage = true;
        }
        if (! usage &&
            args.find("buffer") == args.end())
        {
            log <<
                "Missing parameter: buffer";
            usage = true;
        }

        if (usage)
        {
            log <<
                "Usage:\n" <<
                "--unittest-arg=from=<from>,to=<to>,buffer=<buffer>\n" <<
                "from: RocksDB database to import from\n" <<
                "to: NuDB database to import to\n" <<
                "buffer: Buffer size (bigger is faster)\n" <<
                "NuDB database must not already exist.";
            return;
        }
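
        // Example invocation (paths and buffer size are hypothetical; the
        // suite is registered manually by BEAST_DEFINE_TESTSUITE_MANUAL
        // below, so it only runs when named explicitly):
        //
        //   rippled --unittest=import \
        //       --unittest-arg=from=/var/rocksdb,to=/var/nudb/db,buffer=34359738368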

        // This controls the size of the bucket buffer.
        // For a 1TB data file, a 32GB bucket buffer is suggested.
        // The larger the buffer, the faster the import.
        //
        std::size_t const buffer_size =
            std::stoull(args.at("buffer"));
        auto const from_path = args.at("from");
        auto const to_path = args.at("to");

        using hash_type = nudb::xxhasher;
        auto const bulk_size = 64 * 1024 * 1024;
        float const load_factor = 0.5;

        auto const dp = to_path + ".dat";
        auto const kp = to_path + ".key";

        auto const start =
            std::chrono::steady_clock::now();

        log <<
            "from: " << from_path << "\n"
            "to: " << to_path << "\n"
            "buffer: " << buffer_size;

        std::unique_ptr<rocksdb::DB> db;
        {
            rocksdb::Options options;
            options.create_if_missing = false;
            options.max_open_files = 2000; // 5000?
            rocksdb::DB* pdb = nullptr;
            rocksdb::Status status =
                rocksdb::DB::OpenForReadOnly(
                    options, from_path, &pdb);
            if (! status.ok () || ! pdb)
                Throw<std::runtime_error> (
                    "Can't open '" + from_path + "': " +
                    status.ToString());
            db.reset(pdb);
        }
        // Create data file with values
        std::size_t nitems = 0;
        std::size_t nbytes = 0;
        dat_file_header dh;
        dh.version = currentVersion;
        dh.uid = make_uid();
        dh.appnum = 1;
        dh.key_size = 32;

        native_file df;
        error_code ec;
        df.create(file_mode::append, dp, ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        bulk_writer<native_file> dw(
            df, 0, bulk_size);
        {
            {
                auto os = dw.prepare(dat_file_header::size, ec);
                if (ec)
                    Throw<nudb::system_error>(ec);
                write(os, dh);
            }
            rocksdb::ReadOptions options;
            options.verify_checksums = false;
            options.fill_cache = false;
            std::unique_ptr<rocksdb::Iterator> it(
                db->NewIterator(options));

            buffer buf;
            for (it->SeekToFirst (); it->Valid (); it->Next())
            {
                if (it->key().size() != 32)
                    Throw<std::runtime_error> (
                        "Unexpected key size " +
                        std::to_string(it->key().size()));
                void const* const key = it->key().data();
                void const* const data = it->value().data();
                auto const size = it->value().size();
                std::unique_ptr<char[]> clean(
                    new char[size]);
                std::memcpy(clean.get(), data, size);
                filter_inner(clean.get(), size);
                auto const out = nodeobject_compress(
                    clean.get(), size, buf);
                // Verify codec correctness
                {
                    buffer buf2;
                    auto const check = nodeobject_decompress(
                        out.first, out.second, buf2);
                    BEAST_EXPECT(check.second == size);
                    BEAST_EXPECT(std::memcmp(
                        check.first, clean.get(), size) == 0);
                }
                // Data Record
                auto os = dw.prepare(
                    field<uint48_t>::size + // Size
                    32 +                    // Key
                    out.second, ec);
                if (ec)
                    Throw<nudb::system_error>(ec);
                write<uint48_t>(os, out.second);
                std::memcpy(os.data(32), key, 32);
                std::memcpy(os.data(out.second),
                    out.first, out.second);
                ++nitems;
                nbytes += size;
            }
            dw.flush(ec);
            if (ec)
                Throw<nudb::system_error>(ec);
        }
        db.reset();
        log <<
            "Import data: " << detail::fmtdur(
                std::chrono::steady_clock::now() - start);
        auto const df_size = df.size(ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        // Create key file
        key_file_header kh;
        kh.version = currentVersion;
        kh.uid = dh.uid;
        kh.appnum = dh.appnum;
        kh.key_size = 32;
        kh.salt = make_salt();
        kh.pepper = pepper<hash_type>(kh.salt);
        kh.block_size = block_size(kp);
        kh.load_factor = std::min<std::size_t>(
            65536.0 * load_factor, 65535);
        kh.buckets = std::ceil(nitems / (bucket_capacity(
            kh.block_size) * load_factor));
        kh.modulus = ceil_pow2(kh.buckets);
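        // Reading of the header math above: load_factor 0.5 is stored as
        // 16-bit fixed point (32768/65536); the bucket count is
        //
        //   buckets = ceil(nitems / (bucket_capacity(block_size) * 0.5))
        //
        // so buckets average half full, and modulus is the next power of
        // two >= buckets, which bucket_index() uses to map a hash onto a
        // bucket.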
        native_file kf;
        kf.create(file_mode::append, kp, ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        buffer buf(kh.block_size);
        {
            std::memset(buf.get(), 0, kh.block_size);
            ostream os(buf.get(), kh.block_size);
            write(os, kh);
            kf.write(0, buf.get(), kh.block_size, ec);
            if (ec)
                Throw<nudb::system_error>(ec);
        }
        // Build contiguous sequential sections of the
        // key file using multiple passes over the data.
        //
        auto const buckets = std::max<std::size_t>(1,
            buffer_size / kh.block_size);
        buf.reserve(buckets * kh.block_size);
        auto const passes =
            (kh.buckets + buckets - 1) / buckets;
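        // Illustrative numbers (assuming a 4096-byte block size, which is
        // not fixed by this code): a 32 GiB buffer holds
        // 34359738368 / 4096 = 8,388,608 bucket images per pass, and the
        // expression above is just passes = ceil(kh.buckets / buckets),
        // matching the estimate in the comment at the top of this file.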
        log <<
            "items: " << nitems << "\n"
            "buckets: " << kh.buckets << "\n"
            "data: " << df_size << "\n"
            "passes: " << passes;
        progress p(df_size * passes);
        std::size_t npass = 0;
        for (std::size_t b0 = 0; b0 < kh.buckets;
            b0 += buckets)
        {
            auto const b1 = std::min(
                b0 + buckets, kh.buckets);
            // Buffered range is [b0, b1)
            auto const bn = b1 - b0;
            // Create empty buckets
            for (std::size_t i = 0; i < bn; ++i)
            {
                bucket b(kh.block_size,
                    buf.get() + i * kh.block_size,
                    empty);
            }
            // Insert all keys into buckets
            // Iterate Data File
            bulk_reader<native_file> r(
                df, dat_file_header::size,
                df_size, bulk_size);
            while (! r.eof())
            {
                auto const offset = r.offset();
                // Data Record or Spill Record
                std::size_t size;
                auto is = r.prepare(
                    field<uint48_t>::size, ec); // Size
                if (ec)
                    Throw<nudb::system_error>(ec);
                read<uint48_t>(is, size);
                if (size > 0)
                {
                    // Data Record
                    is = r.prepare(
                        dh.key_size + // Key
                        size, ec);    // Data
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    std::uint8_t const* const key =
                        is.data(dh.key_size);
                    auto const h = hash<hash_type>(
                        key, kh.key_size, kh.salt);
                    auto const n = bucket_index(
                        h, kh.buckets, kh.modulus);
                    p(log,
                        npass * df_size + r.offset());
                    if (n < b0 || n >= b1)
                        continue;
                    bucket b(kh.block_size, buf.get() +
                        (n - b0) * kh.block_size);
                    maybe_spill(b, dw, ec);
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    b.insert(offset, size, h);
                }
                else
                {
                    // VFALCO Should never get here
                    // Spill Record
                    is = r.prepare(
                        field<std::uint16_t>::size, ec);
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    read<std::uint16_t>(is, size); // Size
                    r.prepare(size, ec); // skip
                    if (ec)
                        Throw<nudb::system_error>(ec);
                }
            }
            kf.write((b0 + 1) * kh.block_size,
                buf.get(), bn * kh.block_size, ec);
            if (ec)
                Throw<nudb::system_error>(ec);
            ++npass;
        }
        dw.flush(ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        p.finish(log);
    }
};

BEAST_DEFINE_TESTSUITE_MANUAL(import,NodeStore,ripple);

#endif

//------------------------------------------------------------------------------

} // NodeStore
} // ripple