// import_test.cpp (Doxygen navigation residue removed from source listing)
//------------------------------------------------------------------------------
/*
    This file is part of rippled: https://github.com/ripple/rippled
    Copyright (c) 2012, 2013 Ripple Labs Inc.

    Permission to use, copy, modify, and/or distribute this software for any
    purpose with or without fee is hereby granted, provided that the above
    copyright notice and this permission notice appear in all copies.

    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================
19
#include <xrpld/nodestore/detail/codec.h>
#include <xrpld/unity/rocksdb.h>

#include <xrpl/basics/contract.h>
#include <xrpl/beast/clock/basic_seconds_clock.h>
#include <xrpl/beast/core/LexicalCast.h>
#include <xrpl/beast/rfc2616.h>
#include <xrpl/beast/unit_test.h>

#include <boost/beast/core/string.hpp>
#include <boost/regex.hpp>

#include <nudb/create.hpp>
#include <nudb/detail/format.hpp>
#include <nudb/xxhasher.hpp>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iomanip>
#include <map>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
41
/*

Math:

1000 gb dat file
170 gb key file
capacity 113 keys/bucket

normal:
1,000gb data file read
19,210gb key file read (113 * 170)
19,210gb key file write

multi(32gb):
6 passes (170/32)
6,000gb data file read
170gb key file write


*/
62
63namespace ripple {
64namespace NodeStore {
65
66namespace detail {
67
69{
72 std::ios::fmtflags flags_;
73 std::ios::char_type fill_;
74
75public:
77 {
80 os_.fill(fill_);
81 }
84 operator=(save_stream_state const&) = delete;
86 : os_(os)
87 , precision_(os.precision())
88 , flags_(os.flags())
89 , fill_(os.fill())
90 {
91 }
92};
93
94template <class Rep, class Period>
97{
99 using namespace std::chrono;
100 if (d < microseconds{1})
101 {
102 // use nanoseconds
103 if (d < nanoseconds{100})
104 {
105 // use floating
107 os << std::fixed << std::setprecision(1) << ns(d).count();
108 }
109 else
110 {
111 // use integral
112 os << round<nanoseconds>(d).count();
113 }
114 os << "ns";
115 }
116 else if (d < milliseconds{1})
117 {
118 // use microseconds
119 if (d < microseconds{100})
120 {
121 // use floating
123 os << std::fixed << std::setprecision(1) << ms(d).count();
124 }
125 else
126 {
127 // use integral
128 os << round<microseconds>(d).count();
129 }
130 os << "us";
131 }
132 else if (d < seconds{1})
133 {
134 // use milliseconds
135 if (d < milliseconds{100})
136 {
137 // use floating
139 os << std::fixed << std::setprecision(1) << ms(d).count();
140 }
141 else
142 {
143 // use integral
144 os << round<milliseconds>(d).count();
145 }
146 os << "ms";
147 }
148 else if (d < minutes{1})
149 {
150 // use seconds
151 if (d < seconds{100})
152 {
153 // use floating
154 using s = duration<float>;
155 os << std::fixed << std::setprecision(1) << s(d).count();
156 }
157 else
158 {
159 // use integral
160 os << round<seconds>(d).count();
161 }
162 os << "s";
163 }
164 else
165 {
166 // use minutes
167 if (d < minutes{100})
168 {
169 // use floating
171 os << std::fixed << std::setprecision(1) << m(d).count();
172 }
173 else
174 {
175 // use integral
176 os << round<minutes>(d).count();
177 }
178 os << "min";
179 }
180 return os;
181}
182
183template <class Period, class Rep>
184inline std::string
186{
188 pretty_time(ss, d);
189 return ss.str();
190}
191
192} // namespace detail
193
194//------------------------------------------------------------------------------
195
197{
198private:
200
206 bool estimate_ = false;
207
208public:
209 explicit progress(std::size_t work) : work_(work)
210 {
211 }
212
213 template <class Log>
214 void
215 operator()(Log& log, std::size_t work)
216 {
217 using namespace std::chrono;
218 auto const now = clock_type::now();
219 if (now == now_)
220 return;
221 now_ = now;
222 auto const elapsed = now - start_;
223 if (!estimate_)
224 {
225 if (elapsed < seconds(15))
226 return;
227 estimate_ = true;
228 }
229 else if (now - report_ < std::chrono::seconds(60))
230 {
231 return;
232 }
233 auto const rate = elapsed.count() / double(work);
234 clock_type::duration const remain(
235 static_cast<clock_type::duration::rep>((work_ - work) * rate));
236 log << "Remaining: " << detail::fmtdur(remain) << " (" << work << " of "
237 << work_ << " in " << detail::fmtdur(elapsed) << ", "
238 << (work - prev_) << " in " << detail::fmtdur(now - report_) << ")";
239 report_ = now;
240 prev_ = work;
241 }
242
243 template <class Log>
244 void
245 finish(Log& log)
246 {
247 log << "Total time: " << detail::fmtdur(clock_type::now() - start_);
248 }
249};
250
253{
254 // <key> '=' <value>
255 static boost::regex const re1(
256 "^" // start of line
257 "(?:\\s*)" // whitespace (optonal)
258 "([a-zA-Z][_a-zA-Z0-9]*)" // <key>
259 "(?:\\s*)" // whitespace (optional)
260 "(?:=)" // '='
261 "(?:\\s*)" // whitespace (optional)
262 "(.*\\S+)" // <value>
263 "(?:\\s*)" // whitespace (optional)
264 ,
265 boost::regex_constants::optimize);
267 auto const v = beast::rfc2616::split(s.begin(), s.end(), ',');
268 for (auto const& kv : v)
269 {
270 boost::smatch m;
271 if (!boost::regex_match(kv, m, re1))
272 Throw<std::runtime_error>("invalid parameter " + kv);
273 auto const result = map.emplace(m[1], m[2]);
274 if (!result.second)
275 Throw<std::runtime_error>("duplicate parameter " + m[1]);
276 }
277 return map;
278}
279
280//------------------------------------------------------------------------------
281
282#if RIPPLE_ROCKSDB_AVAILABLE
283
284class import_test : public beast::unit_test::suite
285{
286public:
287 void
288 run() override
289 {
290 testcase(beast::unit_test::abort_on_fail) << arg();
291
292 using namespace nudb;
293 using namespace nudb::detail;
294
295 pass();
296 auto const args = parse_args(arg());
297 bool usage = args.empty();
298
299 if (!usage && args.find("from") == args.end())
300 {
301 log << "Missing parameter: from";
302 usage = true;
303 }
304 if (!usage && args.find("to") == args.end())
305 {
306 log << "Missing parameter: to";
307 usage = true;
308 }
309 if (!usage && args.find("buffer") == args.end())
310 {
311 log << "Missing parameter: buffer";
312 usage = true;
313 }
314
315 if (usage)
316 {
317 log << "Usage:\n"
318 << "--unittest-arg=from=<from>,to=<to>,buffer=<buffer>\n"
319 << "from: RocksDB database to import from\n"
320 << "to: NuDB database to import to\n"
321 << "buffer: Buffer size (bigger is faster)\n"
322 << "NuDB database must not already exist.";
323 return;
324 }
325
326 // This controls the size of the bucket buffer.
327 // For a 1TB data file, a 32GB bucket buffer is suggested.
328 // The larger the buffer, the faster the import.
329 //
330 std::size_t const buffer_size = std::stoull(args.at("buffer"));
331 auto const from_path = args.at("from");
332 auto const to_path = args.at("to");
333
334 using hash_type = nudb::xxhasher;
335 auto const bulk_size = 64 * 1024 * 1024;
336 float const load_factor = 0.5;
337
338 auto const dp = to_path + ".dat";
339 auto const kp = to_path + ".key";
340
341 auto const start = std::chrono::steady_clock::now();
342
343 log << "from: " << from_path
344 << "\n"
345 "to: "
346 << to_path
347 << "\n"
348 "buffer: "
349 << buffer_size;
350
352 {
353 rocksdb::Options options;
354 options.create_if_missing = false;
355 options.max_open_files = 2000; // 5000?
356 rocksdb::DB* pdb = nullptr;
357 rocksdb::Status status =
358 rocksdb::DB::OpenForReadOnly(options, from_path, &pdb);
359 if (!status.ok() || !pdb)
360 Throw<std::runtime_error>(
361 "Can't open '" + from_path + "': " + status.ToString());
362 db.reset(pdb);
363 }
364 // Create data file with values
365 std::size_t nitems = 0;
366 dat_file_header dh;
367 dh.version = currentVersion;
368 dh.uid = make_uid();
369 dh.appnum = 1;
370 dh.key_size = 32;
371
372 native_file df;
373 error_code ec;
374 df.create(file_mode::append, dp, ec);
375 if (ec)
376 Throw<nudb::system_error>(ec);
377 bulk_writer<native_file> dw(df, 0, bulk_size);
378 {
379 {
380 auto os = dw.prepare(dat_file_header::size, ec);
381 if (ec)
382 Throw<nudb::system_error>(ec);
383 write(os, dh);
384 }
385 rocksdb::ReadOptions options;
386 options.verify_checksums = false;
387 options.fill_cache = false;
388 std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options));
389
390 buffer buf;
391 for (it->SeekToFirst(); it->Valid(); it->Next())
392 {
393 if (it->key().size() != 32)
394 Throw<std::runtime_error>(
395 "Unexpected key size " +
396 std::to_string(it->key().size()));
397 void const* const key = it->key().data();
398 void const* const data = it->value().data();
399 auto const size = it->value().size();
400 std::unique_ptr<char[]> clean(new char[size]);
401 std::memcpy(clean.get(), data, size);
402 filter_inner(clean.get(), size);
403 auto const out = nodeobject_compress(clean.get(), size, buf);
404 // Verify codec correctness
405 {
406 buffer buf2;
407 auto const check =
408 nodeobject_decompress(out.first, out.second, buf2);
409 BEAST_EXPECT(check.second == size);
410 BEAST_EXPECT(
411 std::memcmp(check.first, clean.get(), size) == 0);
412 }
413 // Data Record
414 auto os = dw.prepare(
415 field<uint48_t>::size + // Size
416 32 + // Key
417 out.second,
418 ec);
419 if (ec)
420 Throw<nudb::system_error>(ec);
421 write<uint48_t>(os, out.second);
422 std::memcpy(os.data(32), key, 32);
423 std::memcpy(os.data(out.second), out.first, out.second);
424 ++nitems;
425 }
426 dw.flush(ec);
427 if (ec)
428 Throw<nudb::system_error>(ec);
429 }
430 db.reset();
431 log << "Import data: "
432 << detail::fmtdur(std::chrono::steady_clock::now() - start);
433 auto const df_size = df.size(ec);
434 if (ec)
435 Throw<nudb::system_error>(ec);
436 // Create key file
437 key_file_header kh;
438 kh.version = currentVersion;
439 kh.uid = dh.uid;
440 kh.appnum = dh.appnum;
441 kh.key_size = 32;
442 kh.salt = make_salt();
443 kh.pepper = pepper<hash_type>(kh.salt);
444 kh.block_size = block_size(kp);
445 kh.load_factor = std::min<std::size_t>(65536.0 * load_factor, 65535);
446 kh.buckets =
447 std::ceil(nitems / (bucket_capacity(kh.block_size) * load_factor));
448 kh.modulus = ceil_pow2(kh.buckets);
449 native_file kf;
450 kf.create(file_mode::append, kp, ec);
451 if (ec)
452 Throw<nudb::system_error>(ec);
453 buffer buf(kh.block_size);
454 {
455 std::memset(buf.get(), 0, kh.block_size);
456 ostream os(buf.get(), kh.block_size);
457 write(os, kh);
458 kf.write(0, buf.get(), kh.block_size, ec);
459 if (ec)
460 Throw<nudb::system_error>(ec);
461 }
462 // Build contiguous sequential sections of the
463 // key file using multiple passes over the data.
464 //
465 auto const buckets =
466 std::max<std::size_t>(1, buffer_size / kh.block_size);
467 buf.reserve(buckets * kh.block_size);
468 auto const passes = (kh.buckets + buckets - 1) / buckets;
469 log << "items: " << nitems
470 << "\n"
471 "buckets: "
472 << kh.buckets
473 << "\n"
474 "data: "
475 << df_size
476 << "\n"
477 "passes: "
478 << passes;
479 progress p(df_size * passes);
480 std::size_t npass = 0;
481 for (std::size_t b0 = 0; b0 < kh.buckets; b0 += buckets)
482 {
483 auto const b1 = std::min(b0 + buckets, kh.buckets);
484 // Buffered range is [b0, b1)
485 auto const bn = b1 - b0;
486 // Create empty buckets
487 for (std::size_t i = 0; i < bn; ++i)
488 {
489 bucket b(kh.block_size, buf.get() + i * kh.block_size, empty);
490 }
491 // Insert all keys into buckets
492 // Iterate Data File
493 bulk_reader<native_file> r(
494 df, dat_file_header::size, df_size, bulk_size);
495 while (!r.eof())
496 {
497 auto const offset = r.offset();
498 // Data Record or Spill Record
500 auto is = r.prepare(field<uint48_t>::size, ec); // Size
501 if (ec)
502 Throw<nudb::system_error>(ec);
503 read<uint48_t>(is, size);
504 if (size > 0)
505 {
506 // Data Record
507 is = r.prepare(
508 dh.key_size + // Key
509 size,
510 ec); // Data
511 if (ec)
512 Throw<nudb::system_error>(ec);
513 std::uint8_t const* const key = is.data(dh.key_size);
514 auto const h = hash<hash_type>(key, kh.key_size, kh.salt);
515 auto const n = bucket_index(h, kh.buckets, kh.modulus);
516 p(log, npass * df_size + r.offset());
517 if (n < b0 || n >= b1)
518 continue;
519 bucket b(
520 kh.block_size, buf.get() + (n - b0) * kh.block_size);
521 maybe_spill(b, dw, ec);
522 if (ec)
523 Throw<nudb::system_error>(ec);
524 b.insert(offset, size, h);
525 }
526 else
527 {
528 // VFALCO Should never get here
529 // Spill Record
530 is = r.prepare(field<std::uint16_t>::size, ec);
531 if (ec)
532 Throw<nudb::system_error>(ec);
533 read<std::uint16_t>(is, size); // Size
534 r.prepare(size, ec); // skip
535 if (ec)
536 Throw<nudb::system_error>(ec);
537 }
538 }
539 kf.write(
540 (b0 + 1) * kh.block_size, buf.get(), bn * kh.block_size, ec);
541 if (ec)
542 Throw<nudb::system_error>(ec);
543 ++npass;
544 }
545 dw.flush(ec);
546 if (ec)
547 Throw<nudb::system_error>(ec);
548 p.finish(log);
549 }
550};
551
552BEAST_DEFINE_TESTSUITE_MANUAL(import, NodeStore, ripple);
553
554#endif
555
556//------------------------------------------------------------------------------
557
558} // namespace NodeStore
559} // namespace ripple
// (Doxygen cross-reference index residue removed; it was not part of the
// original source file.)