import_test.cpp
//------------------------------------------------------------------------------
/*
    This file is part of rippled: https://github.com/ripple/rippled
    Copyright (c) 2012, 2013 Ripple Labs Inc.

    Permission to use, copy, modify, and/or distribute this software for any
    purpose with or without fee is hereby granted, provided that the above
    copyright notice and this permission notice appear in all copies.

    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================

#include <xrpld/nodestore/detail/codec.h>
#include <xrpld/unity/rocksdb.h>

#include <xrpl/basics/contract.h>
#include <xrpl/beast/clock/basic_seconds_clock.h>
#include <xrpl/beast/core/LexicalCast.h>
#include <xrpl/beast/rfc2616.h>
#include <xrpl/beast/unit_test.h>

#include <boost/beast/core/string.hpp>
#include <boost/regex.hpp>

#include <nudb/create.hpp>
#include <nudb/detail/format.hpp>
#include <nudb/xxhasher.hpp>

#include <algorithm>
#include <chrono>
#include <cstring>
#include <iomanip>
#include <map>
#include <memory>
#include <sstream>

/*

Math:

1000 GB dat file
170 GB key file
capacity: 113 keys/bucket

normal insert:
1,000 GB data file read
19,210 GB key file read (113 * 170)
19,210 GB key file write

multi-pass (32 GB buffer):
6 passes (170/32, rounded up)
6,000 GB data file read
170 GB key file write

*/

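/*
Where 19,210 GB comes from: inserting keys in arrival order rewrites a
bucket roughly once for each key it receives, so at 113 keys/bucket the
170 GB key file is read and written about 113 times (113 * 170). The
multi-pass scheme below writes each bucket exactly once, at the cost of
re-reading the 1,000 GB data file once per pass.
*/
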
namespace ripple {

namespace detail {

class save_stream_state
{
    std::ostream& os_;
    std::streamsize precision_;
    std::ios::fmtflags flags_;
    std::ios::char_type fill_;

public:
    ~save_stream_state()
    {
        os_.precision(precision_);
        os_.flags(flags_);
        os_.fill(fill_);
    }
    save_stream_state(save_stream_state const&) = delete;
    save_stream_state&
    operator=(save_stream_state const&) = delete;
    explicit save_stream_state(std::ostream& os)
        : os_(os)
        , precision_(os.precision())
        , flags_(os.flags())
        , fill_(os.fill())
    {
    }
};

template <class Rep, class Period>
std::ostream&
pretty_time(std::ostream& os, std::chrono::duration<Rep, Period> d)
{
    save_stream_state _(os);
    using namespace std::chrono;
    if (d < microseconds{1})
    {
        // use nanoseconds
        if (d < nanoseconds{100})
        {
            // use floating
            using ns = duration<float, std::nano>;
            os << std::fixed << std::setprecision(1) << ns(d).count();
        }
        else
        {
            // use integral
            os << round<nanoseconds>(d).count();
        }
        os << "ns";
    }
    else if (d < milliseconds{1})
    {
        // use microseconds
        if (d < microseconds{100})
        {
            // use floating
            using us = duration<float, std::micro>;
            os << std::fixed << std::setprecision(1) << us(d).count();
        }
        else
        {
            // use integral
            os << round<microseconds>(d).count();
        }
        os << "us";
    }
    else if (d < seconds{1})
    {
        // use milliseconds
        if (d < milliseconds{100})
        {
            // use floating
            using ms = duration<float, std::milli>;
            os << std::fixed << std::setprecision(1) << ms(d).count();
        }
        else
        {
            // use integral
            os << round<milliseconds>(d).count();
        }
        os << "ms";
    }
    else if (d < minutes{1})
    {
        // use seconds
        if (d < seconds{100})
        {
            // use floating
            using s = duration<float>;
            os << std::fixed << std::setprecision(1) << s(d).count();
        }
        else
        {
            // use integral
            os << round<seconds>(d).count();
        }
        os << "s";
    }
    else
    {
        // use minutes
        if (d < minutes{100})
        {
            // use floating
            using m = duration<float, std::ratio<60>>;
            os << std::fixed << std::setprecision(1) << m(d).count();
        }
        else
        {
            // use integral
            os << round<minutes>(d).count();
        }
        os << "min";
    }
    return os;
}

template <class Period, class Rep>
inline std::string
fmtdur(std::chrono::duration<Period, Rep> const& d)
{
    std::stringstream ss;
    pretty_time(ss, d);
    return ss.str();
}
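// Example output (illustrative): fmtdur(std::chrono::milliseconds(1500))
// yields "1.5s"; fmtdur(std::chrono::microseconds(250)) yields "250us".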

} // namespace detail

namespace NodeStore {

//------------------------------------------------------------------------------

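// Tracks a long-running operation and periodically logs an estimate of
// the time remaining. Construct with the total amount of work, then
// call operator() with the amount of work completed so far.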
class progress
{
private:
    using clock_type = beast::basic_seconds_clock;

    std::size_t const work_;
    clock_type::time_point start_ = clock_type::now();
    clock_type::time_point now_ = clock_type::now();
    clock_type::time_point report_ = clock_type::now();
    std::size_t prev_ = 0;
    bool estimate_ = false;

public:
    explicit progress(std::size_t work) : work_(work)
    {
    }

    template <class Log>
    void
    operator()(Log& log, std::size_t work)
    {
        using namespace std::chrono;
        auto const now = clock_type::now();
        if (now == now_)
            return;
        now_ = now;
        auto const elapsed = now - start_;
        if (!estimate_)
        {
            if (elapsed < seconds(15))
                return;
            estimate_ = true;
        }
        else if (now - report_ < std::chrono::seconds(60))
        {
            return;
        }
        auto const rate = elapsed.count() / double(work);
        clock_type::duration const remain(
            static_cast<clock_type::duration::rep>((work_ - work) * rate));
        log << "Remaining: " << detail::fmtdur(remain) << " (" << work << " of "
            << work_ << " in " << detail::fmtdur(elapsed) << ", "
            << (work - prev_) << " in " << detail::fmtdur(now - report_) << ")";
        report_ = now;
        prev_ = work;
    }

    template <class Log>
    void
    finish(Log& log)
    {
        log << "Total time: " << detail::fmtdur(clock_type::now() - start_);
    }
};

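// Parse a comma-separated list of <key>=<value> pairs, for example
// (values shown are illustrative):
//   "from=/rocks/db,to=/nudb/db,buffer=34359738368"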
std::map<std::string, std::string, boost::beast::iless>
parse_args(std::string const& s)
{
    // <key> '=' <value>
    static boost::regex const re1(
        "^"                        // start of line
        "(?:\\s*)"                 // whitespace (optional)
        "([a-zA-Z][_a-zA-Z0-9]*)"  // <key>
        "(?:\\s*)"                 // whitespace (optional)
        "(?:=)"                    // '='
        "(?:\\s*)"                 // whitespace (optional)
        "(.*\\S+)"                 // <value>
        "(?:\\s*)"                 // whitespace (optional)
        ,
        boost::regex_constants::optimize);
    std::map<std::string, std::string, boost::beast::iless> map;
    auto const v = beast::rfc2616::split(s.begin(), s.end(), ',');
    for (auto const& kv : v)
    {
        boost::smatch m;
        if (!boost::regex_match(kv, m, re1))
            Throw<std::runtime_error>("invalid parameter " + kv);
        auto const result = map.emplace(m[1], m[2]);
        if (!result.second)
            Throw<std::runtime_error>("duplicate parameter " + m[1]);
    }
    return map;
}

//------------------------------------------------------------------------------

#if RIPPLE_ROCKSDB_AVAILABLE

class import_test : public beast::unit_test::suite
{
public:
    void
    run() override
    {
        testcase(beast::unit_test::abort_on_fail) << arg();

        using namespace nudb;
        using namespace nudb::detail;

        pass();
        auto const args = parse_args(arg());
        bool usage = args.empty();

        if (!usage && args.find("from") == args.end())
        {
            log << "Missing parameter: from";
            usage = true;
        }
        if (!usage && args.find("to") == args.end())
        {
            log << "Missing parameter: to";
            usage = true;
        }
        if (!usage && args.find("buffer") == args.end())
        {
            log << "Missing parameter: buffer";
            usage = true;
        }

        if (usage)
        {
            log << "Usage:\n"
                << "--unittest-arg=from=<from>,to=<to>,buffer=<buffer>\n"
                << "from: RocksDB database to import from\n"
                << "to: NuDB database to import to\n"
                << "buffer: Buffer size (bigger is faster)\n"
                << "NuDB database must not already exist.";
            return;
        }

        // This controls the size of the bucket buffer.
        // For a 1TB data file, a 32GB bucket buffer is suggested.
        // The larger the buffer, the faster the import.
        //
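        // (For example, buffer=34359738368, i.e. 32 * 1024^3 bytes,
        // requests a 32GB buffer.)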
        std::size_t const buffer_size = std::stoull(args.at("buffer"));
        auto const from_path = args.at("from");
        auto const to_path = args.at("to");

        using hash_type = nudb::xxhasher;
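        // 64MB buffer for bulk file I/O, used by the data file writer
        // below and by the multi-pass reader later in this function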
        auto const bulk_size = 64 * 1024 * 1024;
        float const load_factor = 0.5;

        auto const dp = to_path + ".dat";
        auto const kp = to_path + ".key";

        auto const start = std::chrono::steady_clock::now();

        log << "from: " << from_path
            << "\n"
               "to: "
            << to_path
            << "\n"
               "buffer: "
            << buffer_size;

        std::unique_ptr<rocksdb::DB> db;
        {
            rocksdb::Options options;
            options.create_if_missing = false;
            options.max_open_files = 2000; // 5000?
            rocksdb::DB* pdb = nullptr;
            rocksdb::Status status =
                rocksdb::DB::OpenForReadOnly(options, from_path, &pdb);
            if (!status.ok() || !pdb)
                Throw<std::runtime_error>(
                    "Can't open '" + from_path + "': " + status.ToString());
            db.reset(pdb);
        }
        // Create data file with values
        std::size_t nitems = 0;
        dat_file_header dh;
        dh.version = currentVersion;
        dh.uid = make_uid();
        dh.appnum = 1;
        dh.key_size = 32;

        native_file df;
        error_code ec;
        df.create(file_mode::append, dp, ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        bulk_writer<native_file> dw(df, 0, bulk_size);
        {
            {
                auto os = dw.prepare(dat_file_header::size, ec);
                if (ec)
                    Throw<nudb::system_error>(ec);
                write(os, dh);
            }
            rocksdb::ReadOptions options;
            options.verify_checksums = false;
            options.fill_cache = false;
            std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options));

            buffer buf;
            for (it->SeekToFirst(); it->Valid(); it->Next())
            {
                if (it->key().size() != 32)
                    Throw<std::runtime_error>(
                        "Unexpected key size " +
                        std::to_string(it->key().size()));
                void const* const key = it->key().data();
                void const* const data = it->value().data();
                auto const size = it->value().size();
                std::unique_ptr<char[]> clean(new char[size]);
                std::memcpy(clean.get(), data, size);
                filter_inner(clean.get(), size);
                auto const out = nodeobject_compress(clean.get(), size, buf);
                // Verify codec correctness
                {
                    buffer buf2;
                    auto const check =
                        nodeobject_decompress(out.first, out.second, buf2);
                    BEAST_EXPECT(check.second == size);
                    BEAST_EXPECT(
                        std::memcmp(check.first, clean.get(), size) == 0);
                }
                // Data Record
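                // Layout: uint48 payload size, then the 32-byte key,
                // then the compressed payload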
                auto os = dw.prepare(
                    field<uint48_t>::size + // Size
                        32 +                // Key
                        out.second,
                    ec);
                if (ec)
                    Throw<nudb::system_error>(ec);
                write<uint48_t>(os, out.second);
                std::memcpy(os.data(32), key, 32);
                std::memcpy(os.data(out.second), out.first, out.second);
                ++nitems;
            }
            dw.flush(ec);
            if (ec)
                Throw<nudb::system_error>(ec);
        }
        db.reset();
        log << "Import data: "
            << detail::fmtdur(std::chrono::steady_clock::now() - start);
        auto const df_size = df.size(ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        // Create key file
        key_file_header kh;
        kh.version = currentVersion;
        kh.uid = dh.uid;
        kh.appnum = dh.appnum;
        kh.key_size = 32;
        kh.salt = make_salt();
        kh.pepper = pepper<hash_type>(kh.salt);
        kh.block_size = block_size(kp);
        kh.load_factor = std::min<std::size_t>(65536.0 * load_factor, 65535);
        kh.buckets =
            std::ceil(nitems / (bucket_capacity(kh.block_size) * load_factor));
        kh.modulus = ceil_pow2(kh.buckets);
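        // Example with illustrative numbers: 100M items at 113
        // keys/bucket and load_factor 0.5 gives
        // ceil(1e8 / 56.5) = 1,769,912 buckets, rounded up to a
        // power-of-two modulus of 2^21 = 2,097,152.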
        native_file kf;
        kf.create(file_mode::append, kp, ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        buffer buf(kh.block_size);
        {
            std::memset(buf.get(), 0, kh.block_size);
            ostream os(buf.get(), kh.block_size);
            write(os, kh);
            kf.write(0, buf.get(), kh.block_size, ec);
            if (ec)
                Throw<nudb::system_error>(ec);
        }
        // Build contiguous sequential sections of the
        // key file using multiple passes over the data.
        //
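        // Each pass holds `buckets` buckets in memory and scans the
        // entire data file once, so the total data read is
        // passes * df_size (see the math note at the top of this file).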
        auto const buckets =
            std::max<std::size_t>(1, buffer_size / kh.block_size);
        buf.reserve(buckets * kh.block_size);
        auto const passes = (kh.buckets + buckets - 1) / buckets;
        log << "items: " << nitems
            << "\n"
               "buckets: "
            << kh.buckets
            << "\n"
               "data: "
            << df_size
            << "\n"
               "passes: "
            << passes;
        progress p(df_size * passes);
        std::size_t npass = 0;
        for (std::size_t b0 = 0; b0 < kh.buckets; b0 += buckets)
        {
            auto const b1 = std::min(b0 + buckets, kh.buckets);
            // Buffered range is [b0, b1)
            auto const bn = b1 - b0;
            // Create empty buckets
            for (std::size_t i = 0; i < bn; ++i)
            {
                bucket b(kh.block_size, buf.get() + i * kh.block_size, empty);
            }
            // Insert all keys into buckets
            // Iterate Data File
            bulk_reader<native_file> r(
                df, dat_file_header::size, df_size, bulk_size);
            while (!r.eof())
            {
                auto const offset = r.offset();
                // Data Record or Spill Record
                std::size_t size;
                auto is = r.prepare(field<uint48_t>::size, ec); // Size
                if (ec)
                    Throw<nudb::system_error>(ec);
                read<uint48_t>(is, size);
                if (size > 0)
                {
                    // Data Record
                    is = r.prepare(
                        dh.key_size + // Key
                            size,     // Data
                        ec);
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    std::uint8_t const* const key = is.data(dh.key_size);
                    auto const h = hash<hash_type>(key, kh.key_size, kh.salt);
                    auto const n = bucket_index(h, kh.buckets, kh.modulus);
                    p(log, npass * df_size + r.offset());
                    if (n < b0 || n >= b1)
                        continue;
                    bucket b(
                        kh.block_size, buf.get() + (n - b0) * kh.block_size);
                    maybe_spill(b, dw, ec);
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    b.insert(offset, size, h);
                }
                else
                {
                    // VFALCO Should never get here
                    // Spill Record
                    is = r.prepare(field<std::uint16_t>::size, ec);
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    read<std::uint16_t>(is, size); // Size
                    r.prepare(size, ec); // skip
                    if (ec)
                        Throw<nudb::system_error>(ec);
                }
            }
            kf.write(
                (b0 + 1) * kh.block_size, buf.get(), bn * kh.block_size, ec);
            if (ec)
                Throw<nudb::system_error>(ec);
            ++npass;
        }
        dw.flush(ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        p.finish(log);
    }
};

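// Manual suite: not run as part of the default test sweep; it must be
// selected explicitly and given arguments via --unittest-arg, per the
// usage text above.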
BEAST_DEFINE_TESTSUITE_MANUAL(import, nodestore, ripple);

#endif

//------------------------------------------------------------------------------

} // namespace NodeStore
} // namespace ripple