import_test.cpp
#include <xrpl/basics/contract.h>
#include <xrpl/basics/rocksdb.h>
#include <xrpl/beast/clock/basic_seconds_clock.h>
#include <xrpl/beast/core/LexicalCast.h>
#include <xrpl/beast/rfc2616.h>
#include <xrpl/beast/unit_test.h>
#include <xrpl/nodestore/detail/codec.h>

#include <boost/beast/core/string.hpp>
#include <boost/regex.hpp>

#include <nudb/create.hpp>
#include <nudb/detail/format.hpp>
#include <nudb/xxhasher.hpp>

#include <algorithm>
#include <chrono>
#include <iomanip>
#include <map>
#include <sstream>
/*

Math:

1000 gb dat file
170 gb key file
capacity 113 keys/bucket

normal:
1,000gb data file read
19,210gb key file read (113 * 170)
19,210gb key file write

multi(32gb):
6 passes (170/32)
6,000gb data file read
170gb key file write

*/
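/*
Worked form of the arithmetic above (the sizes are assumptions carried over
from the comment, not measurements): a naive import that updates buckets in
place rewrites each 113-key bucket once per key, touching roughly
113 * 170 gb = 19,210 gb of key file in each direction. The multi-pass
scheme below instead buffers 32 gb of buckets in memory and scans the data
file once per window: ceil(170 / 32) = 6 passes, i.e. 6 * 1,000 gb =
6,000 gb of data file reads, with each key file block written exactly once
(170 gb total).
*/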

namespace xrpl {

namespace detail {

class save_stream_state
{
    std::ostream& os_;
    std::streamsize precision_;
    std::ios::fmtflags flags_;
    std::ios::char_type fill_;

public:
    ~save_stream_state()
    {
        os_.precision(precision_);
        os_.flags(flags_);
        os_.fill(fill_);
    }
    save_stream_state(save_stream_state const&) = delete;
    save_stream_state&
    operator=(save_stream_state const&) = delete;
    explicit save_stream_state(std::ostream& os)
        : os_(os), precision_(os.precision()), flags_(os.flags()), fill_(os.fill())
    {
    }
};

template <class Rep, class Period>
std::ostream&
pretty_time(std::ostream& os, std::chrono::duration<Rep, Period> d)
{
    save_stream_state _(os);
    using namespace std::chrono;
    if (d < microseconds{1})
    {
        // use nanoseconds
        if (d < nanoseconds{100})
        {
            // use floating
            using ns = duration<float, std::nano>;
            os << std::fixed << std::setprecision(1) << ns(d).count();
        }
        else
        {
            // use integral
            os << round<nanoseconds>(d).count();
        }
        os << "ns";
    }
    else if (d < milliseconds{1})
    {
        // use microseconds
        if (d < microseconds{100})
        {
            // use floating
            using ms = duration<float, std::micro>;
            os << std::fixed << std::setprecision(1) << ms(d).count();
        }
        else
        {
            // use integral
            os << round<microseconds>(d).count();
        }
        os << "us";
    }
    else if (d < seconds{1})
    {
        // use milliseconds
        if (d < milliseconds{100})
        {
            // use floating
            using ms = duration<float, std::milli>;
            os << std::fixed << std::setprecision(1) << ms(d).count();
        }
        else
        {
            // use integral
            os << round<milliseconds>(d).count();
        }
        os << "ms";
    }
    else if (d < minutes{1})
    {
        // use seconds
        if (d < seconds{100})
        {
            // use floating
            using s = duration<float>;
            os << std::fixed << std::setprecision(1) << s(d).count();
        }
        else
        {
            // use integral
            os << round<seconds>(d).count();
        }
        os << "s";
    }
    else
    {
        // use minutes
        if (d < minutes{100})
        {
            // use floating
            using m = duration<float, std::ratio<60>>;
            os << std::fixed << std::setprecision(1) << m(d).count();
        }
        else
        {
            // use integral
            os << round<minutes>(d).count();
        }
        os << "min";
    }
    return os;
}

template <class Period, class Rep>
inline std::string
fmtdur(std::chrono::duration<Period, Rep> const& d)
{
    std::stringstream ss;
    pretty_time(ss, d);
    return ss.str();
}
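// Illustrative outputs (not part of the original source): fmtdur picks the
// coarsest unit that keeps the value readable, switching to one decimal
// place whenever fewer than 100 of that unit would print.
//
//   detail::fmtdur(std::chrono::milliseconds(90));  // "90.0ms"
//   detail::fmtdur(std::chrono::seconds(150));      // "2.5min"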

} // namespace detail

namespace NodeStore {

//------------------------------------------------------------------------------

class progress
{
private:
    using clock_type = beast::basic_seconds_clock;

    std::size_t const work_;
    clock_type::time_point start_ = clock_type::now();
    clock_type::time_point now_ = clock_type::now();
    clock_type::time_point report_ = clock_type::now();
    std::size_t prev_ = 0;
    bool estimate_ = false;

public:
    explicit progress(std::size_t work) : work_(work)
    {
    }

    template <class Log>
    void
    operator()(Log& log, std::size_t work)
    {
        using namespace std::chrono;
        auto const now = clock_type::now();
        if (now == now_)
            return;
        now_ = now;
        auto const elapsed = now - start_;
        if (!estimate_)
        {
            if (elapsed < seconds(15))
                return;
            estimate_ = true;
        }
        else if (now - report_ < std::chrono::seconds(60))
        {
            return;
        }
        auto const rate = elapsed.count() / double(work);
        clock_type::duration const remain(
            static_cast<clock_type::duration::rep>((work_ - work) * rate));
        log << "Remaining: " << detail::fmtdur(remain) << " (" << work << " of "
            << work_ << " in " << detail::fmtdur(elapsed) << ", "
            << (work - prev_) << " in " << detail::fmtdur(now - report_) << ")";
        report_ = now;
        prev_ = work;
    }

    template <class Log>
    void
    finish(Log& log)
    {
        log << "Total time: " << detail::fmtdur(clock_type::now() - start_);
    }
};
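// A minimal usage sketch for progress (hypothetical loop, not from the
// original file): construct with the total work, report completed work as
// it accumulates, then finish. The first estimate appears after 15 seconds
// of elapsed time and subsequent reports are throttled to one per minute.
//
//   progress p(total_work);
//   for (std::size_t done = 0; done < total_work; done += step)
//       p(log, done);
//   p.finish(log);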

std::map<std::string, std::string, boost::beast::iless>
parse_args(std::string const& s)
{
    // <key> '=' <value>
    static boost::regex const re1(
        "^"                        // start of line
        "(?:\\s*)"                 // whitespace (optional)
        "([a-zA-Z][_a-zA-Z0-9]*)"  // <key>
        "(?:\\s*)"                 // whitespace (optional)
        "(?:=)"                    // '='
        "(?:\\s*)"                 // whitespace (optional)
        "(.*\\S+)"                 // <value>
        "(?:\\s*)"                 // whitespace (optional)
        ,
        boost::regex_constants::optimize);
    std::map<std::string, std::string, boost::beast::iless> map;
    auto const v = beast::rfc2616::split(s.begin(), s.end(), ',');
    for (auto const& kv : v)
    {
        boost::smatch m;
        if (!boost::regex_match(kv, m, re1))
            Throw<std::runtime_error>("invalid parameter " + kv);
        auto const result = map.emplace(m[1], m[2]);
        if (!result.second)
            Throw<std::runtime_error>("duplicate parameter " + m[1]);
    }
    return map;
}
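// For example (illustrative): parse_args("from=/in,to=/out,buffer=1000")
// yields {"from" -> "/in", "to" -> "/out", "buffer" -> "1000"}. Keys
// compare case-insensitively via boost::beast::iless, and a repeated key
// throws std::runtime_error.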

//------------------------------------------------------------------------------

#if XRPL_ROCKSDB_AVAILABLE

class import_test : public beast::unit_test::suite
{
public:
    void
    run() override
    {
        testcase(beast::unit_test::abort_on_fail) << arg();

        using namespace nudb;
        using namespace nudb::detail;

        pass();
        auto const args = parse_args(arg());
        bool usage = args.empty();

        if (!usage && args.find("from") == args.end())
        {
            log << "Missing parameter: from";
            usage = true;
        }
        if (!usage && args.find("to") == args.end())
        {
            log << "Missing parameter: to";
            usage = true;
        }
        if (!usage && args.find("buffer") == args.end())
        {
            log << "Missing parameter: buffer";
            usage = true;
        }

        if (usage)
        {
            log << "Usage:\n"
                << "--unittest-arg=from=<from>,to=<to>,buffer=<buffer>\n"
                << "from: RocksDB database to import from\n"
                << "to: NuDB database to import to\n"
                << "buffer: Buffer size (bigger is faster)\n"
                << "NuDB database must not already exist.";
            return;
        }

        // This controls the size of the bucket buffer.
        // For a 1TB data file, a 32GB bucket buffer is suggested.
        // The larger the buffer, the faster the import.
        //
        std::size_t const buffer_size = std::stoull(args.at("buffer"));
        auto const from_path = args.at("from");
        auto const to_path = args.at("to");

        using hash_type = nudb::xxhasher;
        auto const bulk_size = 64 * 1024 * 1024;
        float const load_factor = 0.5;

        auto const dp = to_path + ".dat";
        auto const kp = to_path + ".key";

        auto const start = std::chrono::steady_clock::now();

        log << "from: " << from_path
            << "\n"
               "to: "
            << to_path
            << "\n"
               "buffer: "
            << buffer_size;

        std::unique_ptr<rocksdb::DB> db;
        {
            rocksdb::Options options;
            options.create_if_missing = false;
            options.max_open_files = 2000;  // 5000?
            rocksdb::DB* pdb = nullptr;
            rocksdb::Status status =
                rocksdb::DB::OpenForReadOnly(options, from_path, &pdb);
            if (!status.ok() || !pdb)
                Throw<std::runtime_error>(
                    "Can't open '" + from_path + "': " + status.ToString());
            db.reset(pdb);
        }
        // Create data file with values
        std::size_t nitems = 0;
        dat_file_header dh;
        dh.version = currentVersion;
        dh.uid = make_uid();
        dh.appnum = 1;
        dh.key_size = 32;

        native_file df;
        error_code ec;
        df.create(file_mode::append, dp, ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        bulk_writer<native_file> dw(df, 0, bulk_size);
        {
            {
                auto os = dw.prepare(dat_file_header::size, ec);
                if (ec)
                    Throw<nudb::system_error>(ec);
                write(os, dh);
            }
            rocksdb::ReadOptions options;
            options.verify_checksums = false;
            options.fill_cache = false;
            std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options));

            buffer buf;
            for (it->SeekToFirst(); it->Valid(); it->Next())
            {
                if (it->key().size() != 32)
                    Throw<std::runtime_error>(
                        "Unexpected key size " +
                        std::to_string(it->key().size()));
                void const* const key = it->key().data();
                void const* const data = it->value().data();
                auto const size = it->value().size();
                std::unique_ptr<char[]> clean(new char[size]);
                std::memcpy(clean.get(), data, size);
                filter_inner(clean.get(), size);
                auto const out = nodeobject_compress(clean.get(), size, buf);
                // Verify codec correctness
                {
                    buffer buf2;
                    auto const check =
                        nodeobject_decompress(out.first, out.second, buf2);
                    BEAST_EXPECT(check.second == size);
                    BEAST_EXPECT(
                        std::memcmp(check.first, clean.get(), size) == 0);
                }
                // Data Record
                auto os = dw.prepare(
                    field<uint48_t>::size +  // Size
                        32 +                 // Key
                        out.second,
                    ec);
                if (ec)
                    Throw<nudb::system_error>(ec);
                write<uint48_t>(os, out.second);
                std::memcpy(os.data(32), key, 32);
                std::memcpy(os.data(out.second), out.first, out.second);
                ++nitems;
            }
            dw.flush(ec);
            if (ec)
                Throw<nudb::system_error>(ec);
        }
        db.reset();
        log << "Import data: "
            << detail::fmtdur(std::chrono::steady_clock::now() - start);
        auto const df_size = df.size(ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        // Create key file
        key_file_header kh;
        kh.version = currentVersion;
        kh.uid = dh.uid;
        kh.appnum = dh.appnum;
        kh.key_size = 32;
        kh.salt = make_salt();
        kh.pepper = pepper<hash_type>(kh.salt);
        kh.block_size = block_size(kp);
        kh.load_factor = std::min<std::size_t>(65536.0 * load_factor, 65535);
        kh.buckets =
            std::ceil(nitems / (bucket_capacity(kh.block_size) * load_factor));
        kh.modulus = ceil_pow2(kh.buckets);
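        // Worked example of the header math, using the assumed numbers from
        // the comment at the top of the file: at 113 keys per bucket and
        // load_factor = 0.5, each bucket targets 56.5 keys, so one billion
        // items need ceil(1e9 / 56.5) = 17,699,116 buckets; modulus rounds
        // that up to the next power of two for the linear-hashing
        // bucket_index computation below.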
        native_file kf;
        kf.create(file_mode::append, kp, ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        buffer buf(kh.block_size);
        {
            std::memset(buf.get(), 0, kh.block_size);
            ostream os(buf.get(), kh.block_size);
            write(os, kh);
            kf.write(0, buf.get(), kh.block_size, ec);
            if (ec)
                Throw<nudb::system_error>(ec);
        }
        // Build contiguous sequential sections of the
        // key file using multiple passes over the data.
        //
        auto const buckets =
            std::max<std::size_t>(1, buffer_size / kh.block_size);
        buf.reserve(buckets * kh.block_size);
        auto const passes = (kh.buckets + buckets - 1) / buckets;
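        // E.g. (assumed numbers, continuing the example above): a 32 gb
        // buffer over 4096-byte blocks holds 8,388,608 buckets at a time,
        // so ~17.7 million total buckets need
        // ceil(17,699,116 / 8,388,608) = 3 passes over the data file.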
        log << "items: " << nitems
            << "\n"
               "buckets: "
            << kh.buckets
            << "\n"
               "data: "
            << df_size
            << "\n"
               "passes: "
            << passes;
        progress p(df_size * passes);
        std::size_t npass = 0;
        for (std::size_t b0 = 0; b0 < kh.buckets; b0 += buckets)
        {
            auto const b1 = std::min(b0 + buckets, kh.buckets);
            // Buffered range is [b0, b1)
            auto const bn = b1 - b0;
            // Create empty buckets
            for (std::size_t i = 0; i < bn; ++i)
            {
                bucket b(kh.block_size, buf.get() + i * kh.block_size, empty);
            }
            // Insert all keys into buckets
            // Iterate Data File
            bulk_reader<native_file> r(
                df, dat_file_header::size, df_size, bulk_size);
            while (!r.eof())
            {
                auto const offset = r.offset();
                // Data Record or Spill Record
                std::size_t size;
                auto is = r.prepare(field<uint48_t>::size, ec);  // Size
                if (ec)
                    Throw<nudb::system_error>(ec);
                read<uint48_t>(is, size);
                if (size > 0)
                {
                    // Data Record
                    is = r.prepare(
                        dh.key_size +  // Key
                            size,      // Data
                        ec);
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    std::uint8_t const* const key = is.data(dh.key_size);
                    auto const h = hash<hash_type>(key, kh.key_size, kh.salt);
                    auto const n = bucket_index(h, kh.buckets, kh.modulus);
                    p(log, npass * df_size + r.offset());
                    if (n < b0 || n >= b1)
                        continue;
                    bucket b(
                        kh.block_size, buf.get() + (n - b0) * kh.block_size);
                    maybe_spill(b, dw, ec);
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    b.insert(offset, size, h);
                }
                else
                {
                    // VFALCO Should never get here
                    // Spill Record
                    is = r.prepare(field<std::uint16_t>::size, ec);
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    read<std::uint16_t>(is, size);  // Size
                    r.prepare(size, ec);            // skip
                    if (ec)
                        Throw<nudb::system_error>(ec);
                }
            }
            kf.write(
                (b0 + 1) * kh.block_size, buf.get(), bn * kh.block_size, ec);
            if (ec)
                Throw<nudb::system_error>(ec);
            ++npass;
        }
        dw.flush(ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        p.finish(log);
    }
};

BEAST_DEFINE_TESTSUITE_MANUAL(import, nodestore, xrpl);
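// The suite is registered as manual, so it only runs when named explicitly.
// A typical invocation (paths and buffer size are placeholders) might look
// like:
//
//   --unittest=import --unittest-arg="from=/db/rocksdb,to=/db/nudb,buffer=34359738368"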

#endif

//------------------------------------------------------------------------------

} // namespace NodeStore
} // namespace xrpl