import_test.cpp
//------------------------------------------------------------------------------
/*
    This file is part of rippled: https://github.com/ripple/rippled
    Copyright (c) 2012, 2013 Ripple Labs Inc.

    Permission to use, copy, modify, and/or distribute this software for any
    purpose with or without fee is hereby granted, provided that the above
    copyright notice and this permission notice appear in all copies.

    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
//==============================================================================

#include <xrpld/nodestore/detail/codec.h>
#include <xrpld/unity/rocksdb.h>
#include <xrpl/basics/contract.h>
#include <xrpl/beast/clock/basic_seconds_clock.h>
#include <xrpl/beast/core/LexicalCast.h>
#include <xrpl/beast/rfc2616.h>
#include <xrpl/beast/unit_test.h>

#include <boost/beast/core/string.hpp>
#include <boost/regex.hpp>
#include <nudb/create.hpp>
#include <nudb/detail/format.hpp>
#include <nudb/xxhasher.hpp>

#include <algorithm>
#include <chrono>
#include <iomanip>
#include <map>
#include <sstream>

/*

Math:

1000 GB dat file
170 GB key file
capacity 113 keys/bucket

normal:
1,000 GB data file read
19,210 GB key file read (113 * 170)
19,210 GB key file write

multi (32 GB buffer):
6 passes (170/32, rounded up)
6,000 GB data file read
170 GB key file write

*/
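/*
In general (illustrative arithmetic following the example above): with a
key file of size K, a bucket buffer of size B, and a data file of size D,
the buffered import makes ceil(K / B) passes and reads the data file once
per pass, for ceil(K / B) * D bytes of reads in total, while the key file
is written exactly once. Here: ceil(170 / 32) = 6 passes * 1000 GB =
6,000 GB of data file reads.
*/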

namespace ripple {
namespace NodeStore {

namespace detail {

// RAII guard: saves a stream's formatting state on construction
// and restores it on destruction.
class save_stream_state
{
    std::ostream& os_;
    std::streamsize precision_;
    std::ios::fmtflags flags_;
    std::ios::char_type fill_;

public:
    ~save_stream_state()
    {
        os_.precision(precision_);
        os_.flags(flags_);
        os_.fill(fill_);
    }
    save_stream_state(save_stream_state const&) = delete;
    save_stream_state&
    operator=(save_stream_state const&) = delete;
    explicit save_stream_state(std::ostream& os)
        : os_(os)
        , precision_(os.precision())
        , flags_(os.flags())
        , fill_(os.fill())
    {
    }
};

template <class Rep, class Period>
std::ostream&
pretty_time(std::ostream& os, std::chrono::duration<Rep, Period> d)
{
    save_stream_state _(os);
    using namespace std::chrono;
    if (d < microseconds{1})
    {
        // use nanoseconds
        if (d < nanoseconds{100})
        {
            // use floating
            using ns = duration<float, std::nano>;
            os << std::fixed << std::setprecision(1) << ns(d).count();
        }
        else
        {
            // use integral
            os << round<nanoseconds>(d).count();
        }
        os << "ns";
    }
    else if (d < milliseconds{1})
    {
        // use microseconds
        if (d < microseconds{100})
        {
            // use floating
            using ms = duration<float, std::micro>;
            os << std::fixed << std::setprecision(1) << ms(d).count();
        }
        else
        {
            // use integral
            os << round<microseconds>(d).count();
        }
        os << "us";
    }
    else if (d < seconds{1})
    {
        // use milliseconds
        if (d < milliseconds{100})
        {
            // use floating
            using ms = duration<float, std::milli>;
            os << std::fixed << std::setprecision(1) << ms(d).count();
        }
        else
        {
            // use integral
            os << round<milliseconds>(d).count();
        }
        os << "ms";
    }
    else if (d < minutes{1})
    {
        // use seconds
        if (d < seconds{100})
        {
            // use floating
            using s = duration<float>;
            os << std::fixed << std::setprecision(1) << s(d).count();
        }
        else
        {
            // use integral
            os << round<seconds>(d).count();
        }
        os << "s";
    }
    else
    {
        // use minutes
        if (d < minutes{100})
        {
            // use floating
            using m = duration<float, std::ratio<60>>;
            os << std::fixed << std::setprecision(1) << m(d).count();
        }
        else
        {
            // use integral
            os << round<minutes>(d).count();
        }
        os << "min";
    }
    return os;
}
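// For example (illustrative): pretty_time(os, std::chrono::milliseconds(1500))
// writes "1.5s", while pretty_time(os, std::chrono::microseconds(250))
// writes "250us".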

template <class Period, class Rep>
inline std::string
fmtdur(std::chrono::duration<Period, Rep> const& d)
{
    std::stringstream ss;
    pretty_time(ss, d);
    return ss.str();
}

} // namespace detail

//------------------------------------------------------------------------------

// Periodically logs progress and an estimated time to completion
// for a long-running job measured in abstract units of work.
class progress
{
private:
    using clock_type = beast::basic_seconds_clock;

    std::size_t const work_;
    clock_type::time_point start_ = clock_type::now();
    clock_type::time_point now_ = clock_type::now();
    clock_type::time_point report_ = clock_type::now();
    std::size_t prev_ = 0;
    bool estimate_ = false;

public:
    explicit progress(std::size_t work) : work_(work)
    {
    }

    template <class Log>
    void
    operator()(Log& log, std::size_t work)
    {
        using namespace std::chrono;
        auto const now = clock_type::now();
        if (now == now_)
            return;
        now_ = now;
        auto const elapsed = now - start_;
        if (!estimate_)
        {
            if (elapsed < seconds(15))
                return;
            estimate_ = true;
        }
        else if (now - report_ < std::chrono::seconds(60))
        {
            return;
        }
        auto const rate = elapsed.count() / double(work);
        clock_type::duration const remain(
            static_cast<clock_type::duration::rep>((work_ - work) * rate));
        log << "Remaining: " << detail::fmtdur(remain) << " (" << work << " of "
            << work_ << " in " << detail::fmtdur(elapsed) << ", "
            << (work - prev_) << " in " << detail::fmtdur(now - report_) << ")";
        report_ = now;
        prev_ = work;
    }

    template <class Log>
    void
    finish(Log& log)
    {
        log << "Total time: " << detail::fmtdur(clock_type::now() - start_);
    }
};
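// Typical use (sketch): construct with the total amount of work, call
// operator() with the cumulative work completed as the job runs, and call
// finish() at the end. The first report is deferred for 15 seconds to
// gather a rate estimate; subsequent reports are throttled to about one
// per minute.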

std::map<std::string, std::string, boost::beast::iless>
parse_args(std::string const& s)
{
    // <key> '=' <value>
    static boost::regex const re1(
        "^"                        // start of line
        "(?:\\s*)"                 // whitespace (optional)
        "([a-zA-Z][_a-zA-Z0-9]*)"  // <key>
        "(?:\\s*)"                 // whitespace (optional)
        "(?:=)"                    // '='
        "(?:\\s*)"                 // whitespace (optional)
        "(.*\\S+)"                 // <value>
        "(?:\\s*)"                 // whitespace (optional)
        ,
        boost::regex_constants::optimize);
    std::map<std::string, std::string, boost::beast::iless> map;
    auto const v = beast::rfc2616::split(s.begin(), s.end(), ',');
    for (auto const& kv : v)
    {
        boost::smatch m;
        if (!boost::regex_match(kv, m, re1))
            Throw<std::runtime_error>("invalid parameter " + kv);
        auto const result = map.emplace(m[1], m[2]);
        if (!result.second)
            Throw<std::runtime_error>("duplicate parameter " + m[1]);
    }
    return map;
}
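// For example (illustrative):
//   parse_args("from=/rocks,to=/nudb,buffer=1000000")
// yields { "from" -> "/rocks", "to" -> "/nudb", "buffer" -> "1000000" }.
// Keys compare case-insensitively because the map uses boost::beast::iless.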

//------------------------------------------------------------------------------

#if RIPPLE_ROCKSDB_AVAILABLE

class import_test : public beast::unit_test::suite
{
public:
    void
    run() override
    {
        testcase(beast::unit_test::abort_on_fail) << arg();

        using namespace nudb;
        using namespace nudb::detail;

        pass();
        auto const args = parse_args(arg());
        bool usage = args.empty();

        if (!usage && args.find("from") == args.end())
        {
            log << "Missing parameter: from";
            usage = true;
        }
        if (!usage && args.find("to") == args.end())
        {
            log << "Missing parameter: to";
            usage = true;
        }
        if (!usage && args.find("buffer") == args.end())
        {
            log << "Missing parameter: buffer";
            usage = true;
        }

        if (usage)
        {
            log << "Usage:\n"
                << "--unittest-arg=from=<from>,to=<to>,buffer=<buffer>\n"
                << "from: RocksDB database to import from\n"
                << "to: NuDB database to import to\n"
                << "buffer: Buffer size (bigger is faster)\n"
                << "NuDB database must not already exist.";
            return;
        }

        // This controls the size of the bucket buffer.
        // For a 1TB data file, a 32GB bucket buffer is suggested.
        // The larger the buffer, the faster the import.
        //
        std::size_t const buffer_size = std::stoull(args.at("buffer"));
        auto const from_path = args.at("from");
        auto const to_path = args.at("to");

        using hash_type = nudb::xxhasher;
        auto const bulk_size = 64 * 1024 * 1024;
        float const load_factor = 0.5;

        auto const dp = to_path + ".dat";
        auto const kp = to_path + ".key";

        auto const start = std::chrono::steady_clock::now();

        log << "from: " << from_path
            << "\n"
               "to: "
            << to_path
            << "\n"
               "buffer: "
            << buffer_size;

        std::unique_ptr<rocksdb::DB> db;
        {
            rocksdb::Options options;
            options.create_if_missing = false;
            options.max_open_files = 2000;  // 5000?
            rocksdb::DB* pdb = nullptr;
            rocksdb::Status status =
                rocksdb::DB::OpenForReadOnly(options, from_path, &pdb);
            if (!status.ok() || !pdb)
                Throw<std::runtime_error>(
                    "Can't open '" + from_path + "': " + status.ToString());
            db.reset(pdb);
        }
        // Create data file with values
        std::size_t nitems = 0;
        dat_file_header dh;
        dh.version = currentVersion;
        dh.uid = make_uid();
        dh.appnum = 1;
        dh.key_size = 32;

        native_file df;
        error_code ec;
        df.create(file_mode::append, dp, ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        bulk_writer<native_file> dw(df, 0, bulk_size);
        {
            {
                auto os = dw.prepare(dat_file_header::size, ec);
                if (ec)
                    Throw<nudb::system_error>(ec);
                write(os, dh);
            }
            rocksdb::ReadOptions options;
            options.verify_checksums = false;
            options.fill_cache = false;
            std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(options));

            buffer buf;
            for (it->SeekToFirst(); it->Valid(); it->Next())
            {
                if (it->key().size() != 32)
                    Throw<std::runtime_error>(
                        "Unexpected key size " +
                        std::to_string(it->key().size()));
                void const* const key = it->key().data();
                void const* const data = it->value().data();
                auto const size = it->value().size();
                std::unique_ptr<char[]> clean(new char[size]);
                std::memcpy(clean.get(), data, size);
                filter_inner(clean.get(), size);
                auto const out = nodeobject_compress(clean.get(), size, buf);
                // Verify codec correctness
                {
                    buffer buf2;
                    auto const check =
                        nodeobject_decompress(out.first, out.second, buf2);
                    BEAST_EXPECT(check.second == size);
                    BEAST_EXPECT(
                        std::memcmp(check.first, clean.get(), size) == 0);
                }
                // Data Record
                auto os = dw.prepare(
                    field<uint48_t>::size +  // Size
                        32 +                 // Key
                        out.second,
                    ec);
                if (ec)
                    Throw<nudb::system_error>(ec);
                write<uint48_t>(os, out.second);
                std::memcpy(os.data(32), key, 32);
                std::memcpy(os.data(out.second), out.first, out.second);
                ++nitems;
            }
            dw.flush(ec);
            if (ec)
                Throw<nudb::system_error>(ec);
        }
        db.reset();
        log << "Import data: "
            << detail::fmtdur(std::chrono::steady_clock::now() - start);
        auto const df_size = df.size(ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        // Create key file
        key_file_header kh;
        kh.version = currentVersion;
        kh.uid = dh.uid;
        kh.appnum = dh.appnum;
        kh.key_size = 32;
        kh.salt = make_salt();
        kh.pepper = pepper<hash_type>(kh.salt);
        kh.block_size = block_size(kp);
        kh.load_factor = std::min<std::size_t>(65536.0 * load_factor, 65535);
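        // The bucket count below is sized so the table sits at the target
        // load factor; modulus is the smallest power of two >= kh.buckets
        // and feeds bucket_index() during the passes further down.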
        kh.buckets =
            std::ceil(nitems / (bucket_capacity(kh.block_size) * load_factor));
        kh.modulus = ceil_pow2(kh.buckets);
        native_file kf;
        kf.create(file_mode::append, kp, ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        buffer buf(kh.block_size);
        {
            std::memset(buf.get(), 0, kh.block_size);
            ostream os(buf.get(), kh.block_size);
            write(os, kh);
            kf.write(0, buf.get(), kh.block_size, ec);
            if (ec)
                Throw<nudb::system_error>(ec);
        }
        // Build contiguous sequential sections of the
        // key file using multiple passes over the data.
        //
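        // Each pass keeps `buckets` key-file blocks in memory and scans
        // the whole data file once, so the pass count computed below is
        // ceil(kh.buckets / buckets); this is the trade-off worked out in
        // the comment at the top of this file.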
        auto const buckets =
            std::max<std::size_t>(1, buffer_size / kh.block_size);
        buf.reserve(buckets * kh.block_size);
        auto const passes = (kh.buckets + buckets - 1) / buckets;
        log << "items: " << nitems
            << "\n"
               "buckets: "
            << kh.buckets
            << "\n"
               "data: "
            << df_size
            << "\n"
               "passes: "
            << passes;
        progress p(df_size * passes);
        std::size_t npass = 0;
        for (std::size_t b0 = 0; b0 < kh.buckets; b0 += buckets)
        {
            auto const b1 = std::min(b0 + buckets, kh.buckets);
            // Buffered range is [b0, b1)
            auto const bn = b1 - b0;
            // Create empty buckets
            for (std::size_t i = 0; i < bn; ++i)
            {
                bucket b(kh.block_size, buf.get() + i * kh.block_size, empty);
            }
            // Insert all keys into buckets
            // Iterate Data File
            bulk_reader<native_file> r(
                df, dat_file_header::size, df_size, bulk_size);
            while (!r.eof())
            {
                auto const offset = r.offset();
                // Data Record or Spill Record
                std::size_t size;
                auto is = r.prepare(field<uint48_t>::size, ec);  // Size
                if (ec)
                    Throw<nudb::system_error>(ec);
                read<uint48_t>(is, size);
                if (size > 0)
                {
                    // Data Record
                    is = r.prepare(
                        dh.key_size +  // Key
                            size,
                        ec);           // Data
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    std::uint8_t const* const key = is.data(dh.key_size);
                    auto const h = hash<hash_type>(key, kh.key_size, kh.salt);
                    auto const n = bucket_index(h, kh.buckets, kh.modulus);
                    p(log, npass * df_size + r.offset());
                    if (n < b0 || n >= b1)
                        continue;
                    bucket b(
                        kh.block_size, buf.get() + (n - b0) * kh.block_size);
                    maybe_spill(b, dw, ec);
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    b.insert(offset, size, h);
                }
                else
                {
                    // VFALCO Should never get here
                    // Spill Record
                    is = r.prepare(field<std::uint16_t>::size, ec);
                    if (ec)
                        Throw<nudb::system_error>(ec);
                    read<std::uint16_t>(is, size);  // Size
                    r.prepare(size, ec);            // skip
                    if (ec)
                        Throw<nudb::system_error>(ec);
                }
            }
            kf.write(
                (b0 + 1) * kh.block_size, buf.get(), bn * kh.block_size, ec);
            if (ec)
                Throw<nudb::system_error>(ec);
            ++npass;
        }
        dw.flush(ec);
        if (ec)
            Throw<nudb::system_error>(ec);
        p.finish(log);
    }
};

BEAST_DEFINE_TESTSUITE_MANUAL(import, NodeStore, ripple);
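// Note: as a manual suite this only runs when selected explicitly; for
// example (illustrative invocation, paths are placeholders):
//   rippled --unittest=ripple.NodeStore.import
//       --unittest-arg="from=/rocksdb/db,to=/nudb/db,buffer=32000000000"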

#endif

//------------------------------------------------------------------------------

} // namespace NodeStore
} // namespace ripple