rippled/bin/sidechain/python/log_report.py
seelabs d86b1f8b7d Script to generate reports from logs, and bug fixes:
* log_report.py is a script to generate debugging reports and combine the logs
of the locally run mainchain and sidechain servers.

* Log address book before pytest start

* Cleanup test utils

* Modify log_analyzer so it joins all logs into a single file

* Organize "all" log as a dictionary

* Allow ConfigFile and Section classes to be pickled:
Not being picklable caused a bug on mac platforms. Linux did not appear to use pickle.

* Add account history command to py scripts

* Add additional logging

* Add support to run sidechains under rr:

This is an undocumented feature to help debugging.
If the environment variable `RIPPLED_SIDECHAIN_RR` is set, it is assumed to
point to the rr executable. Sidechain 0 will then be run under rr (see the sketch below).
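
A minimal sketch of how such an environment-variable switch might be consumed by the launcher; the variable name comes from this commit message, but the function name, parameters, and the `rr record` invocation are illustrative assumptions rather than code from this repository:

    import os
    import subprocess

    def start_sidechain_server(server_index: int, rippled_cmd: list):
        # Hypothetical launcher: if RIPPLED_SIDECHAIN_RR names the rr
        # executable, wrap sidechain 0's command in `rr record`;
        # otherwise run it unchanged.
        rr = os.environ.get('RIPPLED_SIDECHAIN_RR')
        cmd = rippled_cmd
        if rr and server_index == 0:
            cmd = [rr, 'record'] + rippled_cmd
        return subprocess.Popen(cmd)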
2022-03-15 15:36:04 -04:00


#!/usr/bin/env python3
import argparse
from collections import defaultdict
import datetime
import json
import numpy as np
import os
import pandas as pd
import string
import sys
from typing import Dict, Set
from common import eprint
import log_analyzer
def _has_256bit_hex_field_other(data, result: Set[str]):
    return


_has_256bit_hex_field_overloads = defaultdict(
    lambda: _has_256bit_hex_field_other)
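# The handlers below register themselves in this table so that a value is
# dispatched on its python type: strings are checked for a 64-char hex
# pattern, dicts and lists recurse, and any other type is ignored.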
def _has_256bit_hex_field_str(data: str, result: Set[str]):
    if len(data) != 64:
        return
    for c in data:
        o = ord(c.upper())
        if ord('A') <= o <= ord('F'):
            continue
        if ord('0') <= o <= ord('9'):
            continue
        return
    result.add(data)


_has_256bit_hex_field_overloads[str] = _has_256bit_hex_field_str
def _has_256bit_hex_field_dict(data: dict, result: Set[str]):
    for k, v in data.items():
        if k in [
                "meta", "index", "LedgerIndex", "ledger_index", "ledger_hash",
                "SigningPubKey", "suppression"
        ]:
            continue
        _has_256bit_hex_field_overloads[type(v)](v, result)


_has_256bit_hex_field_overloads[dict] = _has_256bit_hex_field_dict


def _has_256bit_hex_field_list(data: list, result: Set[str]):
    for v in data:
        _has_256bit_hex_field_overloads[type(v)](v, result)


_has_256bit_hex_field_overloads[list] = _has_256bit_hex_field_list
def has_256bit_hex_field(data: dict) -> Set[str]:
    '''
    Find all the field values that are strings 64 chars long containing only
    hex digits. This is useful when grouping transactions by hash.
    '''
    result = set()
    _has_256bit_hex_field_dict(data, result)
    return result
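# Illustrative usage (the hash value is made up): calling
#   has_256bit_hex_field({'tx_json': {'hash': 'A3' * 32, 'ledger_hash': 'B4' * 32}})
# returns a set containing only the 64-char 'A3...' string, because
# 'ledger_hash' is one of the keys skipped above.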
def group_by_txn(data: dict) -> dict:
    '''
    Return a dictionary where the key is the transaction hash and the value is
    another dictionary. In the second dictionary the key is the server id and
    the value is a list of log items.
    '''
    def _make_default():
        return defaultdict(lambda: list())

    result = defaultdict(_make_default)
    for server_id, log_list in data.items():
        for log_item in log_list:
            if txn_hashes := has_256bit_hex_field(log_item):
                for h in txn_hashes:
                    result[h][server_id].append(log_item)
    return result
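# The returned structure is roughly:
#   {txn_hash: {'mainchain_0': [log_item, ...],
#               'sidechain_1': [log_item, ...],
#               ...},
#    ...}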
def _rekey_dict_by_txn_date(hash_to_timestamp: dict,
                            grouped_by_txn: dict) -> dict:
    '''
    hash_to_timestamp is a dictionary with a key of the txn hash and a value
    of the timestamp. grouped_by_txn is a dictionary with a key of the txn
    hash and an unspecified value. The keys in hash_to_timestamp are a subset
    of the keys in grouped_by_txn. This function returns a new grouped_by_txn
    dictionary with the transactions sorted by date; txns with no known
    timestamp come first.
    '''
    known_txns = [
        k for k, v in sorted(hash_to_timestamp.items(), key=lambda x: x[1])
    ]
    result = {}
    for k, v in grouped_by_txn.items():
        if k not in known_txns:
            result[k] = v
    for h in known_txns:
        result[h] = grouped_by_txn[h]
    return result
def _to_timestamp(str_time: str) -> datetime.datetime:
    return datetime.datetime.strptime(
        str_time.split('.')[0], "%Y-%b-%d %H:%M:%S")
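# Example: _to_timestamp('2021-Oct-08 21:33:41.731371562 UTC') drops everything
# after the first '.' and parses the rest as datetime(2021, 10, 8, 21, 33, 41).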
class Report:
    def __init__(self, in_dir, out_dir):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.combined_logs_file_name = f'{self.out_dir}/combined_logs.json'
        self.grouped_by_txn_file_name = f'{self.out_dir}/grouped_by_txn.json'
        self.counts_by_txn_and_server_file_name = f'{self.out_dir}/counts_by_txn_and_server.org'
        self.data = None  # combined logs
        # grouped_by_txn is a dictionary where the key is the txn hash and the
        # value is another dictionary mapping a server id to a list of log
        # items. Mainchain servers have a server id of `mainchain_#` and
        # sidechain servers have a server id of `sidechain_#`, where `#` is a
        # number.
        self.grouped_by_txn = None
        if not os.path.isdir(in_dir):
            eprint(f'The input {self.in_dir} must be an existing directory')
            sys.exit(1)
        if os.path.exists(self.out_dir):
            if not os.path.isdir(self.out_dir):
                eprint(
                    f'The output: {self.out_dir} exists and is not a directory'
                )
                sys.exit(1)
        else:
            os.makedirs(self.out_dir)

        self.combine_logs()
        with open(self.combined_logs_file_name) as f:
            self.data = json.load(f)
        self.grouped_by_txn = group_by_txn(self.data)
        # counts_by_txn_and_server is a dictionary where the key is the
        # txn_hash and the value is a pandas df with a row for every server
        # and a column for every message; each cell is a count of how many
        # times that message appears for that server.
        counts_by_txn_and_server = {}
        # dict where the key is a transaction hash and the value is the transaction
        hash_to_txn = {}
        # dict where the key is a transaction hash and the value is the earliest timestamp in a log file
        hash_to_timestamp = {}
        for txn_hash, server_dict in self.grouped_by_txn.items():
            message_set = set()
            # message list is ordered by when it appears in the log
            message_list = []
            for server_id, messages in server_dict.items():
                for m in messages:
                    try:
                        d = m['data']
                        if 'msg' in d and 'transaction' in d['msg']:
                            t = d['msg']['transaction']
                        elif 'tx_json' in d:
                            t = d['tx_json']
                        if t['hash'] == txn_hash:
                            hash_to_txn[txn_hash] = t
                    except Exception:
                        pass
                    msg = m['msg']
                    t = _to_timestamp(m['t'])
                    if txn_hash not in hash_to_timestamp:
                        hash_to_timestamp[txn_hash] = t
                    elif hash_to_timestamp[txn_hash] > t:
                        hash_to_timestamp[txn_hash] = t
                    if msg not in message_set:
                        message_set.add(msg)
                        message_list.append(msg)
            df = pd.DataFrame(0,
                              index=server_dict.keys(),
                              columns=message_list)
            for server_id, messages in server_dict.items():
                for m in messages:
                    df.loc[server_id, m['msg']] += 1
            counts_by_txn_and_server[txn_hash] = df
        # sort the transactions by timestamp, but put the txns with an unknown
        # timestamp at the beginning
        self.grouped_by_txn = _rekey_dict_by_txn_date(hash_to_timestamp,
                                                      self.grouped_by_txn)
        counts_by_txn_and_server = _rekey_dict_by_txn_date(
            hash_to_timestamp, counts_by_txn_and_server)

        with open(self.grouped_by_txn_file_name, 'w') as out:
            print(json.dumps(self.grouped_by_txn, indent=1), file=out)

        with open(self.counts_by_txn_and_server_file_name, 'w') as out:
            for txn_hash, df in counts_by_txn_and_server.items():
                print(f'\n\n* Txn: {txn_hash}', file=out)
                if txn_hash in hash_to_txn:
                    print(json.dumps(hash_to_txn[txn_hash], indent=1),
                          file=out)
                rename_dict = {}
                for column, renamed_column in zip(df.columns.array,
                                                  string.ascii_uppercase):
                    print(f'{renamed_column} = {column}', file=out)
                    rename_dict[column] = renamed_column
                df.rename(columns=rename_dict, inplace=True)
                print(f'\n{df}', file=out)
    def combine_logs(self):
        try:
            with open(self.combined_logs_file_name, "w") as out:
                log_analyzer.convert_all(self.in_dir, out, pure_json=True)
        except Exception as e:
            eprint(f'Exception: {e}')
            raise
def main(input_dir_name: str, output_dir_name: str):
    r = Report(input_dir_name, output_dir_name)
    # Values are a list of log lines formatted as json. There are five fields:
    # `t` is the timestamp.
    # `m` is the module.
    # `l` is the log level.
    # `msg` is the message.
    # `data` is the data.
    # For example:
    #
    # {
    #     "t": "2021-Oct-08 21:33:41.731371562 UTC",
    #     "m": "SidechainFederator",
    #     "l": "TRC",
    #     "msg": "no last xchain txn with result",
    #     "data": {
    #         "needsOtherChainLastXChainTxn": true,
    #         "isMainchain": false,
    #         "jlogId": 121
    #     }
    # },

    # Lifecycle of a transaction
    # For each federator record:
    #     Transaction detected: amount, seq, destination, chain, hash
    #     Signature received: hash, seq
    #     Signature sent: hash, seq, federator dst
    #     Transaction submitted
    #     Result received, and detect if error
    #     Detect any field that doesn't match
    # Lifecycle of initialization
    # Chain listener messages
def parse_args():
    parser = argparse.ArgumentParser(description=(
        'python script to generate a log report from a sidechain config directory structure containing the logs'
    ))
    parser.add_argument(
        '--input',
        '-i',
        help=('directory with sidechain config directory structure'),
    )
    parser.add_argument(
        '--output',
        '-o',
        help=('output directory for report files'),
    )
    return parser.parse_known_args()[0]
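# Example invocation (paths are illustrative):
#   ./log_report.py --input <sidechain_config_dir> --output <report_dir>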
if __name__ == '__main__':
    try:
        args = parse_args()
        main(args.input, args.output)
    except Exception as e:
        eprint(f'Exception: {e}')
        raise