diff --git a/.github/scripts/otel-naming/README.md b/.github/scripts/otel-naming/README.md index 5f185cec21..b68e661628 100644 --- a/.github/scripts/otel-naming/README.md +++ b/.github/scripts/otel-naming/README.md @@ -33,15 +33,15 @@ hardcoded allowlist: ### Rules (each fails the build, when its inputs are present) -| Rule | Check | -| ---- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| A | No stray dotted span-attribute key (only the derived resource keys may be dotted). | -| G | Attribute keys are `lower_snake_case` (`^[a-z][a-z0-9_]*$` per dot-segment) — no camelCase, UPPERCASE, or spaces. | -| F | No string literals as attribute keys or span-name arguments in `setAttribute`/`addEvent`/`span`/`childSpan`. Attribute _values_ are exempt (runtime data); `*SpanNames.h` definitions and test files are exempt. | -| B | Every collector `spanmetrics.dimensions` name exists in the L1 key set. | -| C | Every Tempo span-filter tag exists in the L1 key set. | -| D | Every dashboard PromQL label (non-builtin) exists in the L1 key set. | -| E | No dotted `xrpl..` attribute key in the runbook (only the L1 resource attrs `xrpl.network.*` may be dotted). Span names, filenames, OTel-standard keys, and metric labels are not flagged. | +| Rule | Check | +| ---- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| A | No stray dotted span-attribute key (only the derived resource keys may be dotted). | +| G | Attribute keys are `lower_snake_case` (`^[a-z][a-z0-9_]*$` per dot-segment) — no camelCase, UPPERCASE, or spaces. | +| F | No string literals as attribute keys or span-name arguments in `setAttribute`/`addEvent`/`span`/`childSpan`. Attribute _values_ are exempt (runtime data); `*SpanNames.h` definitions and test files are exempt. | +| B | Every collector `spanmetrics.dimensions` name exists in the L1 key set. | +| C | Every Tempo span-filter tag exists in the L1 key set. | +| D | Every dashboard label resolves to an L1 span attribute, a native-metric label (L6, emitted by MetricsRegistry), or a Prometheus/Grafana builtin. TraceQL scope prefixes (`span.`/`resource.`/…) are stripped before the L1 lookup. | +| E | No dotted `xrpl..` attribute key in the runbook (only the L1 resource attrs `xrpl.network.*` may be dotted). Span names, filenames, OTel-standard keys, and metric labels are not flagged. | Rule F runs **unconditionally** (it is a purely syntactic check on the call-sites and needs no `*SpanNames.h`), so a code path that calls diff --git a/.github/scripts/otel-naming/check_otel_naming.py b/.github/scripts/otel-naming/check_otel_naming.py index 43746d5692..5558eaf95c 100644 --- a/.github/scripts/otel-naming/check_otel_naming.py +++ b/.github/scripts/otel-naming/check_otel_naming.py @@ -36,6 +36,8 @@ Layers L3 tempo : docker/telemetry/tempo.yaml (span filter tags) L4 dashboards: docker/telemetry/grafana/dashboards/*.json (PromQL labels) L5 runbook : docs/telemetry-runbook.md (attr tables) + L6 metrics : MetricsRegistry.cpp instrument labels (native-metric + label keys, a valid dashboard-label source besides L1) Rules (each FAILS the build, when its inputs are present) --------------------------------------------------------- @@ -50,7 +52,9 @@ Rules (each FAILS the build, when its inputs are present) test files are exempt (they pass arbitrary literals to exercise the API). B Every collector spanmetrics dimension exists in the L1 key set. C Every tempo span-filter tag exists in the L1 key set. - D Every dashboard PromQL label (non-builtin) exists in the L1 key set. + D Every dashboard label resolves to an L1 span attribute, an L6 + native-metric label, or a builtin. TraceQL `span.`/`resource.` scope + prefixes are stripped before the L1 lookup. E No dotted `xrpl..` attribute key in the runbook (only the L1 resource attrs xrpl.network.* may be dotted). Span names, filenames, OTel-standard keys, and metric labels are not flagged. @@ -131,6 +135,13 @@ STRING_LITERAL = re.compile(r'"((?:[^"\\]|\\.)*)"') # A C++ line comment (`//` ... end of line) and a block comment (`/* ... */`). LINE_COMMENT = re.compile(r"//[^\n]*") BLOCK_COMMENT = re.compile(r"/\*.*?\*/", re.DOTALL) +# A TraceQL scope prefix on a label (`span.`, `resource.`, `event.`, etc.). +# Dashboards reference span attributes in TraceQL as `span.`; the bare +# attribute is what must exist in L1, so strip the scope before validating. +TRACEQL_SCOPE = re.compile(r"^(?:span|resource|event|link|instrumentation_scope)\.") +# An OTel metric label key as emitted in C++: `Add(.., {{"label", ...}})` / +# `{{"label", value}}` instrument calls in MetricsRegistry. +METRIC_LABEL = re.compile(r'\{\{\s*"([a-z_][a-z0-9_]*)"\s*,') def strip_comments(text: str) -> str: @@ -410,9 +421,14 @@ def main() -> None: run_rule_f(root, report, header_symbols) # --- Cross-layer rules B/C/D/E (each presence-gated) ------------------- + # L6 native-metric labels: span attributes are not the only valid dashboard + # labels — the MetricsRegistry emits OTel metrics whose label keys are an + # additional source of truth. Derive them dynamically (same principle as L1) + # so dashboards may reference them without tripping Rule D. + metric_labels = metric_label_names(root) run_rule_b_collector(root, l1_keys, report) run_rule_c_tempo(root, l1_keys, report) - run_rule_d_dashboards(root, l1_keys, report) + run_rule_d_dashboards(root, l1_keys, metric_labels, report) run_rule_e_runbook(root, l1_keys, report) report.render_and_exit() @@ -689,7 +705,25 @@ def run_rule_c_tempo(root: Path, l1_keys: Set[str], report: Report) -> None: report.ok(f"C: {len(span_tags)} tempo tag(s) all in L1") -def run_rule_d_dashboards(root: Path, l1_keys: Set[str], report: Report) -> None: +def metric_label_names(root: Path) -> Set[str]: + """L6: OTel native-metric label keys emitted by the telemetry code, e.g. + `counter->Add(1, {{"job_type", value}})` in MetricsRegistry.cpp. These are + a valid source of dashboard labels distinct from span attributes (L1).""" + labels: Set[str] = set() + for base in ("src", "include"): + for p in (root / base).rglob("*.cpp"): + if not p.is_file(): + continue + text = read_source(p) + if "MetricsRegistry" not in p.name and "metric" not in text.lower(): + continue + labels |= set(METRIC_LABEL.findall(text)) + return labels + + +def run_rule_d_dashboards( + root: Path, l1_keys: Set[str], metric_labels: Set[str], report: Report +) -> None: dash_dir = root / "docker" / "telemetry" / "grafana" / "dashboards" files = sorted(dash_dir.glob("*.json")) if dash_dir.is_dir() else [] if not files: @@ -710,6 +744,9 @@ def run_rule_d_dashboards(root: Path, l1_keys: Set[str], report: Report) -> None "job", "instance", } + # A dashboard label is valid if it is a span attribute (L1), a native-metric + # label (L6), or a Prometheus/Grafana builtin. + valid = l1_keys | metric_labels | builtins found = False for f in files: try: @@ -720,17 +757,23 @@ def run_rule_d_dashboards(root: Path, l1_keys: Set[str], report: Report) -> None labels: Set[str] = set() for m in re.finditer(r"by\s*\(([^)]*)\)", text): labels |= {x.strip() for x in m.group(1).split(",") if x.strip()} - for m in re.finditer(r"\b([a-z_][a-z0-9_]*)\s*[=!]~?\s*\"", text): + for m in re.finditer(r"\b([a-z_][a-z0-9_.]*)\s*[=!]~?\s*\"", text): labels.add(m.group(1)) for lbl in sorted(labels): - if lbl in builtins or lbl in l1_keys: + # Strip a TraceQL scope prefix (span./resource./...) — the bare + # attribute is what must resolve against L1. + bare = TRACEQL_SCOPE.sub("", lbl) + if bare in valid: continue found = True report.violation( - "D", str(f.relative_to(root)), lbl, "must exist in L1 or builtin" + "D", + str(f.relative_to(root)), + lbl, + "must exist in L1, a metric label, or be a builtin", ) if not found: - report.ok(f"D: dashboard PromQL labels all in L1 ({len(files)} file(s))") + report.ok(f"D: dashboard PromQL labels all resolve ({len(files)} file(s))") def run_rule_e_runbook(root: Path, l1_keys: Set[str], report: Report) -> None: diff --git a/.github/scripts/otel-naming/test_check_otel_naming.py b/.github/scripts/otel-naming/test_check_otel_naming.py index 8254993e7e..7504a62ea0 100644 --- a/.github/scripts/otel-naming/test_check_otel_naming.py +++ b/.github/scripts/otel-naming/test_check_otel_naming.py @@ -477,7 +477,7 @@ class RuleBCollector(unittest.TestCase): class RuleDDashboards(unittest.TestCase): - def _run(self, json_text, l1): + def _run(self, json_text, l1, metric_labels=frozenset()): d = Path(tempfile.mkdtemp()) try: _write( @@ -485,7 +485,7 @@ class RuleDDashboards(unittest.TestCase): json_text, ) report = chk.Report() - chk.run_rule_d_dashboards(d, set(l1), report) + chk.run_rule_d_dashboards(d, set(l1), set(metric_labels), report) return sorted(v[2] for v in report.violations) finally: shutil.rmtree(d) @@ -513,6 +513,72 @@ class RuleDDashboards(unittest.TestCase): def test_l1_label_passes(self): self.assertEqual(self._run('"q": "{command=\\"x\\"}"', {"command"}), []) + def test_traceql_span_prefix_stripped(self): + # `span.establish_count` must validate against the bare L1 key. + self.assertEqual( + self._run( + '"expr": "count_over_time(x) by (span.establish_count)"', + {"establish_count"}, + ), + [], + ) + + def test_traceql_resource_prefix_stripped(self): + self.assertEqual(self._run('"q": "{resource.service_name=\\"x\\"}"', set()), []) + + def test_native_metric_label_passes(self): + # `job_type` / `reason` are emitted by MetricsRegistry, not span attrs. + self.assertEqual( + self._run( + '"expr": "sum by (job_type, reason) (x)"', + {"command"}, + metric_labels={"job_type", "reason"}, + ), + [], + ) + + def test_unknown_label_still_flagged_with_metric_labels(self): + # A label that is neither L1, metric label, nor builtin still fails. + self.assertEqual( + self._run( + '"expr": "sum by (bogus) (x)"', + {"command"}, + metric_labels={"job_type"}, + ), + ["bogus"], + ) + + def test_span_prefixed_unknown_still_flagged(self): + # `span.not_a_key` whose bare form is unknown is still a violation. + self.assertEqual( + self._run('"expr": "x by (span.not_a_key)"', {"command"}), + ["span.not_a_key"], + ) + + +class MetricLabelExtraction(unittest.TestCase): + """L6: native-metric label keys parsed from C++ instrument calls.""" + + def test_extracts_add_label(self): + d = Path(tempfile.mkdtemp()) + try: + _write( + d / "src" / "xrpld" / "telemetry" / "MetricsRegistry.cpp", + 'counter->Add(1, {{"job_type", std::string(jobType)}});\n' + 'c2->Add(1, {{"reason", std::string(r)}});\n', + ) + self.assertEqual(chk.metric_label_names(d), {"job_type", "reason"}) + finally: + shutil.rmtree(d) + + def test_no_metrics_file_empty(self): + d = Path(tempfile.mkdtemp()) + try: + (d / "src").mkdir() + self.assertEqual(chk.metric_label_names(d), set()) + finally: + shutil.rmtree(d) + class ReportExitContract(unittest.TestCase): @staticmethod