Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions gcm/exporters/telemetry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
"""Structured telemetry export (JSON/CSV) for offline analysis."""

from __future__ import annotations

import csv
import json
import os
from dataclasses import asdict
from datetime import datetime, timezone
from typing import cast, Literal, TYPE_CHECKING

from gcm.exporters import register
from gcm.monitoring.dataclass_utils import remove_none_dict_factory

if TYPE_CHECKING:
    from _typeshed import DataclassInstance
    from gcm.monitoring.sink.protocol import SinkAdditionalParams
    from gcm.schemas.log import Log


def _snapshot(ts: int, msg: object) -> dict:
    """Convert one metrics message into a flat dict with an ISO-8601 UTC timestamp.

    Args:
        ts: Unix epoch seconds of the snapshot.
        msg: A dataclass instance (one metrics record); ``None``-valued fields
            are dropped by ``remove_none_dict_factory``.

    Returns:
        The dataclass fields as a dict plus a ``"timestamp"`` key formatted
        as ``YYYY-MM-DDTHH:MM:SS`` in UTC.
    """
    d = asdict(
        cast("DataclassInstance", msg),
        dict_factory=remove_none_dict_factory,
    )
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # timezone-aware conversion below formats to the identical string.
    d["timestamp"] = datetime.fromtimestamp(ts, tz=timezone.utc).strftime(
        "%Y-%m-%dT%H:%M:%S"
    )
    return d


@register("telemetry")
class Telemetry:
    """Append telemetry snapshots to a file in JSON or CSV format.

    JSON output is NDJSON (one object per line); CSV output is a single
    header row followed by one row per record.
    """

    def __init__(
        self,
        *,
        file_path: str,
        format: Literal["json", "csv"] = "json",
    ) -> None:
        """
        Args:
            file_path: Path of the output file; missing parent directories
                are created on first write.
            format: Output encoding, ``"json"`` (default) or ``"csv"``.
        """
        self.file_path = file_path
        self.format = format
        # True once this instance has emitted (or confirmed an existing)
        # CSV header, so the header is written at most once per file.
        self._header_written = False

    def write(
        self,
        data: Log,
        additional_params: SinkAdditionalParams,
    ) -> None:
        """Append one snapshot per message in ``data`` to the output file.

        Args:
            data: Log carrying the snapshot epoch seconds (``ts``) and a
                list of dataclass messages (``message``).
            additional_params: Sink parameters (unused here; required by the
                sink protocol).
        """
        records = [_snapshot(data.ts, m) for m in data.message]
        if not records:
            return
        os.makedirs(os.path.dirname(self.file_path) or ".", exist_ok=True)
        if self.format == "json":
            with open(self.file_path, "a", encoding="utf-8") as f:
                for r in records:
                    f.write(json.dumps(r) + "\n")
        else:
            # newline="" is required by the csv module; without it CSV output
            # gains blank rows on platforms with \r\n line endings.
            with open(self.file_path, "a", encoding="utf-8", newline="") as f:
                all_keys = ["timestamp"] + sorted(
                    {k for r in records for k in r} - {"timestamp"}
                )
                w = csv.DictWriter(f, fieldnames=all_keys, extrasaction="ignore")
                if not self._header_written:
                    # f.tell() == 0 means the file is empty and still needs a
                    # header; a non-empty file (e.g. left by a previous run)
                    # already has one and must not get a duplicate mid-file.
                    if f.tell() == 0:
                        w.writeheader()
                    self._header_written = True
                w.writerows(records)
85 changes: 85 additions & 0 deletions gcm/tests/test_telemetry_exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
import csv
import json
from pathlib import Path

from gcm.exporters.telemetry import Telemetry
from gcm.monitoring.sink.protocol import DataType, SinkAdditionalParams
from gcm.schemas.device_metrics import DevicePlusJobMetrics
from gcm.schemas.log import Log


def test_telemetry_json(tmp_path: Path) -> None:
    """A single JSON-mode write produces exactly one NDJSON line with all fields."""
    out_file = tmp_path / "telemetry.json"
    exporter = Telemetry(file_path=str(out_file), format="json")
    metrics = DevicePlusJobMetrics(
        gpu_id=3,
        hostname="node-42",
        job_id=91283,
        job_user="research_team",
        gpu_util=88,
        mem_used_percent=90,
        temperature=78,
        power_draw=310,
        retired_pages_count_single_bit=0,
        retired_pages_count_double_bit=0,
    )
    exporter.write(
        Log(ts=1741114282, message=[metrics]),
        SinkAdditionalParams(data_type=DataType.LOG),
    )
    contents = out_file.read_text().strip().splitlines()
    assert len(contents) == 1
    record = json.loads(contents[0])
    expected = {
        "timestamp": "2025-03-04T18:51:22",  # UTC for ts=1741114282
        "hostname": "node-42",
        "gpu_id": 3,
        "job_id": 91283,
        "job_user": "research_team",
        "gpu_util": 88,
        "temperature": 78,
        "power_draw": 310,
    }
    for key, value in expected.items():
        assert record[key] == value

def test_telemetry_csv(tmp_path: Path) -> None:
    """A single CSV-mode write produces a header plus one row of string values."""
    out_file = tmp_path / "telemetry.csv"
    exporter = Telemetry(file_path=str(out_file), format="csv")
    metrics = DevicePlusJobMetrics(
        gpu_id=0,
        hostname="node-1",
        job_id=100,
        job_user="user",
        gpu_util=50,
        temperature=65,
        power_draw=200,
    )
    exporter.write(
        Log(ts=1741114282, message=[metrics]),
        SinkAdditionalParams(data_type=DataType.LOG),
    )
    with open(out_file) as handle:
        rows = list(csv.DictReader(handle))
    assert len(rows) == 1
    row = rows[0]
    assert row["timestamp"] == "2025-03-04T18:51:22"  # UTC for ts=1741114282
    assert row["hostname"] == "node-1"
    # csv round-trips every value as a string.
    assert row["gpu_id"] == "0"
    assert row["gpu_util"] == "50"


def test_telemetry_csv_append(tmp_path: Path) -> None:
    """Consecutive writes append rows while emitting the CSV header only once."""
    out_file = tmp_path / "telemetry.csv"
    exporter = Telemetry(file_path=str(out_file), format="csv")
    metrics = DevicePlusJobMetrics(gpu_id=0, hostname="n1", gpu_util=10)
    for stamp in (1000, 2000):
        exporter.write(
            Log(ts=stamp, message=[metrics]),
            SinkAdditionalParams(data_type=DataType.LOG),
        )
    with open(out_file) as handle:
        rows = list(csv.DictReader(handle))
    assert len(rows) == 2
    assert [r["timestamp"] for r in rows] == [
        "1970-01-01T00:16:40",
        "1970-01-01T00:33:20",
    ]
1 change: 1 addition & 0 deletions website/docs/GCM_Health_Checks/exporters/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ GCM includes several built-in exporters for different use cases:
| [Graph API](graph_api.md) | `graph_api` | Meta's internal backends |
| [OpenTelemetry](otel.md) | `otel` | OTLP-compatible backends |
| [Stdout](stdout.md) | `stdout` | Terminal output |
| [Telemetry](telemetry.md) | `telemetry` | Structured JSON/CSV for offline analysis |
| [Webhook](webhook.md) | `webhook` | HTTP endpoint forwarding |

## Plugin System
Expand Down
7 changes: 7 additions & 0 deletions website/docs/GCM_Health_Checks/exporters/telemetry.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
sidebar_position: 7
---

# Telemetry

The Telemetry exporter appends structured telemetry snapshots to a local file in JSON or CSV format for offline analysis. See [GCM Monitoring Telemetry exporter](../../GCM_Monitoring/exporters/telemetry.md) for full documentation.
1 change: 1 addition & 0 deletions website/docs/GCM_Monitoring/exporters/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ GCM includes several built-in exporters for different use cases:
| [Graph API](graph_api.md) | `graph_api` | Meta's internal backends |
| [OpenTelemetry](otel.md) | `otel` | OTLP-compatible backends |
| [Stdout](stdout.md) | `stdout` | Terminal output |
| [Telemetry](telemetry.md) | `telemetry` | Structured JSON/CSV for offline analysis |
| [Webhook](webhook.md) | `webhook` | HTTP endpoint forwarding |

## Plugin System
Expand Down
32 changes: 32 additions & 0 deletions website/docs/GCM_Monitoring/exporters/telemetry.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
sidebar_position: 7
---

# Telemetry

The Telemetry exporter appends structured telemetry snapshots to a local file in JSON or CSV format for offline analysis.

## Configuration

| Option | Required | Description |
|--------|----------|-------------|
| `file_path` | Yes | Path to the output file |
| `format` | No | `json` (default) or `csv` |

## Usage

```shell
# JSON (NDJSON, one object per line)
gcm nvml_monitor --sink=telemetry --sink-opt file_path=/var/log/gcm/telemetry.json --once

# CSV
gcm nvml_monitor --sink=telemetry --sink-opt file_path=/var/log/gcm/telemetry.csv --sink-opt format=csv --once
```

## Output

Each write adds an ISO-8601 UTC `timestamp` field to every record and emits one record per GPU message. Example JSON:

```json
{"timestamp": "2026-03-04T21:31:22", "hostname": "node-42", "gpu_id": 3, "job_id": 91283, "job_user": "research_team", "gpu_util": 88, "mem_used_percent": 71, "temperature": 78, "power_draw": 310, "retired_pages_count_single_bit": 0, "retired_pages_count_double_bit": 0}
```
Loading