diff --git a/README.md b/README.md index 7c8dec6..41b0ccd 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ Facebook has adopted a Code of Conduct that we expect project participants to ad ## The Team -GPU Cluster Monitoring is actively maintained by [Lucca Bertoncini](https://github.com/luccabb), [Caleb Ho](https://github.com/calebho), [Apostolos Kokolis](https://github.com/A-Kokolis), [Liao Hu](https://github.com/L1A0), [Thanh Nguyen](https://github.com/giongto35), [Billy Campoli](https://github.com/tooji) with a number of contributions coming from talented individuals (in no particular order, and non-exhaustive): [Jörg Doku](https://github.com/Jorghi12), [Vivian Peng](https://github.com/vzpeng), [Parth Malani](https://github.com/pmmalani), [Kalyan Saladi](https://github.com/skalyan), [Shubho Sengupta](https://github.com/shubho), [Leo Huang](https://github.com/lifeihuang), [Robert Vincent](https://github.com/bvincent-penguin), [Max Wang](https://github.com/mxw), [Sujit Verma](https://github.com/sujitoc), [Teng Li](https://github.com/teng-li), [James Taylor](https://github.com/jamestaylr), [Xiaodong Ma](https://github.com/xman1979), [Chris Henry](https://github.com/chenry3), [Jakob Johnson](https://github.com/jj10306), [Kareem Sakher](https://github.com/kjsakher), [Abinesh Ramakrishnan](https://github.com/ibanesh), [Nabib Ahmed](https://github.com/nahmed3536), [Yong Li](https://github.com/yonglimeta), [Junjie Qian](https://github.com/junjieqian), [David Watson](https://github.com/davidewatson), [Guanyu Wu](https://github.com/kwu-penguin), [Jaromir Latal](https://github.com/jermenkoo), [Samuel Doud](https://github.com/SamuelDoud), [Yidi Wu](https://github.com/ydwu4), [Xinyuan Zhang](https://github.com/xinyuanzzz), [Neha Saxena](https://github.com/nehasaxena210), [Gustavo Lima](https://github.com/gustcol). +GPU Cluster Monitoring is actively maintained by [Lucca Bertoncini](https://github.com/luccabb), [Caleb Ho](https://github.com/calebho), [Apostolos Kokolis](https://github.com/A-Kokolis), [Liao Hu](https://github.com/L1A0), [Thanh Nguyen](https://github.com/giongto35), [Billy Campoli](https://github.com/tooji) with a number of contributions coming from talented individuals (in no particular order, and non-exhaustive): [Jörg Doku](https://github.com/Jorghi12), [Vivian Peng](https://github.com/vzpeng), [Parth Malani](https://github.com/pmmalani), [Kalyan Saladi](https://github.com/skalyan), [Shubho Sengupta](https://github.com/shubho), [Leo Huang](https://github.com/lifeihuang), [Robert Vincent](https://github.com/bvincent-penguin), [Max Wang](https://github.com/mxw), [Sujit Verma](https://github.com/sujitoc), [Teng Li](https://github.com/teng-li), [James Taylor](https://github.com/jamestaylr), [Xiaodong Ma](https://github.com/xman1979), [Chris Henry](https://github.com/chenry3), [Jakob Johnson](https://github.com/jj10306), [Kareem Sakher](https://github.com/kjsakher), [Abinesh Ramakrishnan](https://github.com/ibanesh), [Nabib Ahmed](https://github.com/nahmed3536), [Yong Li](https://github.com/yonglimeta), [Junjie Qian](https://github.com/junjieqian), [David Watson](https://github.com/davidewatson), [Guanyu Wu](https://github.com/kwu-penguin), [Jaromir Latal](https://github.com/jermenkoo), [Samuel Doud](https://github.com/SamuelDoud), [Yidi Wu](https://github.com/ydwu4), [Xinyuan Zhang](https://github.com/xinyuanzzz), [Neha Saxena](https://github.com/nehasaxena210), [Gustavo Lima](https://github.com/gustcol), [Achintya Paningapalli](https://github.com/theap06). Feel free to contribute and add your name! diff --git a/gcm/exporters/file.py b/gcm/exporters/file.py index 3108046..a0898fc 100644 --- a/gcm/exporters/file.py +++ b/gcm/exporters/file.py @@ -1,24 +1,38 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +from __future__ import annotations + +import csv +import io import json +import logging import os from dataclasses import asdict -from typing import Callable, Optional, Tuple +from typing import Any, Callable, cast, Dict, Literal, Optional, Tuple, TYPE_CHECKING from gcm.exporters import register - +from gcm.monitoring.dataclass_utils import asdict_recursive from gcm.monitoring.meta_utils.scuba import to_scuba_message from gcm.monitoring.sink.protocol import DataIdentifier, SinkAdditionalParams - from gcm.monitoring.utils.monitor import init_logger from gcm.schemas.log import Log +if TYPE_CHECKING: + from _typeshed import DataclassInstance + split_path: Callable[[str], Tuple[str, str]] = lambda path: ( os.path.dirname(path), os.path.basename(path), ) +def _flatten_for_csv(payload: object) -> Dict[str, Any]: + """Flatten scuba message dict for CSV output.""" + scuba = asdict(to_scuba_message(cast("DataclassInstance", payload))) + flat = asdict_recursive(scuba) + return flat if isinstance(flat, dict) else {} + + @register("file") class File: """Write data to file.""" @@ -29,16 +43,24 @@ def __init__( file_path: Optional[str] = None, job_file_path: Optional[str] = None, node_file_path: Optional[str] = None, + format: Literal["json", "csv"] = "json", ): if all(path is None for path in [file_path, job_file_path, node_file_path]): raise Exception( "When using the file sink at least one file_path needs to be specified. See gcm %collector% --help" ) - self.data_identifier_to_logger_map = {} + self.format = format + self.data_identifier_to_logger_map: Dict[ + DataIdentifier, Optional[logging.Logger] + ] = {} + self._data_identifier_to_path: Dict[DataIdentifier, str] = {} + if self.format == "csv": + self._csv_header_written: Dict[str, bool] = {} if file_path is not None: file_directory, file_name = split_path(file_path) + self._data_identifier_to_path[DataIdentifier.GENERIC] = file_path self.data_identifier_to_logger_map[DataIdentifier.GENERIC], _ = init_logger( logger_name=__name__ + file_path, log_dir=file_directory, @@ -48,6 +70,7 @@ def __init__( if job_file_path is not None: file_directory, file_name = split_path(job_file_path) + self._data_identifier_to_path[DataIdentifier.JOB] = job_file_path self.data_identifier_to_logger_map[DataIdentifier.JOB], _ = init_logger( logger_name=__name__ + job_file_path, log_dir=file_directory, @@ -57,6 +80,7 @@ def __init__( if node_file_path is not None: file_directory, file_name = split_path(node_file_path) + self._data_identifier_to_path[DataIdentifier.NODE] = node_file_path self.data_identifier_to_logger_map[DataIdentifier.NODE], _ = init_logger( logger_name=__name__ + node_file_path, log_dir=file_directory, @@ -70,24 +94,58 @@ def write( additional_params: SinkAdditionalParams, ) -> None: - # update file path if data_identifier is present on additional_params - if additional_params.data_identifier: - data_identifier = additional_params.data_identifier - if data_identifier not in self.data_identifier_to_logger_map: + data_identifier = additional_params.data_identifier or DataIdentifier.GENERIC + if data_identifier not in self.data_identifier_to_logger_map: + raise AssertionError( + f"data_identifier value is unsupported on file sink: {data_identifier}" + ) + if self.data_identifier_to_logger_map[data_identifier] is None: + raise AssertionError( + f"The sink is missing a required param for the following data_identifier: {data_identifier}. See gcm %collector% --help" + ) + logger = self.data_identifier_to_logger_map[data_identifier] + assert logger is not None + + if self.format == "csv": + path = self._data_identifier_to_path.get(data_identifier) + if path is None: raise AssertionError( - f"data_identifier value is unsupported on file sink: {data_identifier}" + "CSV format requires data_identifier to match a configured path" ) - if self.data_identifier_to_logger_map[data_identifier] is None: - raise AssertionError( - f"The sink is missing a required param for the following data_identifier: {data_identifier}. See gcm %collector% --help" + records = [_flatten_for_csv(p) for p in data.message] + if not records: + return + all_keys = sorted({k for r in records for k in r.keys()}) + header_done = self._csv_header_written.get(path, False) + + if not header_done: + header_buf = io.StringIO() + header_writer = csv.DictWriter( + header_buf, + fieldnames=all_keys, + extrasaction="ignore", + lineterminator="", ) - logger = self.data_identifier_to_logger_map[data_identifier] - else: - logger = self.data_identifier_to_logger_map[DataIdentifier.GENERIC] + header_writer.writeheader() + logger.info(header_buf.getvalue()) + self._csv_header_written[path] = True - for payload in data.message: - # TODO: remove to_scuba_message once slurm_job_monitor migrates to OpenTelemetry exporter - logger.info(json.dumps(asdict(to_scuba_message(payload)))) + for record in records: + row_buf = io.StringIO() + row_writer = csv.DictWriter( + row_buf, + fieldnames=all_keys, + extrasaction="ignore", + lineterminator="", + ) + row_writer.writerow(record) + logger.info(row_buf.getvalue()) + elif self.format == "json": + for payload in data.message: + # TODO: remove to_scuba_message once slurm_job_monitor migrates to OpenTelemetry exporter + logger.info(json.dumps(asdict(to_scuba_message(payload)))) + else: + raise ValueError(f"Unsupported format: {self.format!r}") def shutdown(self) -> None: pass diff --git a/gcm/tests/test_file_exporter.py b/gcm/tests/test_file_exporter.py new file mode 100644 index 0000000..bac6668 --- /dev/null +++ b/gcm/tests/test_file_exporter.py @@ -0,0 +1,36 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +import csv +from dataclasses import dataclass +from pathlib import Path + +from gcm.exporters.file import File +from gcm.monitoring.sink.protocol import DataIdentifier, SinkAdditionalParams +from gcm.schemas.log import Log + + +@dataclass +class SamplePayload: + job_id: int + state: str + user: str + + +def test_file_exporter_csv(tmp_path: Path) -> None: + path = tmp_path / "data.csv" + sink = File(file_path=str(path), format="csv") + sink.write( + Log(ts=1000, message=[SamplePayload(job_id=1, state="RUNNING", user="alice")]), + SinkAdditionalParams(data_identifier=DataIdentifier.GENERIC), + ) + sink.write( + Log(ts=2000, message=[SamplePayload(job_id=2, state="PENDING", user="bob")]), + SinkAdditionalParams(data_identifier=DataIdentifier.GENERIC), + ) + with open(path) as f: + rows = list(csv.DictReader(f)) + assert len(rows) == 2 + assert rows[0]["normal.state"] == "RUNNING" + assert rows[0]["normal.user"] == "alice" + assert rows[1]["normal.state"] == "PENDING" + assert rows[1]["normal.user"] == "bob" diff --git a/website/docs/GCM_Health_Checks/exporters/file.md b/website/docs/GCM_Health_Checks/exporters/file.md index 8a8e122..38e7c87 100644 --- a/website/docs/GCM_Health_Checks/exporters/file.md +++ b/website/docs/GCM_Health_Checks/exporters/file.md @@ -13,6 +13,7 @@ The File exporter writes monitoring data and health check results to local file | Option | Required | Description | |--------|----------|-------------| | `file_path` | Yes | Path to the output file | +| `format` | No | `json` (default) or `csv` | ### Basic Usage diff --git a/website/docs/GCM_Monitoring/exporters/file.md b/website/docs/GCM_Monitoring/exporters/file.md index 8a8e122..99f7ae7 100644 --- a/website/docs/GCM_Monitoring/exporters/file.md +++ b/website/docs/GCM_Monitoring/exporters/file.md @@ -13,6 +13,7 @@ The File exporter writes monitoring data and health check results to local file | Option | Required | Description | |--------|----------|-------------| | `file_path` | Yes | Path to the output file | +| `format` | No | `json` (default) or `csv` | ### Basic Usage @@ -36,6 +37,8 @@ sink_opts = [ ## Output Format +### JSON (default) + Data is written as newline-delimited JSON (NDJSON), with each monitoring event on a separate line: ```json @@ -48,6 +51,16 @@ This format allows for: - Line-by-line processing with standard tools - Easy parsing with JSON libraries +### CSV + +Use `format=csv` for comma-separated output suitable for spreadsheets and offline analysis: + +```shell +gcm slurm_monitor --sink=file --sink-opt file_path=/var/log/gcm/data.csv --sink-opt format=csv --once +``` + +The first write adds a header row; subsequent writes append data rows. + ## Use Cases ### Production Monitoring