verl-project · tardis-key · Mar 26, 2026 · Mar 20, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/.github/workflows/check-pr-title.yml → .github/workflows/check_pr_title.yml b/.github/workflows/check-pr-title.yml → .github/workflows/check_pr_title.yml
@@ -27,13 +27,14 @@
 #     - new workflow yaml is added to `.github/workflows`
 #     - new tests are added to workflow mentioned in 2.
 
+name: check_pr_title
 
 on:
   pull_request:
     types: [opened, edited, synchronize]
 
 jobs:
-  check-title:
+  check_title:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout code

diff --git a/.github/workflows/pre-commit.yml → .github/workflows/pre_commit.yml b/.github/workflows/pre-commit.yml → .github/workflows/pre_commit.yml
@@ -1,5 +1,5 @@
 # c.f. https://github.com/pre-commit/action?tab=readme-ov-file#using-this-action
-name: pre-commit
+name: pre_commit
 
 # No need to avoid / cancel lightweight pre-commit jobs
 on:
@@ -18,7 +18,7 @@ permissions:
   contents: read
 
 jobs:
-  pre-commit:
+  pre_commit:
     runs-on: ubuntu-latest
     strategy:
       matrix:

diff --git a/.github/workflows/special_e2e.yml b/.github/workflows/special_e2e.yml
@@ -1,4 +1,4 @@
-name: profiling_data_analysis_st
+name: special_e2e
 
 on:
   push:
@@ -22,7 +22,7 @@ permissions:
   contents: read
 
 jobs:
-  profiling_data_analysis_st:
+  special_e2e:
     runs-on: ubuntu-latest
     timeout-minutes: 5
     strategy:
@@ -42,6 +42,6 @@ jobs:
           pip install -r requirements.txt
           pip install -e .
 
-      - name: Run profiling_data_analysis_st tests
+      - name: Run rl-insight e2e tests
         run: |
           pytest -s -x tests/special_e2e
diff --git a/.github/workflows/cluster_analysis.yml → .github/workflows/unit_test.yml b/.github/workflows/cluster_analysis.yml → .github/workflows/unit_test.yml
@@ -1,4 +1,4 @@
-name: cluster_analyse
+name: unit_test
 
 on:
   push:
@@ -11,8 +11,8 @@ on:
       - v0.*
     paths:
       - "**/*.py"
-      - .github/workflows/cluster_analysis.yml
-      - "tests/cluster_analysis/**"
+      - .github/workflows/unit_test.yml
+      - "tests/**"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -22,7 +22,7 @@ permissions:
   contents: read
 
 jobs:
-  cluster_analyse:
+  unit_test:
     runs-on: ubuntu-latest
     timeout-minutes: 5
     strategy:
@@ -42,6 +42,10 @@ jobs:
           pip install -r requirements.txt
           pip install -e .
 
-      - name: Run cluster_analyse tests
+      - name: Run parser tests
         run: |
-          pytest -s -x tests/cluster_analysis
+          pytest -s -x tests/parser
+
+      - name: Run data_checker tests
+        run: |
+          pytest -s -x tests/data
diff --git a/data/base.py b/data/base.py
diff --git a/data/multi_json.py b/data/multi_json.py
diff --git a/data/summary_event.py b/data/summary_event.py
diff --git a/data/verl_log.py b/data/verl_log.py
diff --git a/docs/cluster_analysis.md b/docs/cluster_analysis.md
@@ -17,6 +17,7 @@ RL-Insight 是一个强化学习性能数据快速分析的可视化工具，基
 - Pandas
 - Plotly
 - NumPy
+- Loguru
 
 ## 二、快速使用
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,7 @@ dependencies = [
   "pandas",
   "plotly",
   "pytest",
+  "loguru"
 ]
 
 [project.urls]

diff --git a/requirements.txt b/requirements.txt
@@ -2,4 +2,5 @@
 numpy<2.0.0
 pandas
 plotly
-pytest
+pytest
+loguru
diff --git a/tests/cluster_analysis/__init__.py → rl_insight/data/__init__.py b/tests/cluster_analysis/__init__.py → rl_insight/data/__init__.py
@@ -11,3 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+"""Data module for RL-Insight."""
+
+from .data_checker import DataChecker, DataEnum
+
+__all__ = [
+    "DataChecker",
+    "DataEnum",
+]
diff --git a/rl_insight/data/data_checker.py b/rl_insight/data/data_checker.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2025 verl-project authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Base data definitions for RL-Insight."""
+
+from typing import Any, List
+from .rules import ValidationRule, PathExistsRule, DataValidationError
+from enum import Enum
+from loguru import logger
+
+
+class DataEnum(Enum):
+    """Identifiers for the kinds of data handled by RL-Insight.
+
+    Each member's value is the lowercase string form of its name, so a
+    member can be looked up from that string with ``DataEnum(value)``.
+    """
+
+    # Input data types consumed by a parser.
+    MULTI_JSON = "multi_json"
+    VERL_LOG = "verl_log"
+    # Output data type of a parser, which is the input data type of a visualizer.
+    SUMMARY_EVENT = "summary_event"
+    # Any other data type.
+    UNKNOWN = "unknown"
+
+
+class DataChecker:
+    """Validate a data object against the rules registered for its data type.
+
+    ``rules`` is a class attribute mapping each ``DataEnum`` member to the
+    list of ``ValidationRule`` instances that must all pass. The rule
+    instances live on the class, so they are shared by every checker.
+    """
+
+    rules: dict[DataEnum, List[ValidationRule]] = {
+        DataEnum.MULTI_JSON: [PathExistsRule()],
+        DataEnum.VERL_LOG: [],
+        DataEnum.SUMMARY_EVENT: [],
+        DataEnum.UNKNOWN: [],
+    }
+
+    def __init__(self, data_type: DataEnum, data: Any):
+        """Store what to validate; no validation happens until ``run()``.
+
+        Args:
+            data_type: A ``DataEnum`` member, or the string value of one
+                (for example ``"multi_json"``).
+            data: The object passed to each rule's ``check`` method.
+        """
+        self.data_type = data_type
+        self.data = data
+
+    def run(self):
+        """Run every rule registered for the data type against ``self.data``.
+
+        Raises:
+            ValueError: If the data type is not a ``DataEnum`` member (or the
+                value of one), or has no entry in ``rules``.
+            DataValidationError: If at least one rule fails. Its ``errors``
+                list holds the message of every failed rule.
+        """
+        try:
+            # Lookup by value returns a member unchanged and maps a string
+            # such as "multi_json" (e.g. from a CLI option) to its member.
+            data_type = DataEnum(self.data_type)
+        except ValueError:
+            raise ValueError(f"Invalid data type: {self.data_type}") from None
+        if data_type not in self.rules:
+            raise ValueError(f"Invalid data type: {self.data_type}")
+        errors = []
+        # Run all rules instead of stopping at the first failure so the
+        # raised error reports every problem at once.
+        for rule in self.rules[data_type]:
+            if not rule.check(self.data):
+                errors.append(rule.error_message)
+        if errors:
+            raise DataValidationError("Data validation failed", errors)
+        logger.info(f"Data validation passed for {data_type}")
diff --git a/rl_insight/data/rules.py b/rl_insight/data/rules.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2025 verl-project authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Any
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Optional
+
+
+class DataValidationError(Exception):
+    """Raised when data fails validation.
+
+    The individual failure messages are kept on ``errors`` and are appended
+    to the string form of the exception as a bulleted list.
+    """
+
+    def __init__(self, message: str, errors: Optional[List[str]] = None):
+        super().__init__(message)
+        # A missing or empty list is replaced by a fresh empty list.
+        self.errors = errors if errors else []
+
+    def __str__(self) -> str:
+        headline = super().__str__()
+        if not self.errors:
+            return headline
+        # One "\n  - <message>" bullet per collected error.
+        bullets = "".join("\n  - " + item for item in self.errors)
+        return headline + bullets
+
+
+class ValidationRule(ABC):
+    """Abstract base class for a single validation check.
+
+    Subclasses implement ``check`` and store a description of the failure
+    in ``_error_message``, which is exposed read-only as ``error_message``.
+    """
+
+    def __init__(self):
+        # Empty until a subclass records a failure message.
+        self._error_message: str = ""
+
+    @property
+    def error_message(self) -> str:
+        """The message currently stored in ``_error_message``."""
+        return self._error_message
+
+    @abstractmethod
+    def check(self, data) -> bool:
+        """Return whether ``data`` satisfies this rule."""
+
+
+class PathExistsRule(ValidationRule):
+    """Rule that passes when the data is a path to an existing directory."""
+
+    def check(self, data: Any) -> bool:
+        """Return True if ``data`` names an existing directory.
+
+        Args:
+            data: A ``str`` or ``pathlib.Path``. Any other type fails the rule.
+
+        Returns:
+            True when the path is an existing directory. Otherwise False,
+            with the reason available through ``error_message``.
+        """
+        # A rule instance can be reused for several checks, so drop any
+        # message left behind by an earlier failure.
+        self._error_message = ""
+        if not isinstance(data, (str, Path)):
+            self._error_message = "Data object is not a path"
+            return False
+        if data == "":
+            # Path("") resolves to the current directory, so an empty
+            # string would otherwise be accepted as a valid source path.
+            self._error_message = "Source path is empty"
+            return False
+        try:
+            if not Path(data).is_dir():
+                self._error_message = (
+                    f"Source path is not a directory or does not exist: {data}"
+                )
+                return False
+        except (TypeError, OSError) as e:
+            # is_dir() lets some OS errors through (e.g. permission denied);
+            # report them as a validation failure instead of raising.
+            self._error_message = f"Error checking path {data}: {e}"
+            return False
+        return True
diff --git a/rl_insight/main.py b/rl_insight/main.py
@@ -29,12 +29,17 @@ def run_pipeline(config, pipeline_class=None):
 def main():
     arg_parser = argparse.ArgumentParser(description="Cluster scheduling visualization")
     arg_parser.add_argument(
-        "--input-path", default="test", help="Raw path of profiling data"
+        "--input-path", required=True, help="Raw path of profiling data"
+    )
+    arg_parser.add_argument(
+        "--input-type",
+        default="multi_json",
+        help="Input data type. Supported: 'multi_json' (for nvtx/mstx/torch_profile from different directories).",
     )
     arg_parser.add_argument(
         "--profiler-type", default="mstx", help="Profiler type, supported mstx/nvtx"
     )
-    arg_parser.add_argument("--output-path", default="test", help="Output path")
+    arg_parser.add_argument("--output-path", default="output", help="Output path")
     arg_parser.add_argument(
         "--vis-type", default="html", help="Visualization type, supported html"
     )

diff --git a/rl_insight/parser/mstx_parser.py b/rl_insight/parser/mstx_parser.py
@@ -13,21 +13,14 @@
 # limitations under the License.
 
 import json
-import logging
+from loguru import logger
 import os
 from collections import defaultdict
 from pathlib import Path
 
 from .parser import BaseClusterParser, register_cluster_parser
 from rl_insight.utils.schema import Constant, DataMap, EventRow
 
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-    handlers=[logging.StreamHandler()],
-)
-logger = logging.getLogger(__name__)
-
 
 @register_cluster_parser("mstx")
 class MstxClusterParser(BaseClusterParser):
-Original file line number
+Diff line change
@@ Expand Up @@
     - Pandas
     - Plotly
     - NumPy
+    - Loguru
     ## 二、快速使用
@@ Expand Down @@