diff --git a/.github/workflows/cluster_analysis.yml b/.github/workflows/cluster_analysis.yml index 7eade20..6845171 100644 --- a/.github/workflows/cluster_analysis.yml +++ b/.github/workflows/cluster_analysis.yml @@ -44,4 +44,4 @@ jobs: - name: Run cluster_analyse tests run: | - pytest -s -x tests/cluster_analysis \ No newline at end of file + pytest -s -x tests/cluster_analysis diff --git a/.github/workflows/special_e2e.yml b/.github/workflows/special_e2e.yml new file mode 100644 index 0000000..28cb80e --- /dev/null +++ b/.github/workflows/special_e2e.yml @@ -0,0 +1,47 @@ +name: profiling_data_analysis_st + +on: + push: + branches: + - main + - v0.* + pull_request: + branches: + - main + - v0.* + paths: + - "**/*.py" + - .github/workflows/special_e2e.yml + - "tests/special_e2e/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +permissions: + contents: read + +jobs: + profiling_data_analysis_st: + runs-on: ubuntu-latest + timeout-minutes: 5 + strategy: + matrix: + python-version: ["3.11"] + steps: + - name: Checkout code + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install -r requirements.txt + pip install -e . 
+ + - name: Run profiling_data_analysis_st tests + run: | + pytest -s -x tests/special_e2e diff --git a/data/mstx_data/mstx_profile/actor_compute_log_prob/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json b/data/mstx_data/mstx_profile/actor_compute_log_prob/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json new file mode 100644 index 0000000..e9b7563 --- /dev/null +++ b/data/mstx_data/mstx_profile/actor_compute_log_prob/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json @@ -0,0 +1,101 @@ +[ + { + "name": "process_name", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "name": "Overlap Analysis" + } + }, + { + "name": "process_labels", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "labels": "NPU 0" + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "name": "Communication" + } + }, + { + "name": "thread_sort_index", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "sort_index": 0 + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 1, + "ph": "M", + "args": { + "name": "Communication(Not Overlapped)" + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 2, + "ph": "M", + "args": { + "name": "Computing" + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 3, + "ph": "M", + "args": { + "name": "Free" + } + }, + { + "name": "Computing", + "pid": 3550586784, + "tid": 2, + "ts": "1773285899055563.748", + "dur": 53.301, + "ph": "X", + "args": {} + }, + { + "name": "Communication", + "pid": 3550586784, + "tid": 0, + "ts": "1773285899057336.304", + "dur": 52269.524, + "ph": "X", + "args": {} + }, + { + "name": "Communication(Not Overlapped)", + "pid": 3550586784, + "tid": 1, + "ts": "1773285899057336.304", + "dur": 52269.524, + "ph": "X", + "args": {} + }, + { + "name": "Free", + "pid": 3550586784, + "tid": 3, + "ts": "1773285899049161.720", + "dur": 6402.028, + "ph": "X", + "args": {} + } +] \ No newline at end of file diff --git 
a/data/mstx_data/mstx_profile/actor_compute_log_prob/xxx_ascend_pt/profiler_info_0.json b/data/mstx_data/mstx_profile/actor_compute_log_prob/xxx_ascend_pt/profiler_info_0.json new file mode 100644 index 0000000..9b8c832 --- /dev/null +++ b/data/mstx_data/mstx_profile/actor_compute_log_prob/xxx_ascend_pt/profiler_info_0.json @@ -0,0 +1,50 @@ +{ + "config": { + "common_config": { + "activities": [ + "ProfilerActivity.CPU", + "ProfilerActivity.NPU" + ], + "schedule": {}, + "record_shapes": false, + "profile_memory": false, + "with_stack": false, + "with_flops": false, + "with_modules": false + }, + "experimental_config": { + "_profiler_level": "Level1", + "_aic_metrics": "ACL_AICORE_PIPE_UTILIZATION", + "_l2_cache": false, + "_msprof_tx": true, + "_mstx": false, + "_data_simplification": true, + "record_op_args": false, + "_export_type": [ + "db" + ], + "_host_sys": [], + "_op_attr": false, + "_gc_detect_threshold": null, + "_mstx_domain_include": [], + "_mstx_domain_exclude": [ + "communication" + ], + "_sys_io": false, + "_sys_interconnection": false + } + }, + "start_info": { + "syscnt_enable": true, + "freq": 100, + "start_cnt": 238468094663174, + "start_monotonic": 2384513312565430 + }, + "end_info": { + "collectionTimeEnd": 1773285903061065750, + "MonotonicTimeEnd": 2384517398234200 + }, + "torch_npu_version": "2.7.1", + "cann_version": "8.3.RC1", + "rank_id": "0" +} \ No newline at end of file diff --git a/data/mstx_data/mstx_profile/actor_update/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json b/data/mstx_data/mstx_profile/actor_update/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json new file mode 100644 index 0000000..435c18a --- /dev/null +++ b/data/mstx_data/mstx_profile/actor_update/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json @@ -0,0 +1,137 @@ +[ + { + "name": "process_name", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "name": "Overlap Analysis" + } + }, + { + "name": "process_labels", + "pid": 3550586784, + "tid": 0, + 
"ph": "M", + "args": { + "labels": "NPU 0" + } + }, + { + "name": "process_sort_index", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "sort_index": 29 + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "name": "Communication" + } + }, + { + "name": "thread_sort_index", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "sort_index": 0 + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 1, + "ph": "M", + "args": { + "name": "Communication(Not Overlapped)" + } + }, + { + "name": "thread_sort_index", + "pid": 3550586784, + "tid": 1, + "ph": "M", + "args": { + "sort_index": 1 + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 2, + "ph": "M", + "args": { + "name": "Computing" + } + }, + { + "name": "thread_sort_index", + "pid": 3550586784, + "tid": 2, + "ph": "M", + "args": { + "sort_index": 2 + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 3, + "ph": "M", + "args": { + "name": "Free" + } + }, + { + "name": "thread_sort_index", + "pid": 3550586784, + "tid": 3, + "ph": "M", + "args": { + "sort_index": 3 + } + }, + { + "name": "Computing", + "pid": 3550586784, + "tid": 2, + "ts": "1773285904640626.314", + "dur": 145.623, + "ph": "X", + "args": {} + }, + { + "name": "Communication", + "pid": 3550586784, + "tid": 0, + "ts": "1773285904641909.419", + "dur": 12782.896, + "ph": "X", + "args": {} + }, + { + "name": "Communication(Not Overlapped)", + "pid": 3550586784, + "tid": 1, + "ts": "1773285904641909.419", + "dur": 12782.896, + "ph": "X", + "args": {} + }, + { + "name": "Free", + "pid": 3550586784, + "tid": 3, + "ts": "1773285904639877.299", + "dur": 749.015, + "ph": "X", + "args": {} + } +] \ No newline at end of file diff --git a/data/mstx_data/mstx_profile/actor_update/xxx_ascend_pt/profiler_info_0.json b/data/mstx_data/mstx_profile/actor_update/xxx_ascend_pt/profiler_info_0.json new file mode 100644 index 0000000..cb3649a --- /dev/null +++ 
b/data/mstx_data/mstx_profile/actor_update/xxx_ascend_pt/profiler_info_0.json @@ -0,0 +1,50 @@ +{ + "config": { + "common_config": { + "activities": [ + "ProfilerActivity.CPU", + "ProfilerActivity.NPU" + ], + "schedule": {}, + "record_shapes": false, + "profile_memory": false, + "with_stack": false, + "with_flops": false, + "with_modules": false + }, + "experimental_config": { + "_profiler_level": "Level1", + "_aic_metrics": "ACL_AICORE_PIPE_UTILIZATION", + "_l2_cache": false, + "_msprof_tx": true, + "_mstx": false, + "_data_simplification": true, + "record_op_args": false, + "_export_type": [ + "db" + ], + "_host_sys": [], + "_op_attr": false, + "_gc_detect_threshold": null, + "_mstx_domain_include": [], + "_mstx_domain_exclude": [ + "communication" + ], + "_sys_io": false, + "_sys_interconnection": false + } + }, + "start_info": { + "syscnt_enable": true, + "freq": 100, + "start_cnt": 238468656388594, + "start_monotonic": 2384518929819120 + }, + "end_info": { + "collectionTimeEnd": 1773285907402740580, + "MonotonicTimeEnd": 2384521739905940 + }, + "torch_npu_version": "2.7.1", + "cann_version": "8.3.RC1", + "rank_id": "0" +} \ No newline at end of file diff --git a/data/mstx_data/mstx_profile/agent_loop_rollout_replica_0/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json b/data/mstx_data/mstx_profile/agent_loop_rollout_replica_0/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json new file mode 100644 index 0000000..9beb8b3 --- /dev/null +++ b/data/mstx_data/mstx_profile/agent_loop_rollout_replica_0/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json @@ -0,0 +1,155 @@ +[ + { + "name": "process_name", + "pid": 3555714976, + "tid": 0, + "ph": "M", + "args": { + "name": "Overlap Analysis" + } + }, + { + "name": "process_labels", + "pid": 3555714976, + "tid": 0, + "ph": "M", + "args": { + "labels": "NPU 0" + } + }, + { + "name": "process_sort_index", + "pid": 3555714976, + "tid": 0, + "ph": "M", + "args": { + "sort_index": 29 + } + }, + { + "name": "thread_name", 
+ "pid": 3555714976, + "tid": 0, + "ph": "M", + "args": { + "name": "Communication" + } + }, + { + "name": "thread_sort_index", + "pid": 3555714976, + "tid": 0, + "ph": "M", + "args": { + "sort_index": 0 + } + }, + { + "name": "thread_name", + "pid": 3555714976, + "tid": 1, + "ph": "M", + "args": { + "name": "Communication(Not Overlapped)" + } + }, + { + "name": "thread_sort_index", + "pid": 3555714976, + "tid": 1, + "ph": "M", + "args": { + "sort_index": 1 + } + }, + { + "name": "thread_name", + "pid": 3555714976, + "tid": 2, + "ph": "M", + "args": { + "name": "Computing" + } + }, + { + "name": "thread_sort_index", + "pid": 3555714976, + "tid": 2, + "ph": "M", + "args": { + "sort_index": 2 + } + }, + { + "name": "thread_name", + "pid": 3555714976, + "tid": 3, + "ph": "M", + "args": { + "name": "Free" + } + }, + { + "name": "thread_sort_index", + "pid": 3555714976, + "tid": 3, + "ph": "M", + "args": { + "sort_index": 3 + } + }, + { + "name": "Computing", + "pid": 3555714976, + "tid": 2, + "ts": "1773285888699715.976", + "dur": 1.66, + "ph": "X", + "args": {} + }, + { + "name": "Computing", + "pid": 3555714976, + "tid": 2, + "ts": "1773285888699990.041", + "dur": 1.8, + "ph": "X", + "args": {} + }, + { + "name": "Communication", + "pid": 3555714976, + "tid": 0, + "ts": "1773285888718314.848", + "dur": 23.86, + "ph": "X", + "args": {} + }, + { + "name": "Communication", + "pid": 3555714976, + "tid": 0, + "ts": "1773285888737759.276", + "dur": 96.782, + "ph": "X", + "args": {} + }, + { + "name": "Communication(Not Overlapped)", + "pid": 3555714976, + "tid": 1, + "ts": "1773285888718314.848", + "dur": 23.86, + "ph": "X", + "args": {} + }, + { + "name": "Free", + "pid": 3555714976, + "tid": 3, + "ts": "1773285888698754.717", + "dur": 961.259, + "ph": "X", + "args": {} + } +] \ No newline at end of file diff --git a/data/mstx_data/mstx_profile/agent_loop_rollout_replica_0/xxx_ascend_pt/profiler_info_0.json 
b/data/mstx_data/mstx_profile/agent_loop_rollout_replica_0/xxx_ascend_pt/profiler_info_0.json new file mode 100644 index 0000000..980544d --- /dev/null +++ b/data/mstx_data/mstx_profile/agent_loop_rollout_replica_0/xxx_ascend_pt/profiler_info_0.json @@ -0,0 +1,48 @@ +{ + "config": { + "common_config": { + "activities": [ + "ProfilerActivity.CPU", + "ProfilerActivity.NPU" + ], + "schedule": {}, + "record_shapes": false, + "profile_memory": false, + "with_stack": false, + "with_flops": false, + "with_modules": false + }, + "experimental_config": { + "_profiler_level": "Level1", + "_aic_metrics": "ACL_AICORE_PIPE_UTILIZATION", + "_l2_cache": false, + "_msprof_tx": false, + "_mstx": false, + "_data_simplification": false, + "record_op_args": false, + "_export_type": [ + "text" + ], + "_host_sys": [], + "_op_attr": false, + "_gc_detect_threshold": null, + "_mstx_domain_include": [], + "_mstx_domain_exclude": [], + "_sys_io": false, + "_sys_interconnection": false + } + }, + "start_info": { + "syscnt_enable": true, + "freq": 100, + "start_cnt": 238467048127792, + "start_monotonic": 2384502847213880 + }, + "end_info": { + "collectionTimeEnd": 1773285894485741210, + "MonotonicTimeEnd": 2384508822907870 + }, + "torch_npu_version": "2.7.1", + "cann_version": "8.3.RC1", + "rank_id": 0 +} \ No newline at end of file diff --git a/data/mstx_data/mstx_profile/ref_compute_log_prob/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json b/data/mstx_data/mstx_profile/ref_compute_log_prob/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json new file mode 100644 index 0000000..a8043ce --- /dev/null +++ b/data/mstx_data/mstx_profile/ref_compute_log_prob/xxx_ascend_pt/ASCEND_PROFILER_OUTPUT/trace_view.json @@ -0,0 +1,146 @@ +[ + { + "name": "process_name", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "name": "Overlap Analysis" + } + }, + { + "name": "process_labels", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "labels": "NPU 0" + } + }, + { + "name": 
"process_sort_index", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "sort_index": 29 + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "name": "Communication" + } + }, + { + "name": "thread_sort_index", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "sort_index": 0 + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 1, + "ph": "M", + "args": { + "name": "Communication(Not Overlapped)" + } + }, + { + "name": "thread_sort_index", + "pid": 3550586784, + "tid": 1, + "ph": "M", + "args": { + "sort_index": 1 + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 2, + "ph": "M", + "args": { + "name": "Computing" + } + }, + { + "name": "thread_sort_index", + "pid": 3550586784, + "tid": 2, + "ph": "M", + "args": { + "sort_index": 2 + } + }, + { + "name": "thread_name", + "pid": 3550586784, + "tid": 3, + "ph": "M", + "args": { + "name": "Free" + } + }, + { + "name": "thread_sort_index", + "pid": 3550586784, + "tid": 3, + "ph": "M", + "args": { + "sort_index": 3 + } + }, + { + "name": "Computing", + "pid": 3550586784, + "tid": 2, + "ts": "1773285903458585.417", + "dur": 109.142, + "ph": "X", + "args": {} + }, + { + "name": "Computing", + "pid": 3550586784, + "tid": 2, + "ts": "1773285903618076.644", + "dur": 22.44, + "ph": "X", + "args": {} + }, + { + "name": "Communication", + "pid": 3550586784, + "tid": 0, + "ts": "1773285903458696.819", + "dur": 159376.865, + "ph": "X", + "args": {} + }, + { + "name": "Communication(Not Overlapped)", + "pid": 3550586784, + "tid": 1, + "ts": "1773285903458696.819", + "dur": 159376.865, + "ph": "X", + "args": {} + }, + { + "name": "Free", + "pid": 3550586784, + "tid": 3, + "ts": "1773285903453279.131", + "dur": 5306.286, + "ph": "X", + "args": {} + } +] \ No newline at end of file diff --git a/data/mstx_data/mstx_profile/ref_compute_log_prob/xxx_ascend_pt/profiler_info_0.json 
b/data/mstx_data/mstx_profile/ref_compute_log_prob/xxx_ascend_pt/profiler_info_0.json new file mode 100644 index 0000000..e5d21c7 --- /dev/null +++ b/data/mstx_data/mstx_profile/ref_compute_log_prob/xxx_ascend_pt/profiler_info_0.json @@ -0,0 +1,50 @@ +{ + "config": { + "common_config": { + "activities": [ + "ProfilerActivity.CPU", + "ProfilerActivity.NPU" + ], + "schedule": {}, + "record_shapes": false, + "profile_memory": false, + "with_stack": false, + "with_flops": false, + "with_modules": false + }, + "experimental_config": { + "_profiler_level": "Level1", + "_aic_metrics": "ACL_AICORE_PIPE_UTILIZATION", + "_l2_cache": false, + "_msprof_tx": true, + "_mstx": false, + "_data_simplification": true, + "record_op_args": false, + "_export_type": [ + "db" + ], + "_host_sys": [], + "_op_attr": false, + "_gc_detect_threshold": null, + "_mstx_domain_include": [], + "_mstx_domain_exclude": [ + "communication" + ], + "_sys_io": false, + "_sys_interconnection": false + } + }, + "start_info": { + "syscnt_enable": true, + "freq": 100, + "start_cnt": 238468525914374, + "start_monotonic": 2384517625076970 + }, + "end_info": { + "collectionTimeEnd": 1773285904360767710, + "MonotonicTimeEnd": 2384518697933370 + }, + "torch_npu_version": "2.7.1", + "cann_version": "8.3.RC1", + "rank_id": "0" +} \ No newline at end of file diff --git a/data/torch_data/torch_profile/actor_compute_log_prob/prof_rank-0_031847.json.gz b/data/torch_data/torch_profile/actor_compute_log_prob/prof_rank-0_031847.json.gz new file mode 100644 index 0000000..23efafa Binary files /dev/null and b/data/torch_data/torch_profile/actor_compute_log_prob/prof_rank-0_031847.json.gz differ diff --git a/data/torch_data/torch_profile/actor_compute_log_prob/prof_rank-1_031847.json.gz b/data/torch_data/torch_profile/actor_compute_log_prob/prof_rank-1_031847.json.gz new file mode 100644 index 0000000..c833dd6 Binary files /dev/null and b/data/torch_data/torch_profile/actor_compute_log_prob/prof_rank-1_031847.json.gz differ 
diff --git a/data/torch_data/torch_profile/actor_update/prof_rank-0_031904.json.gz b/data/torch_data/torch_profile/actor_update/prof_rank-0_031904.json.gz new file mode 100644 index 0000000..23a26dd Binary files /dev/null and b/data/torch_data/torch_profile/actor_update/prof_rank-0_031904.json.gz differ diff --git a/data/torch_data/torch_profile/actor_update/prof_rank-1_031904.json.gz b/data/torch_data/torch_profile/actor_update/prof_rank-1_031904.json.gz new file mode 100644 index 0000000..8152f03 Binary files /dev/null and b/data/torch_data/torch_profile/actor_update/prof_rank-1_031904.json.gz differ diff --git a/data/torch_data/torch_profile/agent_loop_rollout_replica_0/huawei_372640.1771730291744166997.pt.trace.json.gz b/data/torch_data/torch_profile/agent_loop_rollout_replica_0/huawei_372640.1771730291744166997.pt.trace.json.gz new file mode 100644 index 0000000..8935ced Binary files /dev/null and b/data/torch_data/torch_profile/agent_loop_rollout_replica_0/huawei_372640.1771730291744166997.pt.trace.json.gz differ diff --git a/data/torch_data/torch_profile/agent_loop_rollout_replica_0/huawei_372640.1771730370159969000.pt.trace.json.gz b/data/torch_data/torch_profile/agent_loop_rollout_replica_0/huawei_372640.1771730370159969000.pt.trace.json.gz new file mode 100644 index 0000000..9718a72 Binary files /dev/null and b/data/torch_data/torch_profile/agent_loop_rollout_replica_0/huawei_372640.1771730370159969000.pt.trace.json.gz differ diff --git a/docs/data/data_directory.md b/docs/data/data_directory.md new file mode 100644 index 0000000..ecc90a8 --- /dev/null +++ b/docs/data/data_directory.md @@ -0,0 +1,94 @@ +# RL-Insight - 数据文件目录结构 + +## 一、采集Torch Profiling 数据目录结构 + +``` +/ +└── / + └── prof_*.json.gz +``` + +数据解析文件 prof_*.json.gz,解析文件内容包含distributedInfo、traceEvents等字段,数据内容一般包含ts、dur等字段,解析文件内容示例: + +``` +{ + "schemaVersion": 1, + "deviceProperties": [ + { + "id": 0, "name": "NVIDIA L20", "totalGlobalMem": 47677177856, + "computeMajor": 8, "computeMinor": 9, + 
"maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 92 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + } + ], + "cupti_version": 26, + "cuda_runtime_version": 12080, + "cuda_driver_version": 12080, + "distributedInfo": {"backend": "cpu:gloo,cuda:nccl", "rank": 0, "world_size": 2, "pg_count": 9, "pg_config": [{"pg_name": "0", "pg_desc": "default_pg", "backend_config": "cpu:gloo,cuda:nccl", "pg_size": 4, "ranks": [0, 1, 2, 3]}, {"pg_name": "1", "pg_desc": "mesh_dp", "backend_config": "cpu:gloo,cuda:nccl", "pg_size": 2, "ranks": [0, 2]}, {"pg_name": "3", "pg_desc": "mesh_infer_tp", "backend_config": "cpu:gloo,cuda:nccl", "pg_size": 2, "ranks": [0, 1]}, {"pg_name": "5", "pg_desc": "mesh_infer_pp", "backend_config": "cpu:gloo,cuda:nccl", "pg_size": 1, "ranks": [0]}]}, + "trace_id": "B45DDD976E4D4DDF8E3CFB28A0E2EF25", + "displayTimeUnit": "ms", + "baseTimeNanoseconds": 1767189312000000000, + "traceEvents": [ + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemGetInfo", "pid": 369418, "tid": 1722878400, + "ts": 4541015316353.111, "dur": 10083720.552, + "args": { + "cbid": 30, "correlation": 5 + } + }, + { + "name": "process_name", "ph": "M", "ts": 4541015315505.900, "pid": 369418, "tid": 0, + "args": { + "name": "ray::WorkerDict.actor_rollout_compute_log_prob" + } + }, + { + "name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g", + "pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 4541015315462.515 + }, + { + "name": "Record Window End", "ph": "i", "s": "g", + "pid": "", "tid": "", "ts": 4541019157986.427 + } + ], + "traceName": "/tmp/tmpx5qz1t66.json" +} +``` + +## 二、采集Mstx Profiling 数据目录结构 + +``` +/ +└── / + └── *_ascend_pt/ + |── profiler_info_*.json + └── ASCEND_PROFILER_OUTPUT/ + └── trace_view.json +``` + +数据解析文件 trace_view.json,解析文件内容必须包含"ph": "M",且"name": "Overlap 
Analysis"对应"pid"的数据,该数据一般包含ts、dur等字段,解析文件内容示例: + +``` +[ + { + "name": "process_name", + "pid": 3550586784, + "tid": 0, + "ph": "M", + "args": { + "name": "Overlap Analysis" + } + }, + { + "name": "Computing", + "pid": 3550586784, + "tid": 2, + "ts": "1773285899055563.748", + "dur": 53.301, + "ph": "X", + "args": {} + }, +] +``` diff --git a/docs/index.rst b/docs/index.rst index 063ea07..c27e32b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,6 +22,12 @@ and visualization. README <../README.md> +.. toctree:: + :maxdepth: 2 + :caption: Data Directory + + data/data_directory + Contribution ------------- diff --git a/tests/special_e2e/__init__.py b/tests/special_e2e/__init__.py new file mode 100644 index 0000000..8be1d6c --- /dev/null +++ b/tests/special_e2e/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025 verl-project authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/special_e2e/test_mstx_e2e.py b/tests/special_e2e/test_mstx_e2e.py new file mode 100644 index 0000000..1958179 --- /dev/null +++ b/tests/special_e2e/test_mstx_e2e.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025 verl-project authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
def test_mstx_e2e_with_input_path(monkeypatch, tmp_path):
    """Drive the CLI entry point end to end over the bundled mstx fixtures.

    The command line is faked by patching ``sys.argv``; the test passes when
    ``rl_timeline.html`` is present in the temporary output directory after
    ``main()`` returns.
    """
    # This module sits in tests/special_e2e/, so parents[2] of its resolved
    # path is the repository root that holds the data/ fixtures.
    repo_root = Path(__file__).resolve().parents[2]
    fixtures = repo_root / "data" / "mstx_data"
    destination = tmp_path / "mstx_output"

    # Fail early with a readable message when the fixtures are missing.
    assert fixtures.exists(), f"Input directory {fixtures} does not exist"

    # argv[0] is only a placeholder program name; the options follow it.
    cli_args = ["main.py"]
    cli_args += [
        f"--input-path={fixtures}",
        f"--output-path={destination}",
        "--profiler-type=mstx",
    ]
    monkeypatch.setattr(sys, "argv", cli_args)

    main()

    # The run is considered successful once the timeline page is on disk.
    assert (destination / "rl_timeline.html").exists()
def test_torch_e2e_with_input_path(monkeypatch, tmp_path):
    """Run the CLI entry point end to end against the bundled torch fixtures.

    ``sys.argv`` is patched to simulate a command-line invocation, ``main()``
    is called, and the test then checks that ``rl_timeline.html`` exists in
    the temporary output directory.
    """
    # Resolve <repo>/data/torch_data relative to this file: parents[2] of
    # tests/special_e2e/<this file> is the repository root.
    here = Path(__file__).resolve()
    source_dir = here.parents[2] / "data" / "torch_data"
    target_dir = tmp_path / "torch_output"

    # Guard against a checkout that lacks the fixture directory.
    assert source_dir.exists(), f"Input directory {source_dir} does not exist"

    # Option name -> value, rendered below as "--name=value" arguments.
    options = {
        "input-path": source_dir,
        "output-path": target_dir,
        "profiler-type": "torch",
    }
    fake_argv = ["main.py"] + [f"--{flag}={value}" for flag, value in options.items()]
    monkeypatch.setattr(sys, "argv", fake_argv)

    main()

    # The generated timeline page is the only artefact this test checks.
    report = target_dir / "rl_timeline.html"
    assert report.exists()