# weekly.yml
# Weekly test workflow: runs on a cron schedule or when dispatched manually.
name: Weekly Tests

on:
  workflow_dispatch: # manually dispatch
  # push:
  schedule:
    # GitHub Actions evaluates cron schedules in UTC.
    - cron: '0 20 * * FRI' # 8:00 PM (UTC) every Friday

env:
  # Head branch for pull-request events, otherwise the ref the run was
  # triggered on. Exposed to every step as $BRANCH_NAME.
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
jobs:
  # Builds the NVBit tracer and the benchmark applications, runs
  # run_hw_trace.py / run_hw.py through Slurm, and moves the resulting hw_run
  # directories into the shared nightly-traces directory read by SASS-Weekly.
  Tracer-Weekly:
    timeout-minutes: 720
    if: ${{ github.repository == 'accel-sim/accel-sim-framework' || github.event_name == 'workflow_dispatch' }}
    runs-on: tgrogers-raid
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          ref: dev
      - name: Setup Environment
        run: |
          rm -rf env-setup
          git clone --quiet git@github.com:purdue-aalp/env-setup.git
          cd env-setup
          git checkout cluster-ubuntu
      - name: Build Tracer
        run: |
          source ./env-setup/12.8_env_setup.sh
          ./util/tracer_nvbit/install_nvbit.sh
          make clean -C ./util/tracer_nvbit/
          make -C ./util/tracer_nvbit/
      - name: build applications
        run: |
          source ./env-setup/12.8_env_setup.sh
          export PATH=/home/tgrogers-raid/a/common/python2:$PATH
          rm -rf ./gpu-app-collection/
          git clone --quiet --recurse-submodules https://github.com/accel-sim/gpu-app-collection.git
          source ./gpu-app-collection/src/setup_environment
          ln -s /home/tgrogers-raid/a/common/data_dirs ./gpu-app-collection/
          make -j8 -C ./gpu-app-collection/src rodinia_2.0-ft
          make -j8 -C ./gpu-app-collection/src rodinia-3.1
          make -j8 -C ./gpu-app-collection/src GPU_Microbenchmark
          # make -j8 -C ./gpu-app-collection/src Deepbench_nvidia
          # make -j8 -C ./gpu-app-collection/src parboil
          # make -j8 -C ./gpu-app-collection/src polybench
          # make -j8 -C ./gpu-app-collection/src cutlass
      - name: generate traces
        run: |
          source ./env-setup/12.8_env_setup.sh
          source ./gpu-app-collection/src/setup_environment
          rm -rf ./hw_run/
          # --dependency=singleton: Slurm starts a job only after earlier jobs
          # with the same name ("gpu-lock") and user have terminated, so the
          # srun invocations in this workflow run one at a time.
          srun --job-name=gpu-lock --dependency=singleton --partition=tgrogers-dgx -- ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7
          srun --job-name=gpu-lock --dependency=singleton --partition=tgrogers-dgx -- ./util/hw_stats/run_hw.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7
          # Start from an empty shared directory, then publish this run's output.
          rm -rf /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
          mkdir -p /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
          mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run
          # ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark,parboil,polybench,cutlass_5_trace,Deepbench_nvidia_tencore,Deepbench_nvidia_normal -D 7
      - name: generate-spinlock-traces-spinlock_handling
        run: |
          source ./env-setup/12.8_env_setup.sh
          source ./gpu-app-collection/src/setup_environment
          rm -rf ./hw_run/
          # Trace the Spinlock benchmark once per --spinlock_handling mode; each
          # run writes into a fresh ./hw_run that is then moved to its own
          # destination under nightly-traces.
          mkdir -p ./hw_run/
          srun --job-name=gpu-lock --dependency=singleton --partition=tgrogers-dgx -- ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling fast_forward
          mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_fast_forward
          mkdir -p ./hw_run/
          srun --job-name=gpu-lock --dependency=singleton --partition=tgrogers-dgx -- ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling none
          mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_none

  # Builds the simulator under ./gpu-simulator (with a matching gpgpu-sim
  # checkout) and runs the QV100-SASS configuration on the traces published by
  # Tracer-Weekly.
  SASS-Weekly:
    timeout-minutes: 720
    needs: [Tracer-Weekly]
    if: ${{ github.repository == 'accel-sim/accel-sim-framework' || github.event_name == 'workflow_dispatch' }}
    runs-on: tgrogers-raid
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
        with:
          ref: dev
      - name: Setup Environment
        run: |
          rm -rf env-setup
          git clone --quiet git@github.com:purdue-aalp/env-setup.git
          cd env-setup
          git checkout cluster-ubuntu
      - name: Build Accel-Sim
        run: |
          source ./env-setup/12.8_env_setup.sh
          rm -rf ./gpu-simulator/gpgpu-sim
          # Clone gpgpu-sim with fork-aware branch selection
          echo "Cloning gpgpu-sim with fork-aware branch selection..."
          git clone --quiet git@github.com:accel-sim/gpgpu-sim_distribution.git ./gpu-simulator/gpgpu-sim
          # Split the runner-provided GITHUB_REPOSITORY ("owner/repo") in the
          # shell rather than pasting a workflow expression into the script.
          current_owner=${GITHUB_REPOSITORY%%/*}
          current_repo=${GITHUB_REPOSITORY#*/}
          gpgpusim_repo=${current_repo/accel-sim-framework/gpgpu-sim_distribution}
          echo "Attempting to checkout branch '$BRANCH_NAME' from '$current_owner/$gpgpusim_repo'"
          # First, try to add the fork owner's repository as a remote and check if the branch exists
          if git -C ./gpu-simulator/gpgpu-sim/ remote add fork-owner "git@github.com:$current_owner/$gpgpusim_repo.git" 2>/dev/null; then
            # Query the exact ref: --exit-code returns non-zero when nothing
            # matches, so a branch such as "dev" is not mistaken for "dev-foo".
            if git -C ./gpu-simulator/gpgpu-sim/ ls-remote --exit-code fork-owner "refs/heads/$BRANCH_NAME" >/dev/null; then
              echo "Found branch '$BRANCH_NAME' in '$current_owner/$gpgpusim_repo' repository, checking it out"
              git -C ./gpu-simulator/gpgpu-sim/ fetch fork-owner
              git -C ./gpu-simulator/gpgpu-sim/ checkout -B "$BRANCH_NAME" "fork-owner/$BRANCH_NAME"
            else
              echo "Branch '$BRANCH_NAME' not found in '$current_owner/$gpgpusim_repo' repository, falling back to accel-sim dev branch"
              git -C ./gpu-simulator/gpgpu-sim/ checkout -B dev origin/dev
            fi
            # Remove the temporary remote
            git -C ./gpu-simulator/gpgpu-sim/ remote remove fork-owner
          else
            echo "Could not add '$current_owner/$gpgpusim_repo' remote, falling back to upstream dev branch"
            git -C ./gpu-simulator/gpgpu-sim/ checkout -B dev origin/dev
          fi
          source ./gpu-simulator/setup_environment.sh
          make clean -C gpu-simulator
          # Cap parallelism at the core count; a bare -j puts no limit on the
          # number of simultaneous compile jobs.
          make -j"$(nproc)" -C gpu-simulator
      - name: run SASS
        run: |
          source ./env-setup/12.8_env_setup.sh
          source ./gpu-simulator/setup_environment.sh
          # -f/-n replace an existing ./hw_run link instead of failing or
          # creating a nested link inside the directory it points to.
          ln -sfn /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run ./hw_run
          # $$ is this step's shell PID; it ties the launch to the monitor call.
          ./util/job_launching/run_simulations.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -C QV100-SASS -T ./hw_run/traces/device-7/12.8 -N weekly-$$ -M 70G
          ./util/job_launching/monitor_func_test.py -T 12 -S 300 -v -s weekly-stats-per-app.csv -N weekly-$$
      - name: test-new-traces-spinlock_handling
        # Test only fast-forwarded traces as the none one takes too long to run (~2-3 hr)
        # NOTE(review): unlike the 'run SASS' step, the trace path below has no
        # trailing 12.8 component - confirm that is intended.
        run: |
          source ./env-setup/12.8_env_setup.sh
          source ./gpu-simulator/setup_environment.sh
          ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_fast_forward/traces/device-7/ -N spinlock-microbenchmark-$$-fast_forward
          ./util/job_launching/monitor_func_test.py -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-fast_forward
          # ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run_none/traces/device-7/ -N spinlock-microbenchmark-$$-none
          # ./util/job_launching/monitor_func_test.py -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-none

  # Runs only when one of the jobs listed in `needs` failed, and invokes
  # send_ci_email.py with "-t failure".
  failures:
    if: failure()
    env:
      ACTION_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
      REPORT_URL: ""
    runs-on: tgrogers-raid
    needs: [Tracer-Weekly, SASS-Weekly]
    # The step uses `source`, so pin bash as the other jobs do.
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v4
      - name: Notify Failure
        run: |
          # Setup envs
          # Drop any leftover clone first (as the other jobs do) so a stale
          # directory cannot make the clone fail and suppress the notification.
          rm -rf env-setup
          git clone --quiet --branch cluster-ubuntu git@github.com:purdue-aalp/env-setup.git
          source ./env-setup/common/common_inc.sh
          # Overrides the workflow-level BRANCH_NAME for this step; presumably
          # read by send_ci_email.py as the label to report - TODO confirm.
          export BRANCH_NAME="Weekly Tests"
          python3 .github/scripts/send_ci_email.py -t failure