forked from kubeflow/trainer
-
Notifications
You must be signed in to change notification settings - Fork 0
99 lines (84 loc) · 4.44 KB
/
test-e2e.yaml
File metadata and controls
99 lines (84 loc) · 4.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# End-to-end test workflow.
# - cpu-e2e-test: Go e2e suite plus example notebooks, once per Kubernetes
#   version in the matrix.
# - gpu-e2e-test: example notebooks on a GPU runner.
name: E2E Tests

# Triggered by pull request events.
on:
  pull_request:

# The workflow token only gets read access.
permissions:
  contents: read
  pull-requests: read

jobs:
  cpu-e2e-test:
    name: CPU E2E Test
    runs-on: oracle-vm-16cpu-64gb-x86-64
    strategy:
      # Keep the other matrix entries running when one of them fails.
      fail-fast: false
      matrix:
        # Kubernetes versions for e2e tests on Kind cluster.
        kubernetes-version:
          - "1.32.3"
          - "1.33.1"
          - "1.34.0"
          - "1.35.0"
    env:
      GOPATH: ${{ github.workspace }}/go
    defaults:
      run:
        # Every `run` step executes from the repository location inside GOPATH.
        # NOTE(review): the checkout step below sets no `path`, so presumably
        # the cluster setup action populates this directory - confirm.
        working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
    steps:
      # The setup action is referenced by a local path, so the repository has
      # to be checked out before it can be resolved.
      - name: Checkout repository for local actions
        uses: actions/checkout@v6

      - name: Setup CPU Cluster
        uses: ./.github/workflows/template-setup-clusters
        with:
          cluster_type: cpu
          kubernetes_version: ${{ matrix.kubernetes-version }}

      # If the suite fails, print the logs of the pods labelled
      # app.kubernetes.io/name=trainer in kubeflow-system, then fail the step.
      - name: Run e2e with Go
        run: |
          make test-e2e || (kubectl logs -n kubeflow-system -l app.kubernetes.io/name=trainer && exit 1)

      # Each notebook's output is written to artifacts/notebooks, with the
      # Kubernetes version as the file name prefix.
      - name: Run e2e test for example Notebooks
        run: |
          mkdir -p artifacts/notebooks
          make test-e2e-notebook NOTEBOOK_INPUT=./examples/pytorch/image-classification/mnist.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_mnist.ipynb PAPERMILL_TIMEOUT=1800
          make test-e2e-notebook NOTEBOOK_INPUT=./examples/pytorch/question-answering/fine-tune-distilbert.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_fine-tune-distilbert.ipynb PAPERMILL_TIMEOUT=1800
          make test-e2e-notebook NOTEBOOK_INPUT=./examples/pytorch/speech-recognition/speech-recognition.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_speech-recognition.ipynb PAPERMILL_TIMEOUT=1800
          make test-e2e-notebook NOTEBOOK_INPUT=./examples/pytorch/audio-classification/audio-classification.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_audio-classification.ipynb PAPERMILL_TIMEOUT=1800
          make test-e2e-notebook NOTEBOOK_INPUT=./examples/local/local-training-mnist.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_local-training-mnist.ipynb PAPERMILL_TIMEOUT=1800
          make test-e2e-notebook NOTEBOOK_INPUT=./examples/local/local-container-mnist.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_local-container-mnist.ipynb PAPERMILL_TIMEOUT=1800
          make test-e2e-notebook NOTEBOOK_INPUT=./examples/xgboost/distributed-training/xgboost-distributed.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_xgboost-distributed.ipynb PAPERMILL_TIMEOUT=1800

      # TODO (andreyvelich): Discuss how we can upload artifacts for multiple Notebooks.
      # `always()` makes the upload run even when an earlier step has failed.
      - name: Upload Artifacts to GitHub
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: cpu-${{ matrix.kubernetes-version }}
          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/artifacts/*
          retention-days: 1

  gpu-e2e-test:
    name: GPU E2E Test
    # Runner is selected by label from the "GPUs" runner group.
    runs-on:
      group: GPUs
      labels: oracle-vm-gpu-a10-2
    timeout-minutes: 120
    strategy:
      # Keep the other matrix entries running when one of them fails.
      fail-fast: false
      matrix:
        kubernetes-version:
          - "1.33.1"
    env:
      GOPATH: ${{ github.workspace }}/go
    defaults:
      run:
        # Every `run` step executes from the repository location inside GOPATH.
        # NOTE(review): the checkout step below sets no `path`, so presumably
        # the cluster setup action populates this directory - confirm.
        working-directory: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer
    steps:
      # The setup action is referenced by a local path, so the repository has
      # to be checked out before it can be resolved.
      - name: Checkout repository for local actions
        uses: actions/checkout@v6

      - name: Setup GPU Cluster
        uses: ./.github/workflows/template-setup-clusters
        with:
          cluster_type: gpu
          kubernetes_version: ${{ matrix.kubernetes-version }}

      # Each notebook's output is written to artifacts/notebooks, with the
      # Kubernetes version as the file name prefix.
      - name: Run e2e test on GPU cluster
        run: |
          mkdir -p artifacts/notebooks
          make test-e2e-notebook NOTEBOOK_INPUT=./examples/torchtune/qwen2_5/qwen2.5-1.5B-with-alpaca.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_qwen2_5_with_alpaca-trainjob-yaml.ipynb PAPERMILL_TIMEOUT=1800
          make test-e2e-notebook NOTEBOOK_INPUT=./examples/jax/image-classification/mnist.ipynb NOTEBOOK_OUTPUT=./artifacts/notebooks/${{ matrix.kubernetes-version }}_jax_mnist.ipynb PAPERMILL_PARAMS="-p num_cpu 8 -p num_gpu 2 -p num_nodes 1" PAPERMILL_TIMEOUT=1800

      # `always()` makes the upload run even when an earlier step has failed.
      - name: Upload Artifacts to GitHub
        if: always()
        uses: actions/upload-artifact@v7
        with:
          name: gpu-${{ matrix.kubernetes-version }}
          path: ${{ env.GOPATH }}/src/github.com/kubeflow/trainer/artifacts/*
          retention-days: 1