Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/actions/test-and-report/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ inputs:
description: "Skip API access configuration and token generation (for tests that do not need a cluster)"
required: false
default: 'false'
mlflow_enabled:
description: "Whether MLflow is deployed and available for integration tests"
required: false
default: 'false'


runs:
Expand Down Expand Up @@ -264,7 +268,7 @@ runs:
DISABLE_TLS_CHECK='false'
fi

go run github.com/onsi/ginkgo/v2/ginkgo -r -v --cover -p --keep-going --github-output=true --nodes=${{ inputs.num_parallel_nodes }} --label-filter=${{ inputs.test_label }} --silence-skips=true -- -namespace=${{ inputs.default_namespace }} -multiUserMode=$MULTI_USER -useProxy=$USE_PROXY -userNamespace=${{ inputs.user_namespace }} -uploadPipelinesWithKubernetes=${{ inputs.upload_pipelines_with_kubernetes_client}} -pipelineStoreKubernetes=$pipelineStoreKubernetes -disableTlsCheck=$DISABLE_TLS_CHECK -apiScheme=$API_SCHEME -tlsEnabled=$TLS_ENABLED -caCertPath=$CA_CERT_PATH -pullNumber=$PULL_NUMBER -repoName=$REPO_NAME -apiUrl="$API_URL" -authToken="$AUTH_TOKEN" -serviceAccountName="$SERVICE_ACCOUNT_NAME" $BASE_IMAGE_FLAG
go run github.com/onsi/ginkgo/v2/ginkgo -r -v --cover -p --keep-going --github-output=true --nodes=${{ inputs.num_parallel_nodes }} --label-filter=${{ inputs.test_label }} --silence-skips=true -- -namespace=${{ inputs.default_namespace }} -multiUserMode=$MULTI_USER -useProxy=$USE_PROXY -userNamespace=${{ inputs.user_namespace }} -uploadPipelinesWithKubernetes=${{ inputs.upload_pipelines_with_kubernetes_client}} -pipelineStoreKubernetes=$pipelineStoreKubernetes -disableTlsCheck=$DISABLE_TLS_CHECK -apiScheme=$API_SCHEME -tlsEnabled=$TLS_ENABLED -caCertPath=$CA_CERT_PATH -pullNumber=$PULL_NUMBER -repoName=$REPO_NAME -apiUrl="$API_URL" -authToken="$AUTH_TOKEN" -serviceAccountName="$SERVICE_ACCOUNT_NAME" -mlflowEnabled=${{ inputs.mlflow_enabled }} $BASE_IMAGE_FLAG
continue-on-error: true

- name: Collect Pod logs in case of Test Failures
Expand Down
120 changes: 120 additions & 0 deletions .github/resources/scripts/configure-mlflow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#!/bin/bash
# Copyright 2026 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Purpose:
# This script configures KFP to use an already-deployed MLflow instance for
# MLflow E2E tests.
#
# CI helper: patch the KFP API server with plugins.mlflow, roll it out, and
# port-forward the API server and MLflow so E2E tests can reach both.
# It also exports workspace/auth variables used by MLflow test helpers.
#
# Usage: configure-mlflow.sh <KFP_NAMESPACE> <MLFLOW_NAMESPACE> <CONFIG_JSON_PATH>

# Abort on the first failing command. pipefail is intentionally NOT enabled:
# the service lookup below relies on "grep | head" returning success on no match
# so that the explicit empty-check can print a helpful error.
set -e

# Positional arguments (all required).
KFP_NAMESPACE="${1:?KFP namespace required}"
MLFLOW_NAMESPACE="${2:?MLflow namespace required}"
CONFIG_JSON_PATH="${3:?Path to source config.json required}"

# --- Discover the MLflow service -------------------------------------------
# The first service whose name contains "mlflow" (case-insensitive) is used.
echo "Services in ${MLFLOW_NAMESPACE} namespace:"
kubectl get svc -n "$MLFLOW_NAMESPACE" --no-headers
MLFLOW_SVC=$(kubectl get svc -n "$MLFLOW_NAMESPACE" --no-headers -o custom-columns=":metadata.name" | grep -i mlflow | head -1)
if [ -z "$MLFLOW_SVC" ]; then
  echo "ERROR: No service matching 'mlflow' found in namespace $MLFLOW_NAMESPACE"
  exit 1
fi
MLFLOW_PORT=$(kubectl get svc -n "$MLFLOW_NAMESPACE" "$MLFLOW_SVC" -o jsonpath='{.spec.ports[0].port}')
if [ -z "$MLFLOW_PORT" ]; then
  # Without a port the endpoint URL and the port-forward below would be malformed.
  echo "ERROR: Service $MLFLOW_SVC in namespace $MLFLOW_NAMESPACE exposes no port"
  exit 1
fi
MLFLOW_HOST="${MLFLOW_SVC}.${MLFLOW_NAMESPACE}.svc.cluster.local"
MLFLOW_STATIC_PREFIX="/mlflow"
MLFLOW_ENDPOINT="https://${MLFLOW_HOST}:${MLFLOW_PORT}${MLFLOW_STATIC_PREFIX}"
echo "MLflow service: $MLFLOW_SVC port=$MLFLOW_PORT endpoint=$MLFLOW_ENDPOINT"

# --- Build the patched API server config -----------------------------------
# plugins.mlflow points at the in-cluster endpoint, skips TLS verification and
# enables workspaces.
MLFLOW_PATCH=$(jq -n --arg endpoint "$MLFLOW_ENDPOINT" '{
  endpoint: $endpoint,
  tls: { insecureSkipVerify: true },
  settings: { workspacesEnabled: true }
}')

jq --argjson mlflow "$MLFLOW_PATCH" '. + { plugins: { mlflow: $mlflow } }' \
  "$CONFIG_JSON_PATH" > /tmp/kfp-config.json

echo "Patched config.json plugins.mlflow:"
jq '.plugins.mlflow' /tmp/kfp-config.json

# --- Mount the patched config into the API server and roll it out ----------
# create --dry-run | apply makes the ConfigMap step repeatable.
kubectl create configmap kfp-mlflow-config -n "$KFP_NAMESPACE" \
  --from-file=config.json=/tmp/kfp-config.json --dry-run=client -o yaml | kubectl apply -f -
kubectl patch deployment ml-pipeline -n "$KFP_NAMESPACE" --type=strategic -p \
  '{"spec":{"template":{"spec":{"volumes":[{"name":"mlflow-cfg","configMap":{"name":"kfp-mlflow-config"}}],"containers":[{"name":"ml-pipeline-api-server","volumeMounts":[{"name":"mlflow-cfg","mountPath":"/config/config.json","subPath":"config.json"}]}]}}}}'
kubectl rollout status deployment/ml-pipeline -n "$KFP_NAMESPACE" --timeout=180s

# --- Re-establish the API server port-forward ------------------------------
# Any existing forward to ml-pipeline:8888 is killed first; "|| true" keeps
# set -e from aborting when no such process exists.
pkill -f "kubectl port-forward.*ml-pipeline.*8888" || true
sleep 2

# Resolve the directory containing this script. dirname handles the case where
# the script is invoked without a slash in its path (e.g. "bash configure-mlflow.sh").
C_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
"${C_DIR}/forward-port.sh" "$KFP_NAMESPACE" ml-pipeline 8888 8888

for i in $(seq 1 12); do
  if curl -sf http://localhost:8888/apis/v1beta1/healthz > /dev/null 2>&1; then
    echo "API server is healthy on localhost:8888"
    break
  fi
  echo "Waiting for API server to become healthy... ($i/12)"
  sleep 5
done
# Final probe: fails the script if the loop above ran out of attempts.
curl -sf http://localhost:8888/apis/v1beta1/healthz > /dev/null 2>&1 || {
  echo "ERROR: API server not reachable at localhost:8888"
  exit 1
}

# --- Export variables for later workflow steps -----------------------------
# Token creation is best effort; an empty token only produces a warning below.
SA_TOKEN=$(kubectl create token ml-pipeline -n "$KFP_NAMESPACE" --duration=1h 2>/dev/null || true)
if [ -n "${GITHUB_ENV:-}" ]; then
  echo "MLFLOW_WORKSPACE=$KFP_NAMESPACE" >> "$GITHUB_ENV"
  # Later workflow steps need these to re-establish port-forward: background jobs from this step
  # are terminated when the step exits, so test-and-report starts kubectl port-forward again.
  echo "MLFLOW_PORT_FORWARD_NS=$MLFLOW_NAMESPACE" >> "$GITHUB_ENV"
  echo "MLFLOW_PORT_FORWARD_SVC=$MLFLOW_SVC" >> "$GITHUB_ENV"
  echo "MLFLOW_PORT_FORWARD_REMOTE_PORT=$MLFLOW_PORT" >> "$GITHUB_ENV"
  if [ -n "$SA_TOKEN" ]; then
    # Register the token as a secret so GitHub redacts it from subsequent logs;
    # values written to GITHUB_ENV are not masked automatically.
    echo "::add-mask::$SA_TOKEN"
    echo "MLFLOW_BEARER_TOKEN=$SA_TOKEN" >> "$GITHUB_ENV"
    echo "Exported MLFLOW_BEARER_TOKEN and MLFLOW_WORKSPACE for test helpers"
  else
    echo "WARNING: Could not create SA token; MLflow requests may be unauthenticated"
    echo "Exported MLFLOW_WORKSPACE only"
  fi
fi

# --- Port-forward MLflow and wait for it to answer -------------------------
kubectl port-forward -n "$MLFLOW_NAMESPACE" "svc/$MLFLOW_SVC" "8080:$MLFLOW_PORT" &
sleep 3

HEALTH_URL="https://localhost:8080${MLFLOW_STATIC_PREFIX}/health"
CURL_HEADERS=(-H "X-MLflow-Workspace: $KFP_NAMESPACE")
if [ -n "$SA_TOKEN" ]; then
  CURL_HEADERS+=(-H "Authorization: Bearer $SA_TOKEN")
fi

STATUS=000
for i in $(seq 1 30); do
  # curl still prints the -w value ("000") when the transfer fails, so the
  # failure branch must not print anything: appending a second "000" would give
  # "000000", which compares as 0 < 500 and would be mistaken for a healthy reply.
  STATUS=$(curl -sk -o /dev/null -w '%{http_code}' --connect-timeout 5 --max-time 10 \
    "${CURL_HEADERS[@]}" "$HEALTH_URL" 2>/dev/null || true)
  # Normalise anything that is not a three-digit code to "no response".
  case "$STATUS" in
    [0-9][0-9][0-9]) ;;
    *) STATUS=000 ;;
  esac
  # Any response below 500 (including auth errors) counts as "backend is up".
  if [ "$STATUS" != "000" ] && [ "$STATUS" -lt 500 ]; then
    echo "MLflow backend is healthy on localhost:8080 (HTTPS, status=$STATUS)"
    break
  fi
  echo "Waiting for MLflow backend... ($i/30, status=$STATUS)"
  sleep 5
done
if [ "$STATUS" = "000" ] || [ "$STATUS" -ge 500 ]; then
  echo "ERROR: MLflow backend not healthy after 30 attempts (last status=$STATUS)"
  exit 1
fi
120 changes: 120 additions & 0 deletions .github/workflows/e2e-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ env:
PYTHON_VERSION: "3.9"
USER_NAMESPACE: "kubeflow-user-example-com"
CA_CERT_PATH: ""
AWS_ACCESS_KEY_ID: 'minio'
AWS_SECRET_ACCESS_KEY: 'minio123'
AWS_S3_BUCKET: 'mlpipeline'
MLFLOW_TRACKING_URI: "https://localhost:8080/mlflow"
MLFLOW_TRACKING_INSECURE_TLS: "true"

on:
push:
Expand Down Expand Up @@ -256,3 +261,118 @@ jobs:
python_version: ${{ env.PYTHON_VERSION }}
user_namespace: ${{ env.USER_NAMESPACE }}
report_name: "E2EMultiUserTests_K8s=${{ matrix.k8s_version }}_cacheEnabled=${{ matrix.cache_enabled }}_multiUser=${{ matrix.multi_user }}"


# MLflow end-to-end job: creates a cluster, deploys KFP and MLflow, patches the
# KFP API server with the MLflow plugin configuration, then runs the test suite
# with mlflow_enabled set to 'true'. Depends on the `build` job for image outputs.
end-to-end-critical-mlflow-tests:
runs-on: ubuntu-latest
needs: build
# The matrix expands to 4 combinations: cache_enabled (true/false) x
# artifact_storage (file/s3). Every other axis has a single value.
# fail-fast is off, so one failing combination does not cancel the others.
strategy:
matrix:
k8s_version: [ "v1.34.0" ]
cache_enabled: [ "true", "false" ]
argo_version: [ "v3.7.3" ]
proxy: [ "false" ]
test_label: [ "MLflow" ]
pod_to_pod_tls_enabled: [ "false" ]
multi_user: [ "false" ]
artifact_proxy: [ "false" ]
artifact_storage: [ "file", "s3" ]
backend_store: [ "postgres" ]
registry_store: [ "postgres" ]
fail-fast: false
name: End to End Critical Scenario MLflow Tests - K8s ${{ matrix.k8s_version }} cacheEnabled=${{ matrix.cache_enabled }} artifactStorage=${{ matrix.artifact_storage }}
steps:
- name: Checkout code
uses: actions/checkout@v5

- name: Create cluster
uses: ./.github/actions/create-cluster
id: create-cluster
with:
k8s_version: ${{ matrix.k8s_version }}
cluster_name: ${{ env.CLUSTER_NAME }}

# Deploys KFP with the images produced by the `build` job; skipped unless
# cluster creation succeeded.
- name: Deploy KFP
uses: ./.github/actions/deploy
if: ${{ steps.create-cluster.outcome == 'success' }}
id: deploy
with:
cache_enabled: ${{ matrix.cache_enabled }}
argo_version: ${{ matrix.argo_version }}
pod_to_pod_tls_enabled: ${{ matrix.pod_to_pod_tls_enabled }}
multi_user: ${{ matrix.multi_user }}
artifact_proxy: ${{ matrix.artifact_proxy }}
image_path: ${{ needs.build.outputs.IMAGE_PATH }}
image_tag: ${{ needs.build.outputs.IMAGE_TAG }}
image_registry: ${{ needs.build.outputs.IMAGE_REGISTRY }}

# External action pinned to a full commit SHA. The quay.io organisation is
# 'rhoai' when the repository owner is red-hat-data-services, otherwise
# 'opendatahub'. S3 credentials come from the workflow-level env values.
- name: Deploy MLflow
id: deploy-mlflow
uses: opendatahub-io/mlflow-operator/.github/actions/deploy@8ab07a89d6d2d6bc2ffa0c8601f4a856a4cb1b18
if: ${{ steps.create-cluster.outcome == 'success' }}
with:
namespace: 'opendatahub'
mlflow_image: quay.io/${{ github.repository_owner == 'red-hat-data-services' && 'rhoai' || 'opendatahub' }}/mlflow:odh-stable
mlflow_operator_image: quay.io/${{ github.repository_owner == 'red-hat-data-services' && 'rhoai' || 'opendatahub' }}/mlflow-operator:odh-stable
backend_store: ${{ matrix.backend_store }}
artifact_storage: ${{ matrix.artifact_storage }}
registry_store: ${{ matrix.registry_store }}
s3_access_key: ${{ env.AWS_ACCESS_KEY_ID }}
s3_secret_key: ${{ env.AWS_SECRET_ACCESS_KEY }}

# The field selector restricts the wait to pods whose phase is already Running;
# pods that have not reached Running are not part of the selection.
- name: Wait for MLflow stack readiness
shell: bash
if: ${{ steps.deploy-mlflow.outcome == 'success' }}
run: |
kubectl wait --for=condition=Ready pods --field-selector=status.phase=Running -n opendatahub --timeout=180s
echo "All running pods in opendatahub are Ready"

# Script arguments, in order: KFP namespace, MLflow namespace, path to the
# source config.json. Requires both the KFP and MLflow deployments to have succeeded.
- name: Configure MLflow Plugin and Forward Ports
shell: bash
id: configure-mlflow-plugin
if: ${{ steps.deploy.outcome == 'success' && steps.deploy-mlflow.outcome == 'success' }}
run: ./.github/resources/scripts/configure-mlflow.sh kubeflow opendatahub backend/src/apiserver/config/config.json

# Defaults come from env/matrix values; when the workflow is started through
# workflow_dispatch they are replaced by the dispatch inputs. The results are
# published as step outputs (NUMBER_OF_NODES, TEST_LABEL, NAMESPACE).
- name: Configure Input Variables
shell: bash
id: configure
if: ${{ steps.deploy.outcome == 'success' }}
run: |
NUMBER_OF_NODES=${{ env.NUMBER_OF_PARALLEL_NODES }}
TEST_LABEL=${{ matrix.test_label }}
NAMESPACE=${{ env.NAMESPACE }}
if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
NUMBER_OF_NODES=${{ inputs.number_of_parallel_tests }}
TEST_LABEL=${{ inputs.test_label }}
NAMESPACE=${{ inputs.namespace }}
fi

{
echo "NUMBER_OF_NODES=$NUMBER_OF_NODES"
echo "TEST_LABEL=$TEST_LABEL"
echo "NAMESPACE=$NAMESPACE"
} >> "$GITHUB_OUTPUT"

# Best effort: continue-on-error lets the job proceed even if the image build
# or the load into the kind cluster named "kfp" fails.
- name: Build and upload the sample Modelcar image to Kind
id: build-sample-modelcar-image
if: ${{ steps.deploy.outcome == 'success' }}
run: |
docker build -f ./test_data/sdk_compiled_pipelines/valid/critical/modelcar/Dockerfile -t registry.domain.local/modelcar:test .
kind --name kfp load docker-image registry.domain.local/modelcar:test
continue-on-error: true

# Runs only when input configuration, the MLflow deployment and the MLflow
# plugin configuration all succeeded. mlflow_enabled is passed to the
# test-and-report action as the string 'true'.
- name: Run Tests
uses: ./.github/actions/test-and-report
id: test-run
if: ${{ steps.configure.outcome == 'success' && steps.deploy-mlflow.outcome == 'success' && steps.configure-mlflow-plugin.outcome == 'success'}}
with:
cache_enabled: ${{ matrix.cache_enabled }}
test_directory: ${{ env.E2E_TESTS_DIR }}
test_label: ${{ steps.configure.outputs.TEST_LABEL }}
num_parallel_nodes: ${{ steps.configure.outputs.NUMBER_OF_NODES }}
default_namespace: ${{ steps.configure.outputs.NAMESPACE }}
python_version: ${{ env.PYTHON_VERSION }}
report_name: "MLflowTests_K8s=${{ matrix.k8s_version }}_cacheEnabled=${{ matrix.cache_enabled }}_artifactStorage=${{ matrix.artifact_storage }}"
tls_enabled: ${{ matrix.pod_to_pod_tls_enabled }}
ca_cert_path: ${{ env.CA_CERT_PATH }}
mlflow_enabled: 'true'
5 changes: 0 additions & 5 deletions .github/workflows/presubmit-backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,8 @@ on:
branches:
- master
pull_request:
branches:
- master
paths:
- 'backend/**'
- 'test/presubmit-backend-test.sh'
- '!**/*.md'
- '!**/OWNERS'

jobs:
backend-tests:
Expand Down
12 changes: 3 additions & 9 deletions backend/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,11 @@ TLS_ENABLED ?= "false"
CERT_MANAGER_VERSION ?= v1.16.2

# Container Build Params
CONTAINER_ENGINE ?= $(shell \
if command -v docker >/dev/null 2>&1; then \
echo docker; \
elif command -v podman >/dev/null 2>&1; then \
echo podman; \
fi \
)
CONTAINER_ENGINE ?= podman

# IMG_REGISTRY can be used to automatically prepend registry details. e.g. "quay.io/kubeflow/"
IMG_REGISTRY ?=
IMG_TAG_APISERVER ?= apiserver
IMG_REGISTRY ?= quay.io/rh-ee-agoins/
IMG_TAG_APISERVER ?= apiserver:1007
IMG_TAG_PERSISTENCEAGENT ?= persistence-agent
IMG_TAG_CACHESERVER ?= cache-server
IMG_TAG_SCHEDULEDWORKFLOW ?= scheduledworkflow
Expand Down
Loading
Loading