Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Google Cloud project ID that the eval tasks deploy into.
PROJECT_ID=
# Cloud Storage bucket name used by the static-site deployment task.
GCS_BUCKET=
# Region used for the deployments.
REGION=
# Artifact Registry the Cloud Run tasks are instructed to use.
ARTIFACT_REGISTRY=
# Name of the Cloud Run service the Cloud Run tasks deploy (and tear down).
CLOUD_RUN_SERVICE=

# Only set this in CI; the defaults in eval.yaml are sufficient for local testing.
GOOGLE_APPLICATION_CREDENTIALS=


1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ server

# GitHub App credentials
gha-creds-*.json

# Local environment file; see .env.example for the expected variables
.env
64 changes: 64 additions & 0 deletions Dockerfile.eval
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Stage 1: Build the MCP Server
FROM debian:sid AS builder
Comment thread
kmontg marked this conversation as resolved.

WORKDIR /app

# Packages installed for the build; the apt package lists are removed in the
# same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get install -y \
        curl \
        git \
        golang-go \
        sudo \
    && rm -rf /var/lib/apt/lists/*

# Bring in the build script and the sources it operates on.
COPY build.sh ./
COPY lib/ ./lib/
COPY cicd-mcp-server/ ./cicd-mcp-server/

# Make the script executable, then run the build. The final stage copies
# /app/cicd-mcp-server/cicd-mcp-server out of this stage.
RUN chmod +x build.sh && ./build.sh

# Stage 2: Build the eval image with dependencies
FROM node:20-slim

# Base utilities for the eval image; the apt package lists are removed in the
# same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get install -y \
        curl \
        git \
        procps \
        python3 \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# gcloud
# Download the installer to a file instead of piping it straight into bash:
# with a pipe, a failed download hands bash empty input and the step still
# "succeeds". -f makes curl fail on HTTP errors so the build stops instead.
# --disable-prompts keeps the install non-interactive, and --install-dir=/root
# produces /root/google-cloud-sdk, the directory added to PATH below.
RUN curl -fsSL https://sdk.cloud.google.com -o /tmp/install-gcloud.sh \
    && bash /tmp/install-gcloud.sh --disable-prompts --install-dir=/root \
    && rm /tmp/install-gcloud.sh
ENV PATH=${PATH}:/root/google-cloud-sdk/bin

# Terraform
# The version is a build argument (default unchanged) so it can be bumped with
# --build-arg TERRAFORM_VERSION=x.y.z instead of editing four places.
ARG TERRAFORM_VERSION=1.14.8
# TARGETARCH is populated by BuildKit for the target platform; fall back to
# amd64 (the previously hard-coded value) when it is not set.
ARG TARGETARCH
# -f makes curl fail on an HTTP error instead of saving the error page as the
# zip. Only the terraform binary is extracted, directly into /usr/local/bin,
# so no other archive members are left behind in the working directory.
RUN tf_zip="terraform_${TERRAFORM_VERSION}_linux_${TARGETARCH:-amd64}.zip" \
    && curl -fsSLO "https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/${tf_zip}" \
    && unzip "${tf_zip}" terraform -d /usr/local/bin \
    && rm "${tf_zip}"
Comment thread
kmontg marked this conversation as resolved.

# Configure cicd MCP server
# The server binary comes from the builder stage and is registered under the
# name "cicd" in the settings file written below.
COPY --from=builder /app/cicd-mcp-server/cicd-mcp-server /usr/local/bin/cicd-mcp-server
RUN mkdir -p /root/.gemini
# The heredoc delimiter is quoted ('EOF'), so no variable expansion happens at
# build time and "$GOOGLE_APPLICATION_CREDENTIALS" is written to settings.json
# literally.
# NOTE(review): presumably the consumer of settings.json resolves that
# reference at runtime — confirm.
COPY <<'EOF' /root/.gemini/settings.json
{
"mcpServers": {
"cicd": {
"command": "/usr/local/bin/cicd-mcp-server",
"timeout": 300000,
"trust": true,
"env": {
"GOOGLE_APPLICATION_CREDENTIALS": "$GOOGLE_APPLICATION_CREDENTIALS"
}
Comment thread
kmontg marked this conversation as resolved.
}
}
}
EOF

# Mark the filesystem root as a trusted folder.
# The delimiter is quoted, like the settings.json heredoc earlier in this
# file, so the JSON is always written verbatim with no build-time expansion.
COPY <<'EOF' /root/.gemini/trustedFolders.json
{
  "/": "TRUST_PARENT"
}
EOF

# Default to a plain shell; the command to run is supplied by the caller.
CMD ["sh"]
144 changes: 144 additions & 0 deletions eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

version: "1"

# Settings shared by all tasks; individual tasks below override some of them
# (for example, the Cloud Run tasks set their own timeout).
defaults:
agent: gemini
provider: docker
trials: 1 # TODO increase trials
Comment thread
kmontg marked this conversation as resolved.
# NOTE(review): the timeout unit is not stated here — presumably seconds; confirm.
timeout: 150
threshold: 0.8 # For --ci mode
grader_model: gemini-3-flash-preview # default LLM grader model
Comment thread
kmontg marked this conversation as resolved.
docker:
# NOTE(review): presumably the image built from Dockerfile.eval — confirm the tag.
base: cicd-evals:latest
env:
# Must match the container-side path of the credentials mount below.
GOOGLE_APPLICATION_CREDENTIALS: /tmp/keys/adc.json
environment:
mounts:
# Host application-default credentials, mounted read-only at the path that
# GOOGLE_APPLICATION_CREDENTIALS points to.
- ~/.config/gcloud/application_default_credentials.json:/tmp/keys/adc.json:ro
trialConfig:
# Default setup step; tasks that define their own trialConfig.setup use a
# git clone instead. NOTE(review): confirm whether task-level setup replaces
# or adds to this default.
setup: scripts/setup_gcloud.sh

tasks:

# Evals for simple static site: https://github.com/sdlc-graph/tinyjam.git
# The agent must build the example site with tinyjam and publish it to the
# bucket named by $GCS_BUCKET.
- name: deploy-static-site-gcs
instruction: |
Deploy this static site in ./tinyjam/example to a Google Cloud Storage bucket.
The site should be built by using tinyjam itself.

The project id will be available as an environment variable $PROJECT_ID.
The bucket name will be available as an environment variable $GCS_BUCKET.
The region to use will be available as an environment variable $REGION.


# Clone the site sources before the trial; remove the bucket afterwards.
trialConfig:
setup: git clone https://github.com/sdlc-graph/tinyjam.git
cleanup: scripts/teardown-gcs.sh

# Two equally weighted graders: a scripted check of the deployment and a
# check that the agent activated the google-cicd-deploy skill.
graders:
- type: deterministic
run: scripts/validate-gcs-deployment.sh
weight: 0.5
- type: tool_usage
expectedTools:
- name: activate_skill
args:
name: google-cicd-deploy
weight: 0.5

# Evals for deploying a simple Python application with buildpacks
# ("buildpacks" branch): https://github.com/sdlc-graph/chefs-companion
- name: deploy-cloud-run-buildpacks-python
instruction: |
Deploy the python application in ./chefs-companion to Cloud Run

The project id will be available as an environment variable $PROJECT_ID.
The cloud run service name will be available as an environment variable $CLOUD_RUN_SERVICE.
The artifact registry to use will be available as an environment variable $ARTIFACT_REGISTRY.
The region to use will be available as an environment variable $REGION.
Use buildpacks for building and deploying the application.

# Overrides the default timeout for this task.
timeout: 600

# Clone the buildpacks branch before the trial; delete the service afterwards.
trialConfig:
setup: |
git clone -b buildpacks https://github.com/sdlc-graph/chefs-companion.git
cleanup: scripts/teardown-cloud-run.sh

# Four equally weighted graders.
graders:
# Scripted check of the Cloud Run deployment.
- type: deterministic
run: scripts/validate-cloud-run-deployment.sh
weight: 1
# Scores 1 only when the agent did NOT add a Dockerfile to the repository.
- type: deterministic
run: |
if [ ! -f chefs-companion/Dockerfile ]; then
echo '{"score": 1, "details": "Dockerfile does not exist"}'
else
echo '{"score": 0, "details": "Dockerfile exists"}'
fi
weight: 1
# Tools the agent is expected to have called.
- type: tool_usage
expectedTools:
- name: activate_skill
args:
name: google-cicd-deploy
- name: mcp_cicd_deploy_cloudrun_service_from_source
- name: mcp_cicd_scan_code_for_secrets
weight: 1
# LLM-graded check that buildpacks were used.
- type: llm_rubric
rubric: The agent must have used buildpacks to build the application
weight: 1

# Evals for deploying a simple Python application with a custom Docker image
# ("no-dockerfile" branch, so the agent has to write the Dockerfile itself):
# https://github.com/sdlc-graph/chefs-companion
- name: deploy-cloud-run-no-dockerfile-python
  instruction: |
    Deploy the python application in ./chefs-companion to Cloud Run

    The project id will be available as an environment variable $PROJECT_ID.
    The cloud run service name will be available as an environment variable $CLOUD_RUN_SERVICE.
    The artifact registry to use will be available as an environment variable $ARTIFACT_REGISTRY.
    The region to use will be available as an environment variable $REGION.
    Build a custom docker image for the application.

  # Overrides the default timeout for this task.
  timeout: 600

  # Clone the no-dockerfile branch before the trial; delete the service afterwards.
  trialConfig:
    setup: |
      git clone -b no-dockerfile https://github.com/sdlc-graph/chefs-companion.git
    cleanup: scripts/teardown-cloud-run.sh

  # Four equally weighted graders.
  graders:
  # Scripted check of the Cloud Run deployment.
  - type: deterministic
    run: scripts/validate-cloud-run-deployment.sh
    weight: 1
  # Scores 1 only when the agent added a Dockerfile to the repository.
  - type: deterministic
    run: |
      if [ -f chefs-companion/Dockerfile ]; then
        echo '{"score": 1, "details": "Dockerfile exists"}'
      else
        echo '{"score": 0, "details": "Dockerfile does not exist"}'
      fi
    weight: 1
  # Tools the agent is expected to have called.
  - type: tool_usage
    expectedTools:
    - name: activate_skill
      args:
        name: google-cicd-deploy
    - name: mcp_cicd_deploy_cloudrun_service_from_source
    - name: mcp_cicd_scan_code_for_secrets
    weight: 1
  # LLM-graded check that a custom image was used.
  - type: llm_rubric
    rubric: The agent must have used a custom docker image to build the application
    weight: 1
29 changes: 29 additions & 0 deletions scripts/setup_gcloud.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
#
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Point gcloud at a short-lived access token derived from the
# application-default credentials. Exits non-zero if any step fails.

# Create a secure temporary file
TOKEN_FILE=$(mktemp) || {
  echo "Failed to create temporary token file" >&2
  exit 1
}

# Get the access token and write it to the file
if ! gcloud auth application-default print-access-token > "$TOKEN_FILE"; then
  echo "Failed to get access token" >&2
  rm -f "$TOKEN_FILE"
  exit 1
fi

# Set the gcloud property. Success is reported only if the property was
# actually applied; otherwise the token file is removed and the script fails.
if ! gcloud config set auth/access_token_file "$TOKEN_FILE"; then
  echo "Failed to set auth/access_token_file" >&2
  rm -f "$TOKEN_FILE"
  exit 1
fi

echo "Successfully set auth/access_token_file to $TOKEN_FILE"
43 changes: 43 additions & 0 deletions scripts/teardown-cloud-run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env bash
#
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Deletes the Cloud Run service named by $CLOUD_RUN_SERVICE in $PROJECT_ID /
# $REGION. Succeeds when the service is gone afterwards, including the case
# where it never existed (e.g. the trial failed before deploying).

# Exit immediately if a command exits with a non-zero status.
set -e

# Check for required environment variables; report every missing one on stderr.
missing=0
for var in CLOUD_RUN_SERVICE PROJECT_ID REGION; do
  # ${!var} is bash indirect expansion: the value of the variable named by $var.
  if [ -z "${!var}" ]; then
    echo "Error: $var environment variable is not set." >&2
    missing=1
  fi
done
if [ "$missing" -ne 0 ]; then
  exit 1
fi

echo "Deleting Cloud Run service $CLOUD_RUN_SERVICE in project $PROJECT_ID and region $REGION..."

# Commands used as an `if` condition do not trigger `set -e`, so a failed
# delete can be inspected instead of aborting the script.
if ! gcloud run services delete "$CLOUD_RUN_SERVICE" \
  --project="$PROJECT_ID" \
  --region="$REGION" \
  --quiet; then
  # The delete may have failed only because the service does not exist.
  # Treat it as an error only if the service can still be described.
  # NOTE(review): a describe failure for other reasons (e.g. auth) is also
  # read as "gone", the same heuristic teardown-gcs.sh uses for buckets.
  if gcloud run services describe "$CLOUD_RUN_SERVICE" \
    --project="$PROJECT_ID" \
    --region="$REGION" &> /dev/null; then
    echo "Error: Failed to delete Cloud Run service $CLOUD_RUN_SERVICE" >&2
    exit 1
  fi
  echo "Cloud Run service $CLOUD_RUN_SERVICE does not exist; nothing to delete."
  exit 0
fi

echo "Cloud Run service $CLOUD_RUN_SERVICE deletion command completed."
43 changes: 43 additions & 0 deletions scripts/teardown-gcs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env bash
#
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Deletes the bucket named by $GCS_BUCKET together with its objects.
# Exits 0 when the bucket cannot be described afterwards, 1 otherwise.

# Check if GCS_BUCKET is set
if [ -z "$GCS_BUCKET" ]; then
  echo "Error: GCS_BUCKET environment variable is not set" >&2
  exit 1
fi

# Optional --project flag, held in an array so it expands to nothing when
# PROJECT_ID is unset and to exactly one word otherwise.
PROJECT_ARGS=()
if [ -n "$PROJECT_ID" ]; then
  PROJECT_ARGS+=("--project=$PROJECT_ID")
fi

BUCKET_URL="gs://$GCS_BUCKET"

# Attempt to delete the bucket's contents.
# The URL is quoted so the ** wildcard reaches gcloud instead of being
# expanded (or word-split) by the shell.
# We ignore errors because the bucket might be empty or not exist.
gcloud storage rm -r "$BUCKET_URL/**" "${PROJECT_ARGS[@]}" &> /dev/null

# Delete the bucket
gcloud storage buckets delete "$BUCKET_URL" "${PROJECT_ARGS[@]}" --quiet &> /dev/null

# Check if the bucket still exists
if gcloud storage buckets describe "$BUCKET_URL" "${PROJECT_ARGS[@]}" &> /dev/null; then
  echo "Error: Failed to delete bucket $GCS_BUCKET" >&2
  exit 1
else
  echo "Successfully deleted bucket $GCS_BUCKET"
  exit 0
fi
Loading
Loading