diff --git a/deploy/kubeplus-chart/templates/kubeplus-components-6.yaml b/deploy/kubeplus-chart/templates/kubeplus-components-6.yaml
index dbf41215..49423b07 100644
--- a/deploy/kubeplus-chart/templates/kubeplus-components-6.yaml
+++ b/deploy/kubeplus-chart/templates/kubeplus-components-6.yaml
@@ -311,6 +311,18 @@ spec:
       requests:
         storage: 1Gi # Make sure this matches the PV size
 ---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: ollama-models-pvc
+  namespace: {{ .Release.Namespace }}
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 5Gi
+---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -349,6 +361,33 @@ spec:
         - "--secret"
         - "webhook-tls-certificates"
       containers:
+        - name: ollama-ai
+          image: ollama:latest # NOTE(review): likely needs the ai-analysis image built from plugins/ai-analysis/Dockerfile (stock hub image is ollama/ollama) -- confirm registry/tag
+          imagePullPolicy: IfNotPresent
+          env:
+            - name: MODEL_NAME
+              value: "gemma:2b"
+            - name: OLLAMA_MODELS
+              value: "/models"
+            - name: OLLAMA_MAX_LOADED_MODELS
+              value: "1"
+            - name: PORT
+              value: "8080"
+          ports:
+            - name: http
+              containerPort: 8080
+          volumeMounts:
+            - name: ollama-models
+              mountPath: /models
+          resources:
+            requests:
+              cpu: "1"
+              memory: "3Gi"
+              ephemeral-storage: "1Gi"
+            limits:
+              cpu: "2"
+              memory: "5Gi"
+              ephemeral-storage: "2Gi"
         - name: kubeconfiggenerator
           image: {{ .Values.CRD_REGISTRATION_HELPER }} #gcr.io/cloudark-kubeplus/kubeconfiggenerator:3.0.27
           imagePullPolicy: IfNotPresent
@@ -433,6 +472,9 @@ spec:
       - name: webhook-certs
         secret:
           secretName: webhook-tls-certificates
+      - name: ollama-models
+        persistentVolumeClaim:
+          claimName: ollama-models-pvc
 ---
 apiVersion: batch/v1
 kind: Job
diff --git a/plugins/ai-analysis/Dockerfile b/plugins/ai-analysis/Dockerfile
new file mode 100644
index 00000000..2bc97812
--- /dev/null
+++ b/plugins/ai-analysis/Dockerfile
@@ -0,0 +1,23 @@
+FROM ollama/ollama:latest
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    python3 python3-venv python3-pip python3-full ca-certificates curl && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:${PATH}"
+
+RUN pip install --no-cache-dir flask ollama requests
+
+WORKDIR /app
+COPY app.py /app/
+COPY entrypoint.sh /usr/local/bin/entrypoint.sh
+RUN chmod +x /usr/local/bin/entrypoint.sh
+
+ENV MODEL_NAME=llama3
+ENV PORT=8080
+
+EXPOSE 8080
+
+ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
diff --git a/plugins/ai-analysis/app.py b/plugins/ai-analysis/app.py
new file mode 100644
index 00000000..f9dd68b2
--- /dev/null
+++ b/plugins/ai-analysis/app.py
@@ -0,0 +1,61 @@
+from flask import Flask, request, jsonify
+from ollama import generate
+import os
+
+app = Flask(__name__)
+MODEL = os.getenv("MODEL_NAME", "gemma:2b")
+
+@app.get("/healthz")
+def healthz():
+    return "ok", 200
+
+@app.route("/crailogs", methods=["POST"])
+def cr_ai_logs():
+    data = request.get_json(force=True)
+    logs = data.get("logs", "")
+    prompt = """
+    You are a Kubernetes SRE copilot. Analyze the raw logs below and produce a strictly formatted JSON report.
+
+    TASKS:
+    1. Provide a 1-2 line "overall_status" summarizing the observed issues.
+    2. Detect incidents (SEV1-SEV3), each including:
+       - pods
+       - patterns
+       - sample_log
+       - likely_root_cause
+       - impact
+       - recommended_actions
+    3. Output ONLY the following JSON structure:
+
+    {
+      "overall_status": "",
+      "incidents": [
+        {
+          "pods": ["", "..."],
+          "patterns": ["", "..."],
+          "sample_log": "",
+          "likely_root_cause": "",
+          "impact": "",
+          "recommended_actions": ["", "..."]
+        }
+      ]
+    }
+
+    RULES:
+    - Consider log lines as incidents if they contain keywords: ERROR, FAIL, panic, CrashLoopBackOff, OOMKilled, exception.
+    - For each detected incident, fill the "incidents" array with all required fields.
+    - Be concise; overall_status should be 1-2 lines.
+    - Stick strictly to the JSON structure; DO NOT add any fields or text outside the JSON.
+    - Do not be too verbose. Be concise and stick exactly to the JSON format.
+
+    LOGS:
+    """.strip()
+    prompt = f"{prompt}\n{logs}"
+    try:
+        response = generate(model=MODEL, prompt=prompt)
+        return jsonify({"output": response.get("response", "")}), 200
+    except Exception as e:
+        return jsonify({"error": str(e)}), 500
+
+if __name__ == "__main__":
+    app.run(host="0.0.0.0", port=int(os.getenv("PORT", "8080")))
diff --git a/plugins/ai-analysis/entrypoint.sh b/plugins/ai-analysis/entrypoint.sh
new file mode 100644
index 00000000..4054f206
--- /dev/null
+++ b/plugins/ai-analysis/entrypoint.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+: "${MODEL_NAME:=gemma:2b}"
+: "${OLLAMA_MODELS:=/models}"
+: "${PORT:=8080}"
+
+echo "[entrypoint] starting ollama serve"
+ollama serve &
+
+echo "[entrypoint] waiting for ollama..."
+i=0; until curl -fsS http://127.0.0.1:11434/api/tags >/dev/null 2>&1; do
+  i=$((i+1)); [ $i -gt 60 ] && { echo "ollama not ready"; exit 1; }
+  sleep 1
+done
+
+echo "[entrypoint] pulling model: ${MODEL_NAME}"
+ollama pull "${MODEL_NAME}" || true
+
+sleep 2
+
+echo "[entrypoint] starting flask on :${PORT}"
+exec python3 /app/app.py
diff --git a/plugins/crailogs.py b/plugins/crailogs.py
new file mode 100644
index 00000000..4289868b
--- /dev/null
+++ b/plugins/crailogs.py
@@ -0,0 +1,77 @@
+import subprocess
+import sys
+import json
+import platform
+import requests
+import os
+from crmetrics import CRBase
+
+class CRLogs(CRBase):
+
+    def _get_container_logs(self, pod, namespace, containers, kubeconfig):
+        container_logs = []
+        for c in containers:
+            container = c['name']
+            cmd = 'kubectl logs ' + pod + ' -n ' + namespace + ' -c ' + container + ' ' + kubeconfig
+            container_logs.append("======== Pod::" + pod + "/container::" + container + " ===========")
+            try:
+                out = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE, shell=True).communicate()[0]
+                if out:
+                    container_logs.append(out.decode("utf-8", errors="ignore"))
+                container_logs.append("================================================\n\n")
+            except Exception as e:
+                container_logs.append(str(e))
+
+        return "\n".join(container_logs)
+
+    def get_logs(self, pod, namespace, kubeconfig):
+        cmd = 'kubectl get pods ' + pod + ' -n ' + namespace + ' -o json ' + kubeconfig
+        joined_logs = []
+        try:
+            out = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE, shell=True).communicate()[0]
+
+            if out:
+                json_output = json.loads(out)
+                containers = json_output['spec']['containers']
+                joined_logs.append(self._get_container_logs(pod, namespace, containers, kubeconfig))
+
+                if 'initContainers' in json_output['spec']:
+                    init_containers = json_output['spec']['initContainers']
+                    joined_logs.append(self._get_container_logs(pod, namespace, init_containers, kubeconfig))
+
+        except Exception as e:
+            joined_logs.append(str(e))
+
+        return "\n".join(joined_logs)
+
+if __name__ == '__main__':
+    crLogs = CRLogs()
+    kind = sys.argv[1]
+    instance = sys.argv[2]
+    kubeconfig = sys.argv[3]
+    resources = {}
+
+    joined_logs = []
+    pods = crLogs.get_pods_in_ns(kind, instance, kubeconfig)
+    for pod in pods:
+        pod_name = pod['Name']
+        pod_namespace = pod['Namespace']
+        joined_logs.append(crLogs.get_logs(pod_name, pod_namespace, kubeconfig))
+        joined_logs.append("---------------------------------------")
+
+    all_logs = "\n".join(joined_logs)
+    url = "http://localhost:8080/crailogs"
+    payload = {"logs": all_logs}
+
+    try:
+        response = requests.post(url, json=payload)
+        response.raise_for_status()
+        result = response.json()
+        if 'output' in result:
+            print(json.dumps(result['output'], indent=2))
+    except requests.exceptions.RequestException as e:
+        print(f"Error communicating with model service: {e}")
+    except ValueError:
+        print(f"Response was not valid JSON: {response.text}")
diff --git a/plugins/kubectl-ailogs b/plugins/kubectl-ailogs
new file mode 100755
index 00000000..e7703d45
--- /dev/null
+++ b/plugins/kubectl-ailogs
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+source utils.sh
+
+print_help () {
+  echo "NAME"
+  echo "    kubectl ailogs"
+  echo ""
+  echo "SYNOPSIS"
+  echo "    kubectl ailogs <kind> <instance> -k <absolute path to kubeconfig file>"
+  echo ""
+  echo "DESCRIPTION"
+  echo "    kubectl ailogs summarizes the state of container logs for all the containers of all the Pods that are related to the app instance."
+  exit 0
+}
+
+if (( $# < 4 )); then
+  print_help
+fi
+
+kind=$1
+instance=$2
+
+kubeconfig1="$HOME/.kube/config" # Default value
+
+shift;
+shift;
+
+while getopts ":k:" opt; do
+  case ${opt} in
+    k )
+      kubeconfig1=$OPTARG
+      if [ ! -f $kubeconfig1 ]; then
+        echo "Kubeconfig $kubeconfig1 does not exist."
+        exit 0
+      fi;;
+    ? )
+      echo "Invalid option: ${1} " 1>&2
+      print_help
+      exit 0
+      ;;
+  esac
+done
+
+kubeconfig="--kubeconfig="$kubeconfig1
+if [ $# = 4 ] && [[ $4 == *"kubeconfig="* ]]; then
+  kubeconfig=$4
+fi
+
+canonicalKind=$(get_canonical_kind $kind)
+
+if [[ $canonicalKind == *"Unknown"* ]]; then
+  echo "$canonicalKind"
+  exit 0
+fi
+
+kubeplusNamespace=`kubectl get pods -A $kubeconfig | grep kubeplus-deployment | awk '{print $1}'`
+resStatus=`kubectl $kubeconfig get $kind $instance -n $kubeplusNamespace -o json 2>&1`
+if [[ $resStatus =~ 'Error' ]]; then
+  echo $resStatus
+  exit 0
+fi
+
+python /$KUBEPLUS_HOME/plugins/crailogs.py $canonicalKind $instance $kubeconfig