From 1e32e7aac6aed4ce694d645f13b1eae2e68763b9 Mon Sep 17 00:00:00 2001 From: Sanjaykumar-2005 Date: Sat, 16 May 2026 08:35:42 +0530 Subject: [PATCH 01/14] feat(release): Helm chart distribution to Artifact Hub and multi-arch builds - Added helm-release.yml workflow for automated chart publishing. - Added artifacthub-repo.yml and Chart.yaml annotations for Artifact Hub. - Updated ci.yml for multi-arch (amd64/arm64) Docker builds. - Added Helm chart README and updated main README instructions. Signed-off-by: Sanjaykumar.M --- .github/workflows/ci.yml | 4 +++ .github/workflows/helm-release.yml | 49 +++++++++++++++++++++++++++ README.md | 8 +++-- artifacthub-repo.yml | 6 ++++ deploy/helm/kerno/Chart.yaml | 9 +++++ deploy/helm/kerno/README.md | 54 ++++++++++++++++++++++++++++++ 6 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/helm-release.yml create mode 100644 artifacthub-repo.yml create mode 100644 deploy/helm/kerno/README.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cb96740..c4f5e2d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -152,6 +152,9 @@ jobs: steps: - uses: actions/checkout@v6 + - name: Set up QEMU + uses: docker/setup-qemu-action@v4 + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v4 @@ -160,6 +163,7 @@ jobs: with: context: . 
push: false + platforms: linux/amd64,linux/arm64 tags: ghcr.io/optiqor/kerno:ci cache-from: type=gha cache-to: type=gha,mode=max diff --git a/.github/workflows/helm-release.yml b/.github/workflows/helm-release.yml new file mode 100644 index 0000000..d1119ef --- /dev/null +++ b/.github/workflows/helm-release.yml @@ -0,0 +1,49 @@ +# Copyright 2026 Optiqor contributors +# SPDX-License-Identifier: Apache-2.0 + +name: Helm Chart Release + +on: + push: + tags: + - "v*" + +permissions: + contents: write + +jobs: + release-chart: + name: Release Chart + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Configure Git + run: | + git config user.name "$GITHUB_ACTOR" + git config user.email "$GITHUB_ACTOR@users.noreply.github.com" + + - name: Install Helm + uses: azure/setup-helm@v4 + + - name: Run chart-releaser + uses: helm/chart-releaser-action@v1 + with: + charts_dir: deploy/helm + env: + CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + + # After chart-releaser finishes, it has updated gh-pages. + # We need to ensure artifacthub-repo.yml is in the root of gh-pages. 
+ - name: Push Artifact Hub metadata + run: | + git checkout gh-pages + cp .github/artifacthub-repo.yml ./artifacthub-repo.yml || cp artifacthub-repo.yml ./artifacthub-repo.yml || true + if [ -f artifacthub-repo.yml ]; then + git add artifacthub-repo.yml + git commit -m "chore: update artifacthub-repo.yml" || echo "No changes to commit" + git push origin gh-pages + fi diff --git a/README.md b/README.md index bc1ece7..186813f 100644 --- a/README.md +++ b/README.md @@ -118,12 +118,16 @@ Kerno is the only eBPF tool in the Kubernetes ecosystem that produces a ranked, ### 1 · Kubernetes (primary) ```bash -helm install kerno ./deploy/helm/kerno \ +helm repo add kerno https://optiqor.github.io/kerno +helm repo update +helm install kerno kerno/kerno \ -n kerno-system --create-namespace ``` Within 30 seconds Kerno is running as a DaemonSet on every node, watching the kernel via eBPF, exposing `/metrics` for Prometheus, and ready for `kerno doctor`. +> **Tip:** If you prefer to install from a local clone: `helm install kerno ./deploy/helm/kerno -n kerno-system --create-namespace` + ```bash # Cluster-wide incident report - 30 seconds of real kernel data kubectl -n kerno-system exec ds/kerno -- kerno doctor @@ -167,7 +171,7 @@ docker run --rm --privileged --pid=host \ ghcr.io/optiqor/kerno:latest doctor ``` -Multi-arch (`linux/amd64`, `linux/arm64`) images published to GHCR on every release. +Multi-arch (`linux/amd64`, `linux/arm64`) images published to GHCR on every release. Graviton, Apple Silicon, and Raspberry Pi clusters work out of the box. 
--- diff --git a/artifacthub-repo.yml b/artifacthub-repo.yml new file mode 100644 index 0000000..54bccf2 --- /dev/null +++ b/artifacthub-repo.yml @@ -0,0 +1,6 @@ +# Artifact Hub repository metadata +# https://artifacthub.io/docs/topics/repositories/helm-charts/ +repositoryID: 00000000-0000-0000-0000-000000000000 # Placeholder: update with real ID after manual submission +owners: + - name: Shivam Kumar + email: btwshivam@example.com # Replace with real maintainer email if known diff --git a/deploy/helm/kerno/Chart.yaml b/deploy/helm/kerno/Chart.yaml index 4e527b5..90ec66e 100644 --- a/deploy/helm/kerno/Chart.yaml +++ b/deploy/helm/kerno/Chart.yaml @@ -16,3 +16,12 @@ keywords: - kernel - prometheus - monitoring +annotations: + artifacthub.io/license: Apache-2.0 + artifacthub.io/signKey: "" + artifacthub.io/containsSecurityUpdates: "false" + artifacthub.io/prerelease: "false" + artifacthub.io/maintainers: | + - name: Shivam Kumar + github: btwshivam +icon: https://raw.githubusercontent.com/optiqor/kerno/main/demo.gif # Using demo.gif as a placeholder until a logo is available diff --git a/deploy/helm/kerno/README.md b/deploy/helm/kerno/README.md new file mode 100644 index 0000000..49409b5 --- /dev/null +++ b/deploy/helm/kerno/README.md @@ -0,0 +1,54 @@ +# Kerno Helm Chart + +Kerno is an eBPF-based kernel observability engine for Kubernetes. It diagnoses production incidents by watching kernel signals (disk, TCP, OOM, scheduler) and providing a ranked diagnostic report. + +## Prerequisites + +- Kubernetes 1.22+ +- Helm 3.8.0+ +- Linux kernel 5.8+ with BTF enabled (standard on EKS, GKE, AKS, etc.) + +## Installation + +```bash +helm repo add kerno https://optiqor.github.io/kerno +helm repo update +helm install kerno kerno/kerno -n kerno-system --create-namespace +``` + +## Configuration + +The following table lists the most common configurable parameters of the Kerno chart and their default values. 
+ +| Parameter | Description | Default | +|-----------|-------------|---------| +| `image.repository` | Image repository | `ghcr.io/optiqor/kerno` | +| `image.tag` | Image tag | `{{ .Chart.AppVersion }}` | +| `resources.requests.cpu` | CPU requests | `100m` | +| `resources.requests.memory` | Memory requests | `128Mi` | +| `prometheus.enabled` | Enable Prometheus metrics | `true` | +| `collectors.syscallLatency` | Enable syscall latency collector | `true` | +| `collectors.tcpMonitor` | Enable TCP monitor collector | `true` | +| `collectors.oomTrack` | Enable OOM tracker | `true` | +| `collectors.diskIO` | Enable Disk I/O collector | `true` | +| `collectors.schedDelay` | Enable scheduler delay collector | `true` | +| `collectors.fdTrack` | Enable file descriptor tracker | `true` | + +For a full list of parameters, see [values.yaml](values.yaml). + +## Examples + +### Enable AI Diagnosis +```bash +helm install kerno kerno/kerno \ + --set extraEnv[0].name=KERNO_AI_API_KEY \ + --set extraEnv[0].value=your-key \ + --set extraEnv[1].name=KERNO_AI_PROVIDER \ + --set extraEnv[1].value=anthropic +``` + +## Version Compatibility Matrix + +| Kerno Version | K8s Version | Kernel Version | +|---------------|-------------|----------------| +| v0.1.x | 1.22 - 1.31 | 5.8+ | From b8402a86376bdd00f117c53d6b0f4ba167353ab8 Mon Sep 17 00:00:00 2001 From: btwshivam Date: Sun, 17 May 2026 02:50:52 +0530 Subject: [PATCH 02/14] feat(github): welcome first-time commenters with /assign nudge and star links Signed-off-by: btwshivam --- .github/workflows/welcome-commenter.yml | 52 +++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 .github/workflows/welcome-commenter.yml diff --git a/.github/workflows/welcome-commenter.yml b/.github/workflows/welcome-commenter.yml new file mode 100644 index 0000000..7f4d715 --- /dev/null +++ b/.github/workflows/welcome-commenter.yml @@ -0,0 +1,52 @@ +# Copyright 2026 Optiqor contributors +# SPDX-License-Identifier: Apache-2.0 +# +# 
Welcomes a contributor on their first issue comment. Sister to +# welcome.yml, which only fires on issue/PR open. + +name: Welcome first-time commenters + +on: + issue_comment: + types: [created] + +permissions: + issues: write + +jobs: + welcome: + if: | + github.event.comment.user.type != 'Bot' && + github.event.comment.user.login != github.repository_owner && + github.event.comment.user.login != github.event.issue.user.login && + !github.event.issue.pull_request + runs-on: ubuntu-latest + steps: + - uses: actions/github-script@v9 + with: + script: | + const repo = { owner: context.repo.owner, repo: context.repo.repo }; + const commenter = context.payload.comment.user.login; + const issueNumber = context.payload.issue.number; + + // Skip if the user has already commented on a different issue. + const res = await github.rest.search.issuesAndPullRequests({ + q: `repo:${repo.owner}/${repo.repo} commenter:${commenter} is:issue`, + per_page: 10, + }); + if ((res.data.items || []).some(i => i.number !== issueNumber)) { + core.info(`@${commenter} is not a first-time commenter; skipping`); + return; + } + + const body = + `Hey @${commenter}, welcome. Looks like this is your first comment in the repo.\n\n` + + `Want to work on this issue? 
Reply with \`/assign\` or \`/take\` to claim it.\n\n` + + `If the work's been useful, two quick ways to help:\n\n` + + `⭐ [Star kerno](https://github.com/optiqor/kerno): eBPF kernel diagnosis engine\n` + + `⭐ [Star optiqor-cli](https://github.com/optiqor/optiqor-cli): Kubernetes cost remediation that lives in the pull request\n\n` + + `Thanks for showing up.`; + + await github.rest.issues.createComment({ + ...repo, issue_number: issueNumber, body, + }); From eb7ed7f6e8e30b921854cbef30ef3fef22e2733f Mon Sep 17 00:00:00 2001 From: btwshivam Date: Sun, 17 May 2026 03:04:24 +0530 Subject: [PATCH 03/14] feat(github): GSSoC auto-labels for difficulty and mentor attribution Signed-off-by: btwshivam --- .github/workflows/gssoc-difficulty.yml | 140 +++++++++++++++++++++++++ .github/workflows/gssoc-mentor.yml | 65 ++++++++++++ 2 files changed, 205 insertions(+) create mode 100644 .github/workflows/gssoc-difficulty.yml create mode 100644 .github/workflows/gssoc-mentor.yml diff --git a/.github/workflows/gssoc-difficulty.yml b/.github/workflows/gssoc-difficulty.yml new file mode 100644 index 0000000..df8071f --- /dev/null +++ b/.github/workflows/gssoc-difficulty.yml @@ -0,0 +1,140 @@ +# Copyright 2026 Optiqor contributors +# SPDX-License-Identifier: Apache-2.0 +# +# Auto-applies a level:* difficulty label to every PR. One label per PR. +# Re-runs on synchronize, so a PR that grows out of its initial bucket +# gets re-classified on each push. +# +# Classification (strict; checked top-to-bottom, first match wins): +# +# level:critical — touches BPF C, AI, daemon entry, release tooling, +# install script, Dockerfile, security workflows, +# or K8s RBAC / security context. Size-independent. +# +# level:beginner — ONLY trivial paths (docs, templates, fixtures, +# .gitignore) AND ≤ 5 files. Any size in that scope. +# A README rewrite is beginner. A typo is beginner. 
+# +# level:advanced — 400+ total lines OR 9+ files; OR a code path +# (internal/, pkg/, cmd/) with 100+ lines or 5+ files. +# +# level:intermediate — everything else. Default bucket for code work +# under 100 lines, and for non-code config tweaks +# that aren't trivial docs. +# +# If you need to override, comment with /level — not wired yet +# but the override path is reserved. + +name: GSSoC difficulty label + +on: + pull_request_target: + types: [opened, synchronize, reopened] + +permissions: + contents: read + pull-requests: write + +jobs: + classify: + runs-on: ubuntu-latest + steps: + - uses: actions/github-script@v9 + with: + script: | + const repo = { owner: context.repo.owner, repo: context.repo.repo }; + const pr = context.payload.pull_request; + const prNumber = pr.number; + + const files = await github.paginate(github.rest.pulls.listFiles, { + ...repo, pull_number: prNumber, per_page: 100, + }); + const paths = files.map(f => f.filename); + + // ── Critical surface (size-independent) ──────────────────── + // + // A bug here can cause kernel-verifier rejection, capability + // widening, secret leakage, or a bad release artifact. + const criticalPatterns = [ + /^internal\/bpf\/c\//, // BPF C source — verifier risk + /^internal\/bpf\/.*loader\.go$/, // BPF program loaders + /^internal\/ai\//, // privacy + API key handling + /^internal\/cli\/start\.go$/, // daemon entry + capabilities + /^Dockerfile/, // production image + /^scripts\/install\.sh$/, // root install + supply chain + /^SECURITY\.md$/, + /^\.goreleaser/, + /^\.github\/workflows\/(release|codeql|cosign|security)/, + /^deploy\/.*(rbac|security[Cc]ontext|securitycontext)/, + ]; + + // ── Code paths (intermediate floor) ──────────────────────── + // + // Any non-trivial change in these paths is at least intermediate. + // Touching the doctor engine, a collector, or a CLI command + // requires understanding the codebase — not beginner work. 
+ const codePathPatterns = [ + /^internal\//, + /^pkg\//, + /^cmd\//, + ]; + + // ── Trivial paths (beginner ceiling) ──────────────────────── + // + // PRs that touch ONLY these paths qualify for beginner regardless + // of line count. Docs, config, fixtures, issue/PR templates. + const trivialPatterns = [ + /\.md$/, + /\.gitignore$/, + /\.editorconfig$/, + /^docs\//, + /^testdata\//, + /^demo\./, + /^\.github\/ISSUE_TEMPLATE\//, + /^\.github\/PULL_REQUEST_TEMPLATE\.md$/, + /^\.github\/mlc-config\.json$/, + /^\.github\/labeler\.yml$/, + /^\.github\/dependabot\.yml$/, + /^LICENSE$/, + /^CODE_OF_CONDUCT\.md$/, + /^GOVERNANCE\.md$/, + ]; + + const lines = (pr.additions || 0) + (pr.deletions || 0); + const fileCount = paths.length; + const touchesCritical = paths.some(p => criticalPatterns.some(re => re.test(p))); + const touchesCode = paths.some(p => codePathPatterns.some(re => re.test(p))); + const allTrivial = paths.length > 0 && paths.every(p => trivialPatterns.some(re => re.test(p))); + + let level; + if (touchesCritical) { + level = 'level:critical'; + } else if (allTrivial && fileCount <= 5) { + level = 'level:beginner'; + } else if (lines >= 400 || fileCount >= 9) { + level = 'level:advanced'; + } else if (touchesCode && (lines >= 100 || fileCount >= 5)) { + level = 'level:advanced'; + } else if (touchesCode) { + level = 'level:intermediate'; + } else { + level = 'level:intermediate'; + } + + core.info( + `PR #${prNumber}: ${lines} lines, ${fileCount} files, ` + + `critical=${touchesCritical}, code=${touchesCode}, allTrivial=${allTrivial} → ${level}` + ); + + // Strip any other level:* label, then apply the chosen one. 
+ const currentLabels = (pr.labels || []).map(l => l.name); + for (const old of currentLabels) { + if (old.startsWith('level:') && old !== level) { + await github.rest.issues.removeLabel({ + ...repo, issue_number: prNumber, name: old, + }).catch(() => {}); + } + } + await github.rest.issues.addLabels({ + ...repo, issue_number: prNumber, labels: [level], + }); diff --git a/.github/workflows/gssoc-mentor.yml b/.github/workflows/gssoc-mentor.yml new file mode 100644 index 0000000..ff688f9 --- /dev/null +++ b/.github/workflows/gssoc-mentor.yml @@ -0,0 +1,65 @@ +# Copyright 2026 Optiqor contributors +# SPDX-License-Identifier: Apache-2.0 +# +# When a maintainer (repo collaborator with write+) submits an APPROVED +# review, apply `mentor:<reviewer-login>` to the PR. The scoring engine +# uses this label to credit the reviewing mentor with points. +# +# Multiple maintainers approving = multiple mentor:* labels. That is +# intentional: every reviewer who signed off gets credit. + +name: GSSoC mentor attribution + +on: + pull_request_review: + types: [submitted] + +permissions: + contents: read + pull-requests: write + +jobs: + attribute: + if: github.event.review.state == 'approved' + runs-on: ubuntu-latest + steps: + - uses: actions/github-script@v9 + with: + script: | + const repo = { owner: context.repo.owner, repo: context.repo.repo }; + const reviewer = context.payload.review.user.login; + const prNumber = context.payload.pull_request.number; + + // Only collaborators with write+ count as mentors. 
+ let isMaintainer = false; + try { + const perm = await github.rest.repos.getCollaboratorPermissionLevel({ + ...repo, username: reviewer, + }); + const lvl = perm.data.permission; + isMaintainer = (lvl === 'admin' || lvl === 'write' || lvl === 'maintain'); + } catch (e) { + core.info(`@${reviewer} not a collaborator; skipping mentor attribution`); + return; + } + if (!isMaintainer) { + core.info(`@${reviewer} lacks write+; skipping`); + return; + } + + const label = `mentor:${reviewer}`; + + // Create the label on the fly so we don't need to pre-register + // every maintainer's username. Pastel gray so they don't shout. + await github.rest.issues.createLabel({ + ...repo, + name: label, + color: 'C5C5C5', + description: `Reviewed and approved by @${reviewer}`, + }).catch(() => {}); // label may already exist + + await github.rest.issues.addLabels({ + ...repo, issue_number: prNumber, labels: [label], + }); + + core.info(`applied ${label} to PR #${prNumber}`); From 161c902adb5fda3b2b5a581d8f05e5d08feff838 Mon Sep 17 00:00:00 2001 From: btwshivam Date: Sun, 17 May 2026 03:22:21 +0530 Subject: [PATCH 04/14] fix(github): grant issues:write to GSSoC label workflows Signed-off-by: btwshivam --- .github/workflows/gssoc-difficulty.yml | 1 + .github/workflows/gssoc-mentor.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/gssoc-difficulty.yml b/.github/workflows/gssoc-difficulty.yml index df8071f..954852d 100644 --- a/.github/workflows/gssoc-difficulty.yml +++ b/.github/workflows/gssoc-difficulty.yml @@ -33,6 +33,7 @@ on: permissions: contents: read + issues: write # /issues/:n/labels endpoint is under the issues resource pull-requests: write jobs: diff --git a/.github/workflows/gssoc-mentor.yml b/.github/workflows/gssoc-mentor.yml index ff688f9..1e046ce 100644 --- a/.github/workflows/gssoc-mentor.yml +++ b/.github/workflows/gssoc-mentor.yml @@ -16,6 +16,7 @@ on: permissions: contents: read + issues: write # /repos/:o/:r/labels and /issues/:n/labels 
live under the issues resource pull-requests: write jobs: From bdfe9a6091e8e860c8ddf290d12f9edc7cddd71f Mon Sep 17 00:00:00 2001 From: Shine Srivastava Date: Sun, 17 May 2026 03:43:30 +0530 Subject: [PATCH 05/14] feat(cli): add --no-banner flag to suppress ASCII header (#71) Adds a boolean flag to the doctor command to hide the ASCII art logo and metadata. This improves readability in CI/CD logs and automated environments. Includes updated PrettyRenderer logic and new unit tests. Signed-off-by: Shine Srivastava --- internal/cli/doctor.go | 7 ++++++- internal/cli/doctor_flags_test.go | 3 ++- internal/doctor/render.go | 9 +++++++-- internal/doctor/render_test.go | 21 +++++++++++++++++++++ 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/internal/cli/doctor.go b/internal/cli/doctor.go index 74ce50e..8a0e3c6 100644 --- a/internal/cli/doctor.go +++ b/internal/cli/doctor.go @@ -32,6 +32,7 @@ func newDoctorCmd() *cobra.Command { useAI bool noAI bool quiet bool + noBanner bool ) cmd := &cobra.Command{ @@ -80,6 +81,7 @@ Add --ai to enrich findings with AI-powered analysis (requires API key).`, output: output, aiEnabled: aiEnabled, quiet: quiet, + noBanner: noBanner, }) }, } @@ -93,6 +95,7 @@ Add --ai to enrich findings with AI-powered analysis (requires API key).`, flags.BoolVar(&useAI, "ai", false, "enable AI-powered analysis (requires API key)") flags.BoolVar(&noAI, "no-ai", false, "disable AI analysis even if enabled in config") flags.BoolVarP(&quiet, "quiet", "q", false, "only emit critical/warning findings (CI-friendly)") + flags.BoolVar(&noBanner, "no-banner", false, "suppress the ASCII banner block") return cmd } @@ -105,6 +108,7 @@ type doctorOpts struct { output string aiEnabled bool quiet bool + noBanner bool } func runDoctor(ctx context.Context, opts doctorOpts) error { @@ -146,7 +150,8 @@ func runDoctor(ctx context.Context, opts doctorOpts) error { renderer = &doctor.JSONRenderer{Pretty: true} default: renderer = &doctor.PrettyRenderer{ - NoColor: 
os.Getenv("NO_COLOR") != "" || !isTerminal(), + NoColor: os.Getenv("NO_COLOR") != "" || !isTerminal(), + NoBanner: opts.noBanner, } } diff --git a/internal/cli/doctor_flags_test.go b/internal/cli/doctor_flags_test.go index 5b556cf..fe5dd36 100644 --- a/internal/cli/doctor_flags_test.go +++ b/internal/cli/doctor_flags_test.go @@ -13,7 +13,7 @@ import ( func TestNewDoctorCmd_Flags(t *testing.T) { cmd := newDoctorCmd() - wantFlags := []string{"duration", "exit-code", "continuous", "interval", "output", "ai", "no-ai"} + wantFlags := []string{"duration", "exit-code", "continuous", "interval", "output", "ai", "no-ai", "no-banner"} for _, name := range wantFlags { if cmd.Flags().Lookup(name) == nil { t.Errorf("doctor cmd missing --%s flag", name) @@ -43,6 +43,7 @@ func TestNewDoctorCmd_Defaults(t *testing.T) { {"continuous", "false"}, {"ai", "false"}, {"no-ai", "false"}, + {"no-banner", "false"}, } for _, c := range cases { f := cmd.Flags().Lookup(c.flag) diff --git a/internal/doctor/render.go b/internal/doctor/render.go index afdb62b..5dcc394 100644 --- a/internal/doctor/render.go +++ b/internal/doctor/render.go @@ -22,7 +22,8 @@ type Renderer interface { // PrettyRenderer outputs a human-readable incident report with ANSI colors, // box-drawn finding cards, and bar-chart signal visualizations. 
type PrettyRenderer struct { - NoColor bool + NoColor bool + NoBanner bool } const ( @@ -67,7 +68,11 @@ func newPalette(noColor bool) palette { func (r *PrettyRenderer) Render(w io.Writer, report *Report) error { p := newPalette(r.NoColor) - r.renderHeader(w, report, p) + + if !r.NoBanner { + r.renderHeader(w, report, p) + } + r.renderDegradation(w, report, p) r.renderTriage(w, report, p) for i := range report.Findings { diff --git a/internal/doctor/render_test.go b/internal/doctor/render_test.go index 9d7678a..3140ed2 100644 --- a/internal/doctor/render_test.go +++ b/internal/doctor/render_test.go @@ -84,6 +84,27 @@ func TestPrettyRenderer_ContainsHeader(t *testing.T) { } } +func TestPrettyRenderer_NoBanner(t *testing.T) { + var buf bytes.Buffer + // Set NoBanner to true + r := &PrettyRenderer{NoColor: true, NoBanner: true} + report := sampleReport() + + if err := r.Render(&buf, report); err != nil { + t.Fatalf("Render failed: %v", err) + } + + output := buf.String() + + if strings.Contains(output, "KERNO DOCTOR") { + t.Error("pretty output should NOT contain KERNO DOCTOR banner when NoBanner is true") + } + + if !strings.Contains(output, "FINDINGS") { + t.Error("pretty output should still contain FINDINGS") + } +} + func TestPrettyRenderer_HealthySystem(t *testing.T) { var buf bytes.Buffer r := &PrettyRenderer{NoColor: true} From 7a08799a1e2ffed81874c6969d042d33d8044ed6 Mon Sep 17 00:00:00 2001 From: Sanjaykumar-2005 Date: Sun, 17 May 2026 08:03:44 +0530 Subject: [PATCH 06/14] feat: implement Helm chart packaging and automated release workflow via GitHub Actions --- .github/workflows/helm-release.yml | 11 +- README.md | 1326 ++++++++++++++-------------- artifacthub-repo.yml | 6 +- deploy/helm/kerno/Chart.yaml | 50 +- 4 files changed, 693 insertions(+), 700 deletions(-) diff --git a/.github/workflows/helm-release.yml b/.github/workflows/helm-release.yml index d1119ef..91b973b 100644 --- a/.github/workflows/helm-release.yml +++ b/.github/workflows/helm-release.yml 
@@ -40,10 +40,9 @@ jobs: # We need to ensure artifacthub-repo.yml is in the root of gh-pages. - name: Push Artifact Hub metadata run: | + cp artifacthub-repo.yml /tmp/ahr.yml git checkout gh-pages - cp .github/artifacthub-repo.yml ./artifacthub-repo.yml || cp artifacthub-repo.yml ./artifacthub-repo.yml || true - if [ -f artifacthub-repo.yml ]; then - git add artifacthub-repo.yml - git commit -m "chore: update artifacthub-repo.yml" || echo "No changes to commit" - git push origin gh-pages - fi + cp /tmp/ahr.yml ./artifacthub-repo.yml + git add artifacthub-repo.yml + git commit -m "chore: update artifacthub-repo.yml" || echo "No changes to commit" + git push origin gh-pages diff --git a/README.md b/README.md index 186813f..c0e5f82 100644 --- a/README.md +++ b/README.md @@ -1,663 +1,663 @@ -
- -# KERNO - -### The production incident diagnosis engine for Kubernetes - -**Your cluster broke. Your dashboards are green. Users are paging.** -**Run `kerno doctor`. 30 seconds. Root cause. Plain English.** - -Same single binary runs on bare metal, VMs, EC2, GCE - wherever Linux lives. - -[![CI](https://github.com/optiqor/kerno/actions/workflows/ci.yml/badge.svg)](https://github.com/optiqor/kerno/actions/workflows/ci.yml) -[![Go Report Card](https://goreportcard.com/badge/github.com/optiqor/kerno)](https://goreportcard.com/report/github.com/optiqor/kerno) -[![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](LICENSE) -[![Release](https://img.shields.io/github/v/release/optiqor/kerno?include_prereleases)](https://github.com/optiqor/kerno/releases) -[![GHCR](https://img.shields.io/badge/ghcr.io-optiqor%2Fkerno-blue?logo=docker)](https://github.com/optiqor/kerno/pkgs/container/kerno) -![Go Version](https://img.shields.io/github/go-mod/go-version/optiqor/kerno) - -[**Quick Start**](#quick-start) · [**How It Works**](#how-it-works) · [**Features**](#features) · [**Kubernetes**](#kubernetes-deployment) · [**Docs**](docs/architecture.md) - -kerno doctor demo - -
- ---- - -## What is Kerno? - -Kerno is a **Kubernetes-native incident diagnosis engine** built on eBPF. -It runs as a DaemonSet on every node, watches the kernel - not your app - and answers a single question on demand: - -> *Why is production broken right now?* - -```bash -kubectl -n kerno-system exec ds/kerno -- kerno doctor -``` - -30 seconds later you get a ranked diagnostic report with **plain-English causes, evidence, ETAs, and copy-paste fix steps** - no dashboards to wire, no query language to learn, no agents in your app. - -The kernel knows minutes before your APM. Hours before your users. Kerno makes that visible. - -**Same binary outside Kubernetes too.** `curl | bash` it onto any bare-metal box, EC2 instance, or systemd VM and `sudo kerno doctor` works exactly the same. - -## Why Kerno? - -It's 3am. PagerDuty fires. Latency is up, error budget is burning, and every dashboard you own is **green**. - -- Prometheus says CPU and memory look fine. -- Datadog APM says your app is healthy. -- The Grafana panels your SRE spent a weekend building - all green. - -**That's because every tool you have watches your _application_. Nothing is watching the kernel.** - -```mermaid -flowchart TB - subgraph Stack["YOUR K8S STACK"] - App["Workload Pods
(Node, Python, Go, Java)"] - Runtime["Container Runtime
(containerd, CRI-O)"] - OS["Node Kernel
(Linux)"] - HW["Nodes / EC2 / GCE"] - end - - subgraph Tools["WHO WATCHES WHAT"] - APM["Datadog · New Relic
Prometheus · Grafana"] - CRun["Pixie · Tetragon
Inspektor Gadget"] - Kerno["KERNO
eBPF kernel tracing"] - Bare["(nobody)"] - end - - App -.watched by.-> APM - Runtime -.watched by.-> CRun - OS -.watched by.-> Kerno - HW -.watched by.-> Bare - - style Kerno fill:#e94560,stroke:#fff,color:#fff,stroke-width:3px - style App fill:#0f3460,stroke:#16213e,color:#fff - style Runtime fill:#16213e,stroke:#533483,color:#fff - style OS fill:#1a1a2e,stroke:#e94560,color:#fff - style HW fill:#533483,stroke:#16213e,color:#fff - style APM fill:#16213e,stroke:#0f3460,color:#ccc - style CRun fill:#16213e,stroke:#0f3460,color:#ccc - style Bare fill:#16213e,stroke:#0f3460,color:#888 -``` - -The kernel is where the pain actually lives - disk throttling, TCP retransmits, OOM kills, scheduler contention, FD leaks. The kernel knows minutes before your dashboards. Hours before your users. - -Kerno runs as a DaemonSet on every node, streams kernel signals through eBPF with microsecond overhead, and turns them into a diagnostic report that reads like a doctor's note. - -```bash -kubectl -n kerno-system exec ds/kerno -- kerno doctor -``` - -One command. 30 seconds later, you get the report shown in the [demo above](#kerno) - ranked findings, plain-English causes, evidence, and copy-paste fix steps. - -That's the entire debugging loop - from page to root cause - in a single command. 
- ---- - -## How Kerno compares - -| | Watches | K8s-Native | Incident Report | SLO Mapping | AI Analysis | Install Time | -|---|:---:|:---:|:---:|:---:|:---:|:---:| -| Prometheus + Grafana | Application | Partial | No | No | No | Hours | -| Datadog APM | Application | Partial | No | Partial | Yes | Hours | -| Cilium Tetragon | Security | **Yes** | No | No | No | Minutes | -| Inspektor Gadget | Container | **Yes** | No | No | No | Minutes | -| Pixie | Application | **Yes** | No | No | No | Minutes | -| **Kerno** | **Kernel** | **Yes** | **Yes** | **Yes** | **Yes** | **< 1 min** | - -Kerno is the only eBPF tool in the Kubernetes ecosystem that produces a ranked, human-readable **incident report** - not a firehose of events, not another dashboard, not a query language to learn. - ---- - -## Quick Start - -> **Requires:** kernel **5.8+** with BTF (every major managed K8s qualifies: EKS, GKE, AKS, DOKS, Linode, Civo). For raw manifests/Helm you'll need cluster-admin. - -### 1 · Kubernetes (primary) - -```bash -helm repo add kerno https://optiqor.github.io/kerno -helm repo update -helm install kerno kerno/kerno \ - -n kerno-system --create-namespace -``` - -Within 30 seconds Kerno is running as a DaemonSet on every node, watching the kernel via eBPF, exposing `/metrics` for Prometheus, and ready for `kerno doctor`. - -> **Tip:** If you prefer to install from a local clone: `helm install kerno ./deploy/helm/kerno -n kerno-system --create-namespace` - -```bash -# Cluster-wide incident report - 30 seconds of real kernel data -kubectl -n kerno-system exec ds/kerno -- kerno doctor - -# CI-friendly: machine-readable JSON, exits non-zero on critical findings -kubectl -n kerno-system exec ds/kerno -- kerno doctor --output json --exit-code - -# AI-enriched root cause analysis (set the API key once) -kubectl -n kerno-system set env ds/kerno KERNO_AI_API_KEY=sk-... 
-kubectl -n kerno-system exec ds/kerno -- kerno doctor --ai -``` - -ServiceMonitor for the Prometheus Operator is built-in. Raw manifests live at [`deploy/k8s/`](deploy/k8s/) if you don't use Helm. - ---- - -### 2 · Bare metal · VMs · EC2 · GCE - -The same binary, the same command. No Kubernetes required. - -```bash -curl -sfL https://raw.githubusercontent.com/optiqor/kerno/main/scripts/install.sh | sudo bash -sudo kerno doctor -``` - -Long-lived systemd service with `/metrics` for Prometheus: - -```bash -curl -sfL https://raw.githubusercontent.com/optiqor/kerno/main/scripts/install.sh | sudo bash -s -- --daemon -journalctl -u kerno -f -``` - -### 3 · Docker (ad-hoc, any host with a privileged daemon) - -```bash -docker run --rm --privileged --pid=host \ - -v /sys/kernel/debug:/sys/kernel/debug:ro \ - -v /sys/kernel/btf:/sys/kernel/btf:ro \ - -v /sys/fs/bpf:/sys/fs/bpf \ - -v /proc:/proc:ro \ - ghcr.io/optiqor/kerno:latest doctor -``` - -Multi-arch (`linux/amd64`, `linux/arm64`) images published to GHCR on every release. Graviton, Apple Silicon, and Raspberry Pi clusters work out of the box. - ---- - -## Kubernetes Deployment - -Kerno is designed from day one to run as a Kubernetes DaemonSet. One pod per node, one eBPF agent per kernel, zero API server load. - -```mermaid -flowchart TB - subgraph Cluster["Kubernetes Cluster"] - direction TB - subgraph Node1["Worker Node 1"] - K1["Kerno Pod
DaemonSet"] - W1["Workload Pods"] - end - subgraph Node2["Worker Node 2"] - K2["Kerno Pod
DaemonSet"] - W2["Workload Pods"] - end - subgraph Node3["Worker Node N"] - K3["Kerno Pod
DaemonSet"] - W3["Workload Pods"] - end - end - - K1 -->|:9090/metrics| Prom["Prometheus"] - K2 -->|:9090/metrics| Prom - K3 -->|:9090/metrics| Prom - Prom --> GF["Grafana"] - - K1 -.enriches.-> W1 - K2 -.enriches.-> W2 - K3 -.enriches.-> W3 - - style K1 fill:#e94560,stroke:#fff,color:#fff - style K2 fill:#e94560,stroke:#fff,color:#fff - style K3 fill:#e94560,stroke:#fff,color:#fff - style Prom fill:#0f3460,stroke:#fff,color:#fff - style GF fill:#16213e,stroke:#fff,color:#fff - style W1 fill:#533483,stroke:#fff,color:#fff - style W2 fill:#533483,stroke:#fff,color:#fff - style W3 fill:#533483,stroke:#fff,color:#fff -``` - -### Pod enrichment - no API server load - -Kerno tags every finding with pod, namespace, node, and workload labels. No `client-go` informers, no watch connections - Kerno reads `/var/lib/kubelet/pods` directly, so even a failing API server doesn't blind the agent. Exactly when you need it most. - -### Host mounts - the minimum necessary - -| Mount | Why | -|---|---| -| `/sys/kernel/debug` | tracepoints, kprobes | -| `/sys/kernel/btf` | CO-RE type resolution | -| `/sys/fs/bpf` | BPF map pinning | -| `/proc` | PID → cgroup → pod resolution | -| `/sys/fs/cgroup` | container resource accounting | -| `/sys/class/net` | per-interface TCP counters | -| `/sys/block` | per-device disk stats | - -### Security posture - -- Runs with the **minimum capabilities needed** - `CAP_BPF`, `CAP_PERFMON`, `CAP_SYS_PTRACE`, `CAP_NET_ADMIN`, `CAP_DAC_READ_SEARCH` (not `CAP_SYS_ADMIN` for the hot path). -- Read-only root filesystem, `ProtectSystem=strict` via systemd on bare metal. -- No outbound network calls. AI integration is opt-in and goes through your configured provider only. 
- -### Helm values - -```yaml -image: - repository: ghcr.io/optiqor/kerno - tag: v0.1.0 - -resources: - requests: { cpu: 100m, memory: 128Mi } - limits: { cpu: "1", memory: 512Mi } - -prometheus: - enabled: true - port: 9090 - -serviceMonitor: # Prometheus Operator - enabled: true - interval: 15s - -nodeSelector: - monitoring: "true" -``` - -### Verify - -```bash -kubectl -n kerno-system get ds kerno -kubectl -n kerno-system logs -l app.kubernetes.io/name=kerno -kubectl -n kerno-system exec ds/kerno -- kerno doctor -``` - ---- - -## Features - - - - - - -
- -### Incident Diagnosis - -- **`kerno doctor`** - 30-second cluster-wide diagnostic, ranked findings, fix suggestions -- **`kerno explain`** - AI-powered kernel error explanation (no root needed) -- **`kerno predict`** - surface failures before they page you - -### Real-Time Tracing - -- **`kerno trace syscall`** - per-pod syscall latency streaming -- **`kerno trace disk`** - block I/O latency by device, op, process -- **`kerno trace sched`** - CPU scheduler run queue delays - - - -### Continuous Monitoring - -- **`kerno watch tcp`** - TCP connections, RTT, retransmits -- **`kerno watch oom`** - OOM kill alerts with pod context -- **`kerno watch fd`** - FD leak detection via growth rate -- **`kerno start`** - daemon mode with Prometheus metrics - -### Integrations - -- **Prometheus** - 16 metrics at `/metrics`, ServiceMonitor support -- **Kubernetes** - Helm chart + pod enrichment (no API server load) -- **AI Providers** - Anthropic, OpenAI, Ollama (optional, opt-in) -- **Systemd** - unit/slice enrichment on bare metal - -
- ---- - -## How It Works - -Kerno runs as a lightweight Go agent with six tiny eBPF programs attached to stable tracepoints. When `kerno doctor` runs, it collects 30 seconds of real kernel data, evaluates 11 diagnostic rules deterministically, and emits a ranked incident report. No sampling. No guesswork. No query language. - -### Architecture - -```mermaid -flowchart TB - subgraph Kernel["KERNEL SPACE · eBPF Programs"] - direction LR - P1["syscall
latency"] - P2["tcp
monitor"] - P3["oom
track"] - P4["disk
io"] - P5["sched
delay"] - P6["fd
track"] - end - - RB[("Ring Buffers
256KB per program
zero-copy mmap")] - - subgraph UserSpace["USER SPACE · Go"] - direction TB - Loader["BPF Loaders
cilium/ebpf"] - Collector["Collectors
percentile aggregation"] - Signals[("Signals Snapshot
single source of truth")] - Adapter["Environment Adapter
k8s · systemd · bare metal"] - end - - subgraph Outputs["OUTPUTS"] - direction TB - Doctor["Doctor Engine
11 diagnostic rules"] - AI["AI Layer (optional)
root cause analysis"] - Prom["Prometheus
/metrics :9090"] - CLI["Terminal
pretty · JSON"] - end - - P1 & P2 & P3 & P4 & P5 & P6 --> RB - RB --> Loader - Loader --> Collector - Collector --> Signals - Adapter -.enriches.-> Signals - Signals --> Doctor - Signals --> Prom - Doctor --> AI - AI --> CLI - Doctor --> CLI - - classDef kernel fill:#1a1a2e,stroke:#e94560,color:#fff,stroke-width:2px - classDef user fill:#0f3460,stroke:#16213e,color:#fff,stroke-width:2px - classDef output fill:#16213e,stroke:#533483,color:#fff,stroke-width:2px - classDef buffer fill:#533483,stroke:#e94560,color:#fff,stroke-width:3px - classDef ai fill:#e94560,stroke:#fff,color:#fff,stroke-width:2px - - class P1,P2,P3,P4,P5,P6 kernel - class Loader,Collector,Signals,Adapter user - class Doctor,Prom,CLI output - class RB buffer - class AI ai -``` - -### Core principles - -1. **Deterministic first.** The rule engine is pure Go, testable, and runs whether AI is on or off. Every finding has a clear cause, threshold, and fix. -2. **Zero-copy hot path.** Kernel events land in eBPF ring buffers and are drained via `mmap` - microsecond overhead, no serialization cost. -3. **No API server load.** Pod enrichment reads the kubelet's local pod manifests. The agent survives API server outages - the moment you need it most. -4. **AI is a post-processor.** Optional. Opt-in. Never touches the hot path. The deterministic engine always runs; AI enriches, it never replaces. -5. **Graceful degradation.** If an eBPF program fails to load on a weird kernel, that collector is skipped with a clear warning. The rest keep working. - -### Data flow - -```mermaid -sequenceDiagram - participant K as Kernel
(eBPF) - participant R as Ring Buffer - participant C as Collectors - participant D as Doctor Engine - participant A as AI Layer - participant U as On-call Engineer - - K->>R: syscall/tcp/oom/io events - Note over K,R: Zero-copy, microsecond overhead - R->>C: drain events - C->>C: aggregate into p50/p95/p99 - C->>D: Signals snapshot - D->>D: evaluate 11 rules - alt AI enabled - D->>A: findings + signals - A->>A: correlate + explain - A->>U: incident report + root cause - else AI disabled - D->>U: deterministic incident report - end -``` - ---- - -## The Diagnostic Rules - -Kerno runs 11 deterministic rules against every snapshot. Every rule is explainable, configurable, and covered by tests. - -| # | Rule | Triggers When | Severity | -|---|------|---------------|:---:| -| 1 | Disk I/O Bottleneck | fsync p99 > 50ms or write p99 > 200ms | WARN / CRIT | -| 2 | OOM Kill Occurred | Any OOM event in window | CRIT | -| 3 | TCP Retransmit Storm | Retransmit rate > 2% | CRIT | -| 4 | TCP RTT Degradation | RTT p99 > 10ms | WARN | -| 5 | Scheduler Contention | Runqueue delay p99 > 5ms | WARN / CRIT | -| 6 | FD Leak | FD growth > 10/sec sustained | WARN (with ETA) | -| 7 | Syscall Latency High | Any syscall p99 > 100ms | WARN / CRIT | -| 8 | OOM Imminent | Memory > 90% + positive growth | WARN / CRIT (with ETA) | -| 9 | Syscall Error Rate | Error rate > 1% per syscall | WARN / CRIT | -| 10 | Memory Pressure | RSS usage > 90% | WARN | -| 11 | Network Latency | Connection RTT > 100ms | WARN | - ---- - -## Usage - -### Incident diagnosis - "what broke just now?" 
- -```bash -# The golden command -kubectl -n kerno-system exec ds/kerno -- kerno doctor - -# Quick 10-second check -kubectl -n kerno-system exec ds/kerno -- kerno doctor --duration 10s - -# JSON for CI/CD, runbooks, Slack bots (non-zero exit on critical) -kubectl -n kerno-system exec ds/kerno -- kerno doctor --output json --exit-code - -# AI-powered root cause analysis -kubectl -n kerno-system exec ds/kerno -- kerno doctor --ai - -# Explain a kernel error (no root, no cluster needed) -kerno explain "BUG: kernel NULL pointer dereference" -dmesg | tail -5 | kerno explain - -# Predict failures before they page you -kubectl -n kerno-system exec ds/kerno -- kerno predict --snapshots 5 --interval 15s -``` - -### Real-time tracing - "watch it happen" - -```bash -# Every syscall event streaming -kubectl -n kerno-system exec ds/kerno -- kerno trace syscall - -# Only syscalls from a specific pod's PID -kubectl -n kerno-system exec ds/kerno -- kerno trace syscall --pid 1234 - -# Postgres disk writes over 5ms -kubectl -n kerno-system exec ds/kerno -- kerno trace disk --process postgres --op write --threshold 5ms - -# Scheduler delays over 10ms -kubectl -n kerno-system exec ds/kerno -- kerno trace sched --threshold 10ms -``` - -### Continuous monitoring - "alert me when…" - -```bash -# TCP connections with retransmits -kubectl -n kerno-system exec ds/kerno -- kerno watch tcp --retransmits - -# Any OOM kill, with pod context -kubectl -n kerno-system exec ds/kerno -- kerno watch oom --alert - -# Processes leaking FDs -kubectl -n kerno-system exec ds/kerno -- kerno watch fd --threshold 10 -``` - ---- - -## Prometheus Metrics - -The DaemonSet exposes 16 metrics at `:9090/metrics`. ServiceMonitor is included when the Prometheus Operator is installed. - -
-View all 16 metrics - -| Metric | Type | What It Measures | -|---|:---:|---| -| `kerno_syscall_duration_nanoseconds` | Summary | Syscall latency (p50, p95, p99) | -| `kerno_syscall_total` | Counter | Total syscall events | -| `kerno_tcp_rtt_nanoseconds` | Summary | TCP round-trip time | -| `kerno_tcp_retransmits_total` | Counter | TCP retransmissions | -| `kerno_tcp_connections_total` | Counter | TCP connection events | -| `kerno_oom_kills_total` | Counter | OOM kill events | -| `kerno_disk_io_duration_nanoseconds` | Summary | Disk I/O latency | -| `kerno_disk_io_bytes_total` | Counter | Disk I/O bytes | -| `kerno_sched_delay_nanoseconds` | Summary | CPU run queue delay | -| `kerno_fd_open_total` | Counter | FD open operations | -| `kerno_fd_close_total` | Counter | FD close operations | -| `kerno_collector_events_total` | Counter | Events per collector | -| `kerno_collector_errors_total` | Counter | Errors per collector | -| `kerno_bpf_programs_loaded` | Gauge | Loaded eBPF programs | -| `kerno_info` | Gauge | Build version | - -Health endpoints: `/healthz` and `/readyz` return JSON status. - -
- ---- - -## Environment & AI - -**Environment auto-detection.** Kerno picks one of three adapters and enriches every event - no configuration required: - -- **Kubernetes** (in-cluster token present) → pod, namespace, node, deployment -- **Systemd** (PID 1 is systemd) → unit, slice, scope -- **Bare metal** → hostname, cgroup path - -**AI (optional).** The AI layer runs **after** the deterministic rule engine - it correlates cross-signals and explains root causes, it never replaces rules. Three providers (**Anthropic**, **OpenAI**, **Ollama** for air-gapped), three privacy modes (`full` / `redacted` / `summary`), TTL cache + token-bucket rate limiting, graceful fallback to a deterministic template on failure. No LLM SDK dependencies - pure `net/http`. - -```bash -kubectl -n kerno-system set env ds/kerno \ - KERNO_AI_API_KEY=sk-... \ - KERNO_AI_PROVIDER=anthropic -kubectl -n kerno-system exec ds/kerno -- kerno doctor --ai -``` - ---- - -## Configuration - -Kerno works with **zero configuration**. For custom setups, mount a `config.yaml` or use `KERNO_*` env vars: - -```yaml -log_level: info - -collectors: - syscall_latency: true - tcp_monitor: true - oom_track: true - disk_io: true - sched_delay: true - fd_track: true - -doctor: - duration: 30s - thresholds: - syscall_p99_warning_ns: 100000000 # 100ms - syscall_p99_critical_ns: 500000000 # 500ms - tcp_retransmit_pct: 2.0 # 2% - oom_memory_pct: 90.0 # 90% - disk_p99_warning_ns: 50000000 # 50ms - disk_p99_critical_ns: 200000000 # 200ms - sched_delay_warning_ns: 5000000 # 5ms - sched_delay_critical_ns: 20000000 # 20ms - fd_growth_per_sec: 10.0 - -prometheus: - enabled: true - addr: ":9090" - -ai: - enabled: false - provider: anthropic - privacy_mode: summary -``` - -**Precedence:** CLI flags > environment variables (`KERNO_*`) > config file > defaults. - ---- - -## Roadmap - -See [TODO.md](TODO.md) for the full plan. 
Headlines: - -- **v0.1** - DaemonSet, 6 eBPF collectors, 11 rules, Prometheus, AI post-processor, 7 chaos scenarios, 13-phase verify pipeline - **shipped, all gates green on kernel 6.17** -- **v0.2** - CRD for cluster-wide incident policies, OpenTelemetry OTLP export, Grafana dashboards, sliding-window aggregation -- **v0.3** - historical incident replay, SLO-linked alerts, Slack / PagerDuty integrations -- **v1.0** - multi-cluster control plane, managed offering (Optiqor Cloud) - ---- - -## Building from Source - -```bash -# Requirements: Go 1.25+ -# Optional for real eBPF: clang 14+, libbpf-dev, llvm, bpftool - -make build # Build binary (uses BPF stubs - no clang needed) -make generate # Run bpf2go to produce *_bpfel.go from C sources -make bpf # Compile eBPF C programs to .o -make bpf-verify # Build the standalone kernel-verifier load harness -make test # Run unit tests -make test-race # Run with race detector -make lint # golangci-lint -make check # vet + test + lint -make verify # Comprehensive 13-phase production-readiness check -make demo # Record demo.gif via vhs (needs vhs + ttyd + ffmpeg) -make demo-cast # Record demo.cast via asciinema (alternative to vhs) -make docker # Build Docker image -``` - -**Reproducing the verifier proof end-to-end:** - -```bash -# Install eBPF toolchain -sudo apt-get install -y clang llvm libbpf-dev linux-tools-$(uname -r) jq - -# Build, generate, verify everything in one shot -make verify # exits 0 only if all 62 checks pass -``` - -**Inducing real incidents to demo or test rule firing:** - -```bash -sudo tc qdisc add dev lo root netem loss 30% # optional, for tcp-loss -kerno chaos --induce --intensity high --duration 30s - -# Available scenarios (kerno chaos --list): -# cpu scheduler_contention -# disk-sat disk_io_bottleneck -# fd-leak fd_leak -# memory oom_imminent -# tcp-churn scheduler_contention -# tcp-loss tcp_retransmit_storm -# cascade multiple -``` - -In another shell, `sudo kerno doctor` will catch the induced 
incident. - ---- - -## Contributing - -Contributions welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for: - -- Development setup and prerequisites -- Commit message conventions (Conventional Commits) -- Code review process -- DCO sign-off requirement - -For security reports, see [SECURITY.md](SECURITY.md). - ---- - -## License - -Apache License 2.0 - see [LICENSE](LICENSE). - -
- ---- - -If Kerno saved your on-call shift, consider leaving a **⭐** it helps other engineers find the project. - -
+
+ +# KERNO + +### The production incident diagnosis engine for Kubernetes + +**Your cluster broke. Your dashboards are green. Users are paging.** +**Run `kerno doctor`. 30 seconds. Root cause. Plain English.** + +Same single binary runs on bare metal, VMs, EC2, GCE - wherever Linux lives. + +[![CI](https://github.com/optiqor/kerno/actions/workflows/ci.yml/badge.svg)](https://github.com/optiqor/kerno/actions/workflows/ci.yml) +[![Go Report Card](https://goreportcard.com/badge/github.com/optiqor/kerno)](https://goreportcard.com/report/github.com/optiqor/kerno) +[![License: Apache 2.0](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](LICENSE) +[![Release](https://img.shields.io/github/v/release/optiqor/kerno?include_prereleases)](https://github.com/optiqor/kerno/releases) +[![GHCR](https://img.shields.io/badge/ghcr.io-optiqor%2Fkerno-blue?logo=docker)](https://github.com/optiqor/kerno/pkgs/container/kerno) +![Go Version](https://img.shields.io/github/go-mod/go-version/optiqor/kerno) + +[**Quick Start**](#quick-start) · [**How It Works**](#how-it-works) · [**Features**](#features) · [**Kubernetes**](#kubernetes-deployment) · [**Docs**](docs/architecture.md) + +kerno doctor demo + +
+ +--- + +## What is Kerno? + +Kerno is a **Kubernetes-native incident diagnosis engine** built on eBPF. +It runs as a DaemonSet on every node, watches the kernel - not your app - and answers a single question on demand: + +> *Why is production broken right now?* + +```bash +kubectl -n kerno-system exec ds/kerno -- kerno doctor +``` + +30 seconds later you get a ranked diagnostic report with **plain-English causes, evidence, ETAs, and copy-paste fix steps** - no dashboards to wire, no query language to learn, no agents in your app. + +The kernel knows minutes before your APM. Hours before your users. Kerno makes that visible. + +**Same binary outside Kubernetes too.** `curl | bash` it onto any bare-metal box, EC2 instance, or systemd VM and `sudo kerno doctor` works exactly the same. + +## Why Kerno? + +It's 3am. PagerDuty fires. Latency is up, error budget is burning, and every dashboard you own is **green**. + +- Prometheus says CPU and memory look fine. +- Datadog APM says your app is healthy. +- The Grafana panels your SRE spent a weekend building - all green. + +**That's because every tool you have watches your _application_. Nothing is watching the kernel.** + +```mermaid +flowchart TB + subgraph Stack["YOUR K8S STACK"] + App["Workload Pods
(Node, Python, Go, Java)"] + Runtime["Container Runtime
(containerd, CRI-O)"] + OS["Node Kernel
(Linux)"] + HW["Nodes / EC2 / GCE"] + end + + subgraph Tools["WHO WATCHES WHAT"] + APM["Datadog · New Relic
Prometheus · Grafana"] + CRun["Pixie · Tetragon
Inspektor Gadget"] + Kerno["KERNO
eBPF kernel tracing"] + Bare["(nobody)"] + end + + App -.watched by.-> APM + Runtime -.watched by.-> CRun + OS -.watched by.-> Kerno + HW -.watched by.-> Bare + + style Kerno fill:#e94560,stroke:#fff,color:#fff,stroke-width:3px + style App fill:#0f3460,stroke:#16213e,color:#fff + style Runtime fill:#16213e,stroke:#533483,color:#fff + style OS fill:#1a1a2e,stroke:#e94560,color:#fff + style HW fill:#533483,stroke:#16213e,color:#fff + style APM fill:#16213e,stroke:#0f3460,color:#ccc + style CRun fill:#16213e,stroke:#0f3460,color:#ccc + style Bare fill:#16213e,stroke:#0f3460,color:#888 +``` + +The kernel is where the pain actually lives - disk throttling, TCP retransmits, OOM kills, scheduler contention, FD leaks. The kernel knows minutes before your dashboards. Hours before your users. + +Kerno runs as a DaemonSet on every node, streams kernel signals through eBPF with microsecond overhead, and turns them into a diagnostic report that reads like a doctor's note. + +```bash +kubectl -n kerno-system exec ds/kerno -- kerno doctor +``` + +One command. 30 seconds later, you get the report shown in the [demo above](#kerno) - ranked findings, plain-English causes, evidence, and copy-paste fix steps. + +That's the entire debugging loop - from page to root cause - in a single command. 
+ +--- + +## How Kerno compares + +| | Watches | K8s-Native | Incident Report | SLO Mapping | AI Analysis | Install Time | +|---|:---:|:---:|:---:|:---:|:---:|:---:| +| Prometheus + Grafana | Application | Partial | No | No | No | Hours | +| Datadog APM | Application | Partial | No | Partial | Yes | Hours | +| Cilium Tetragon | Security | **Yes** | No | No | No | Minutes | +| Inspektor Gadget | Container | **Yes** | No | No | No | Minutes | +| Pixie | Application | **Yes** | No | No | No | Minutes | +| **Kerno** | **Kernel** | **Yes** | **Yes** | **Yes** | **Yes** | **< 1 min** | + +Kerno is the only eBPF tool in the Kubernetes ecosystem that produces a ranked, human-readable **incident report** - not a firehose of events, not another dashboard, not a query language to learn. + +--- + +## Quick Start + +> **Requires:** kernel **5.8+** with BTF (every major managed K8s qualifies: EKS, GKE, AKS, DOKS, Linode, Civo). For raw manifests/Helm you'll need cluster-admin. + +### 1 · Kubernetes (primary) + +```bash +helm repo add kerno https://optiqor.github.io/kerno-charts +helm repo update +helm install kerno kerno/kerno \ + -n kerno-system --create-namespace +``` + +Within 30 seconds Kerno is running as a DaemonSet on every node, watching the kernel via eBPF, exposing `/metrics` for Prometheus, and ready for `kerno doctor`. + +> **Tip:** If you prefer to install from a local clone: `helm install kerno ./deploy/helm/kerno -n kerno-system --create-namespace` + +```bash +# Cluster-wide incident report - 30 seconds of real kernel data +kubectl -n kerno-system exec ds/kerno -- kerno doctor + +# CI-friendly: machine-readable JSON, exits non-zero on critical findings +kubectl -n kerno-system exec ds/kerno -- kerno doctor --output json --exit-code + +# AI-enriched root cause analysis (set the API key once) +kubectl -n kerno-system set env ds/kerno KERNO_AI_API_KEY=sk-... 
+kubectl -n kerno-system exec ds/kerno -- kerno doctor --ai +``` + +ServiceMonitor for the Prometheus Operator is built-in. Raw manifests live at [`deploy/k8s/`](deploy/k8s/) if you don't use Helm. + +--- + +### 2 · Bare metal · VMs · EC2 · GCE + +The same binary, the same command. No Kubernetes required. + +```bash +curl -sfL https://raw.githubusercontent.com/optiqor/kerno/main/scripts/install.sh | sudo bash +sudo kerno doctor +``` + +Long-lived systemd service with `/metrics` for Prometheus: + +```bash +curl -sfL https://raw.githubusercontent.com/optiqor/kerno/main/scripts/install.sh | sudo bash -s -- --daemon +journalctl -u kerno -f +``` + +### 3 · Docker (ad-hoc, any host with a privileged daemon) + +```bash +docker run --rm --privileged --pid=host \ + -v /sys/kernel/debug:/sys/kernel/debug:ro \ + -v /sys/kernel/btf:/sys/kernel/btf:ro \ + -v /sys/fs/bpf:/sys/fs/bpf \ + -v /proc:/proc:ro \ + ghcr.io/optiqor/kerno:latest doctor +``` + +Multi-arch (`linux/amd64`, `linux/arm64`) images are published to GHCR on every release. Graviton, Apple Silicon, and Raspberry Pi clusters work out of the box. + +--- + +## Kubernetes Deployment + +Kerno is designed from day one to run as a Kubernetes DaemonSet. One pod per node, one eBPF agent per kernel, zero API server load. + +```mermaid +flowchart TB + subgraph Cluster["Kubernetes Cluster"] + direction TB + subgraph Node1["Worker Node 1"] + K1["Kerno Pod
DaemonSet"] + W1["Workload Pods"] + end + subgraph Node2["Worker Node 2"] + K2["Kerno Pod
DaemonSet"] + W2["Workload Pods"] + end + subgraph Node3["Worker Node N"] + K3["Kerno Pod
DaemonSet"] + W3["Workload Pods"] + end + end + + K1 -->|:9090/metrics| Prom["Prometheus"] + K2 -->|:9090/metrics| Prom + K3 -->|:9090/metrics| Prom + Prom --> GF["Grafana"] + + K1 -.enriches.-> W1 + K2 -.enriches.-> W2 + K3 -.enriches.-> W3 + + style K1 fill:#e94560,stroke:#fff,color:#fff + style K2 fill:#e94560,stroke:#fff,color:#fff + style K3 fill:#e94560,stroke:#fff,color:#fff + style Prom fill:#0f3460,stroke:#fff,color:#fff + style GF fill:#16213e,stroke:#fff,color:#fff + style W1 fill:#533483,stroke:#fff,color:#fff + style W2 fill:#533483,stroke:#fff,color:#fff + style W3 fill:#533483,stroke:#fff,color:#fff +``` + +### Pod enrichment - no API server load + +Kerno tags every finding with pod, namespace, node, and workload labels. No `client-go` informers, no watch connections - Kerno reads `/var/lib/kubelet/pods` directly, so even a failing API server doesn't blind the agent. Exactly when you need it most. + +### Host mounts - the minimum necessary + +| Mount | Why | +|---|---| +| `/sys/kernel/debug` | tracepoints, kprobes | +| `/sys/kernel/btf` | CO-RE type resolution | +| `/sys/fs/bpf` | BPF map pinning | +| `/proc` | PID → cgroup → pod resolution | +| `/sys/fs/cgroup` | container resource accounting | +| `/sys/class/net` | per-interface TCP counters | +| `/sys/block` | per-device disk stats | + +### Security posture + +- Runs with the **minimum capabilities needed** - `CAP_BPF`, `CAP_PERFMON`, `CAP_SYS_PTRACE`, `CAP_NET_ADMIN`, `CAP_DAC_READ_SEARCH` (not `CAP_SYS_ADMIN` for the hot path). +- Read-only root filesystem, `ProtectSystem=strict` via systemd on bare metal. +- No outbound network calls. AI integration is opt-in and goes through your configured provider only. 
+ +### Helm values + +```yaml +image: + repository: ghcr.io/optiqor/kerno + tag: v0.1.0 + +resources: + requests: { cpu: 100m, memory: 128Mi } + limits: { cpu: "1", memory: 512Mi } + +prometheus: + enabled: true + port: 9090 + +serviceMonitor: # Prometheus Operator + enabled: true + interval: 15s + +nodeSelector: + monitoring: "true" +``` + +### Verify + +```bash +kubectl -n kerno-system get ds kerno +kubectl -n kerno-system logs -l app.kubernetes.io/name=kerno +kubectl -n kerno-system exec ds/kerno -- kerno doctor +``` + +--- + +## Features + + + + + + +
+ +### Incident Diagnosis + +- **`kerno doctor`** - 30-second cluster-wide diagnostic, ranked findings, fix suggestions +- **`kerno explain`** - AI-powered kernel error explanation (no root needed) +- **`kerno predict`** - surface failures before they page you + +### Real-Time Tracing + +- **`kerno trace syscall`** - per-pod syscall latency streaming +- **`kerno trace disk`** - block I/O latency by device, op, process +- **`kerno trace sched`** - CPU scheduler run queue delays + + + +### Continuous Monitoring + +- **`kerno watch tcp`** - TCP connections, RTT, retransmits +- **`kerno watch oom`** - OOM kill alerts with pod context +- **`kerno watch fd`** - FD leak detection via growth rate +- **`kerno start`** - daemon mode with Prometheus metrics + +### Integrations + +- **Prometheus** - 16 metrics at `/metrics`, ServiceMonitor support +- **Kubernetes** - Helm chart + pod enrichment (no API server load) +- **AI Providers** - Anthropic, OpenAI, Ollama (optional, opt-in) +- **Systemd** - unit/slice enrichment on bare metal + +
+ +--- + +## How It Works + +Kerno runs as a lightweight Go agent with six tiny eBPF programs attached to stable tracepoints. When `kerno doctor` runs, it collects 30 seconds of real kernel data, evaluates 11 diagnostic rules deterministically, and emits a ranked incident report. No sampling. No guesswork. No query language. + +### Architecture + +```mermaid +flowchart TB + subgraph Kernel["KERNEL SPACE · eBPF Programs"] + direction LR + P1["syscall
latency"] + P2["tcp
monitor"] + P3["oom
track"] + P4["disk
io"] + P5["sched
delay"] + P6["fd
track"] + end + + RB[("Ring Buffers
256KB per program
zero-copy mmap")] + + subgraph UserSpace["USER SPACE · Go"] + direction TB + Loader["BPF Loaders
cilium/ebpf"] + Collector["Collectors
percentile aggregation"] + Signals[("Signals Snapshot
single source of truth")] + Adapter["Environment Adapter
k8s · systemd · bare metal"] + end + + subgraph Outputs["OUTPUTS"] + direction TB + Doctor["Doctor Engine
11 diagnostic rules"] + AI["AI Layer (optional)
root cause analysis"] + Prom["Prometheus
/metrics :9090"] + CLI["Terminal
pretty · JSON"] + end + + P1 & P2 & P3 & P4 & P5 & P6 --> RB + RB --> Loader + Loader --> Collector + Collector --> Signals + Adapter -.enriches.-> Signals + Signals --> Doctor + Signals --> Prom + Doctor --> AI + AI --> CLI + Doctor --> CLI + + classDef kernel fill:#1a1a2e,stroke:#e94560,color:#fff,stroke-width:2px + classDef user fill:#0f3460,stroke:#16213e,color:#fff,stroke-width:2px + classDef output fill:#16213e,stroke:#533483,color:#fff,stroke-width:2px + classDef buffer fill:#533483,stroke:#e94560,color:#fff,stroke-width:3px + classDef ai fill:#e94560,stroke:#fff,color:#fff,stroke-width:2px + + class P1,P2,P3,P4,P5,P6 kernel + class Loader,Collector,Signals,Adapter user + class Doctor,Prom,CLI output + class RB buffer + class AI ai +``` + +### Core principles + +1. **Deterministic first.** The rule engine is pure Go, testable, and runs whether AI is on or off. Every finding has a clear cause, threshold, and fix. +2. **Zero-copy hot path.** Kernel events land in eBPF ring buffers and are drained via `mmap` - microsecond overhead, no serialization cost. +3. **No API server load.** Pod enrichment reads the kubelet's local pod manifests. The agent survives API server outages - the moment you need it most. +4. **AI is a post-processor.** Optional. Opt-in. Never touches the hot path. The deterministic engine always runs; AI enriches, it never replaces. +5. **Graceful degradation.** If an eBPF program fails to load on a weird kernel, that collector is skipped with a clear warning. The rest keep working. + +### Data flow + +```mermaid +sequenceDiagram + participant K as Kernel
(eBPF) + participant R as Ring Buffer + participant C as Collectors + participant D as Doctor Engine + participant A as AI Layer + participant U as On-call Engineer + + K->>R: syscall/tcp/oom/io events + Note over K,R: Zero-copy, microsecond overhead + R->>C: drain events + C->>C: aggregate into p50/p95/p99 + C->>D: Signals snapshot + D->>D: evaluate 11 rules + alt AI enabled + D->>A: findings + signals + A->>A: correlate + explain + A->>U: incident report + root cause + else AI disabled + D->>U: deterministic incident report + end +``` + +--- + +## The Diagnostic Rules + +Kerno runs 11 deterministic rules against every snapshot. Every rule is explainable, configurable, and covered by tests. + +| # | Rule | Triggers When | Severity | +|---|------|---------------|:---:| +| 1 | Disk I/O Bottleneck | fsync p99 > 50ms or write p99 > 200ms | WARN / CRIT | +| 2 | OOM Kill Occurred | Any OOM event in window | CRIT | +| 3 | TCP Retransmit Storm | Retransmit rate > 2% | CRIT | +| 4 | TCP RTT Degradation | RTT p99 > 10ms | WARN | +| 5 | Scheduler Contention | Runqueue delay p99 > 5ms | WARN / CRIT | +| 6 | FD Leak | FD growth > 10/sec sustained | WARN (with ETA) | +| 7 | Syscall Latency High | Any syscall p99 > 100ms | WARN / CRIT | +| 8 | OOM Imminent | Memory > 90% + positive growth | WARN / CRIT (with ETA) | +| 9 | Syscall Error Rate | Error rate > 1% per syscall | WARN / CRIT | +| 10 | Memory Pressure | RSS usage > 90% | WARN | +| 11 | Network Latency | Connection RTT > 100ms | WARN | + +--- + +## Usage + +### Incident diagnosis - "what broke just now?" 
+ +```bash +# The golden command +kubectl -n kerno-system exec ds/kerno -- kerno doctor + +# Quick 10-second check +kubectl -n kerno-system exec ds/kerno -- kerno doctor --duration 10s + +# JSON for CI/CD, runbooks, Slack bots (non-zero exit on critical) +kubectl -n kerno-system exec ds/kerno -- kerno doctor --output json --exit-code + +# AI-powered root cause analysis +kubectl -n kerno-system exec ds/kerno -- kerno doctor --ai + +# Explain a kernel error (no root, no cluster needed) +kerno explain "BUG: kernel NULL pointer dereference" +dmesg | tail -5 | kerno explain + +# Predict failures before they page you +kubectl -n kerno-system exec ds/kerno -- kerno predict --snapshots 5 --interval 15s +``` + +### Real-time tracing - "watch it happen" + +```bash +# Every syscall event streaming +kubectl -n kerno-system exec ds/kerno -- kerno trace syscall + +# Only syscalls from a specific pod's PID +kubectl -n kerno-system exec ds/kerno -- kerno trace syscall --pid 1234 + +# Postgres disk writes over 5ms +kubectl -n kerno-system exec ds/kerno -- kerno trace disk --process postgres --op write --threshold 5ms + +# Scheduler delays over 10ms +kubectl -n kerno-system exec ds/kerno -- kerno trace sched --threshold 10ms +``` + +### Continuous monitoring - "alert me when…" + +```bash +# TCP connections with retransmits +kubectl -n kerno-system exec ds/kerno -- kerno watch tcp --retransmits + +# Any OOM kill, with pod context +kubectl -n kerno-system exec ds/kerno -- kerno watch oom --alert + +# Processes leaking FDs +kubectl -n kerno-system exec ds/kerno -- kerno watch fd --threshold 10 +``` + +--- + +## Prometheus Metrics + +The DaemonSet exposes 16 metrics at `:9090/metrics`. ServiceMonitor is included when the Prometheus Operator is installed. + +
+View all 16 metrics + +| Metric | Type | What It Measures | +|---|:---:|---| +| `kerno_syscall_duration_nanoseconds` | Summary | Syscall latency (p50, p95, p99) | +| `kerno_syscall_total` | Counter | Total syscall events | +| `kerno_tcp_rtt_nanoseconds` | Summary | TCP round-trip time | +| `kerno_tcp_retransmits_total` | Counter | TCP retransmissions | +| `kerno_tcp_connections_total` | Counter | TCP connection events | +| `kerno_oom_kills_total` | Counter | OOM kill events | +| `kerno_disk_io_duration_nanoseconds` | Summary | Disk I/O latency | +| `kerno_disk_io_bytes_total` | Counter | Disk I/O bytes | +| `kerno_sched_delay_nanoseconds` | Summary | CPU run queue delay | +| `kerno_fd_open_total` | Counter | FD open operations | +| `kerno_fd_close_total` | Counter | FD close operations | +| `kerno_collector_events_total` | Counter | Events per collector | +| `kerno_collector_errors_total` | Counter | Errors per collector | +| `kerno_bpf_programs_loaded` | Gauge | Loaded eBPF programs | +| `kerno_info` | Gauge | Build version | + +Health endpoints: `/healthz` and `/readyz` return JSON status. + +
+ +--- + +## Environment & AI + +**Environment auto-detection.** Kerno picks one of three adapters and enriches every event - no configuration required: + +- **Kubernetes** (in-cluster token present) → pod, namespace, node, deployment +- **Systemd** (PID 1 is systemd) → unit, slice, scope +- **Bare metal** → hostname, cgroup path + +**AI (optional).** The AI layer runs **after** the deterministic rule engine - it correlates cross-signals and explains root causes; it never replaces rules. Three providers (**Anthropic**, **OpenAI**, **Ollama** for air-gapped), three privacy modes (`full` / `redacted` / `summary`), TTL cache + token-bucket rate limiting, graceful fallback to a deterministic template on failure. No LLM SDK dependencies - pure `net/http`. + +```bash +kubectl -n kerno-system set env ds/kerno \ + KERNO_AI_API_KEY=sk-... \ + KERNO_AI_PROVIDER=anthropic +kubectl -n kerno-system exec ds/kerno -- kerno doctor --ai +``` + +--- + +## Configuration + +Kerno works with **zero configuration**. For custom setups, mount a `config.yaml` or use `KERNO_*` env vars: + +```yaml +log_level: info + +collectors: + syscall_latency: true + tcp_monitor: true + oom_track: true + disk_io: true + sched_delay: true + fd_track: true + +doctor: + duration: 30s + thresholds: + syscall_p99_warning_ns: 100000000 # 100ms + syscall_p99_critical_ns: 500000000 # 500ms + tcp_retransmit_pct: 2.0 # 2% + oom_memory_pct: 90.0 # 90% + disk_p99_warning_ns: 50000000 # 50ms + disk_p99_critical_ns: 200000000 # 200ms + sched_delay_warning_ns: 5000000 # 5ms + sched_delay_critical_ns: 20000000 # 20ms + fd_growth_per_sec: 10.0 + +prometheus: + enabled: true + addr: ":9090" + +ai: + enabled: false + provider: anthropic + privacy_mode: summary +``` + +**Precedence:** CLI flags > environment variables (`KERNO_*`) > config file > defaults. + +--- + +## Roadmap + +See [TODO.md](TODO.md) for the full plan.
Headlines: + +- **v0.1** - DaemonSet, 6 eBPF collectors, 11 rules, Prometheus, AI post-processor, 7 chaos scenarios, 13-phase verify pipeline - **shipped, all gates green on kernel 6.17** +- **v0.2** - CRD for cluster-wide incident policies, OpenTelemetry OTLP export, Grafana dashboards, sliding-window aggregation +- **v0.3** - historical incident replay, SLO-linked alerts, Slack / PagerDuty integrations +- **v1.0** - multi-cluster control plane, managed offering (Optiqor Cloud) + +--- + +## Building from Source + +```bash +# Requirements: Go 1.25+ +# Optional for real eBPF: clang 14+, libbpf-dev, llvm, bpftool + +make build # Build binary (uses BPF stubs - no clang needed) +make generate # Run bpf2go to produce *_bpfel.go from C sources +make bpf # Compile eBPF C programs to .o +make bpf-verify # Build the standalone kernel-verifier load harness +make test # Run unit tests +make test-race # Run with race detector +make lint # golangci-lint +make check # vet + test + lint +make verify # Comprehensive 13-phase production-readiness check +make demo # Record demo.gif via vhs (needs vhs + ttyd + ffmpeg) +make demo-cast # Record demo.cast via asciinema (alternative to vhs) +make docker # Build Docker image +``` + +**Reproducing the verifier proof end-to-end:** + +```bash +# Install eBPF toolchain +sudo apt-get install -y clang llvm libbpf-dev linux-tools-$(uname -r) jq + +# Build, generate, verify everything in one shot +make verify # exits 0 only if all 62 checks pass +``` + +**Inducing real incidents to demo or test rule firing:** + +```bash +sudo tc qdisc add dev lo root netem loss 30% # optional, for tcp-loss +kerno chaos --induce --intensity high --duration 30s + +# Available scenarios (kerno chaos --list): +# cpu scheduler_contention +# disk-sat disk_io_bottleneck +# fd-leak fd_leak +# memory oom_imminent +# tcp-churn scheduler_contention +# tcp-loss tcp_retransmit_storm +# cascade multiple +``` + +In another shell, `sudo kerno doctor` will catch the induced 
incident. + +--- + +## Contributing + +Contributions welcome. See [CONTRIBUTING.md](CONTRIBUTING.md) for: + +- Development setup and prerequisites +- Commit message conventions (Conventional Commits) +- Code review process +- DCO sign-off requirement + +For security reports, see [SECURITY.md](SECURITY.md). + +--- + +## License + +Apache License 2.0 - see [LICENSE](LICENSE). + +
+ +--- + +If Kerno saved your on-call shift, consider leaving a **⭐** - it helps other engineers find the project. + +
diff --git a/artifacthub-repo.yml b/artifacthub-repo.yml index 54bccf2..30639f6 100644 --- a/artifacthub-repo.yml +++ b/artifacthub-repo.yml @@ -1,6 +1,4 @@ # Artifact Hub repository metadata # https://artifacthub.io/docs/topics/repositories/helm-charts/ -repositoryID: 00000000-0000-0000-0000-000000000000 # Placeholder: update with real ID after manual submission -owners: - - name: Shivam Kumar - email: btwshivam@example.com # Replace with real maintainer email if known +# repositoryID is generated by Artifact Hub after the repo is submitted via the UI. +# Add it here once the submission is complete. diff --git a/deploy/helm/kerno/Chart.yaml b/deploy/helm/kerno/Chart.yaml index 90ec66e..cdc5d70 100644 --- a/deploy/helm/kerno/Chart.yaml +++ b/deploy/helm/kerno/Chart.yaml @@ -1,27 +1,23 @@ -apiVersion: v2 -name: kerno -description: eBPF-based kernel observability engine for Kubernetes -type: application -version: 0.1.0 -appVersion: "0.1.0" -home: https://github.com/optiqor/kerno -sources: - - https://github.com/optiqor/kerno -maintainers: - - name: Shivam Kumar - url: https://github.com/btwshivam -keywords: - - ebpf - - observability - - kernel - - prometheus - - monitoring -annotations: - artifacthub.io/license: Apache-2.0 - artifacthub.io/signKey: "" - artifacthub.io/containsSecurityUpdates: "false" - artifacthub.io/prerelease: "false" - artifacthub.io/maintainers: | - - name: Shivam Kumar - github: btwshivam -icon: https://raw.githubusercontent.com/optiqor/kerno/main/demo.gif # Using demo.gif as a placeholder until a logo is available +apiVersion: v2 +name: kerno +description: eBPF-based kernel observability engine for Kubernetes +type: application +version: 0.1.0 +appVersion: "0.1.0" +home: https://github.com/optiqor/kerno +sources: + - https://github.com/optiqor/kerno +maintainers: + - name: Shivam Kumar + url: https://github.com/btwshivam +keywords: + - ebpf + - observability + - kernel + - prometheus + - monitoring +annotations: + artifacthub.io/license: 
Apache-2.0 + artifacthub.io/signKey: "" + artifacthub.io/containsSecurityUpdates: "false" + artifacthub.io/prerelease: "false" From 88b00cd1a99e5dda97167a83a9dc494700eef49f Mon Sep 17 00:00:00 2001 From: Kshitij Kumar Singh Date: Sat, 23 May 2026 05:59:31 +0530 Subject: [PATCH 07/14] feat(cli): add shell completion for bash, zsh, fish, powershell (#55) * feat(cli): add shell completion for bash, zsh, fish, powershell - Add kerno completion command - Auto-detect shell in install.sh and install completion - Register --output flag values (pretty/json) for tab completion - Update README with shell-specific setup instructions - Mark completion command as hidden (plumbing, not feature) Signed-off-by: Kshitij-K-Singh * feat(cli): add shell completion for bash, zsh, fish, powershell - Add kerno completion command - Auto-detect shell in install.sh and install completion - Register --output flag values (pretty/json) for tab completion - Update README with shell-specific setup instructions - Use system-wide zsh path (/usr/local/share/zsh/site-functions) - Show completion in --help (not hidden) Signed-off-by: Kshitij-K-Singh * fix(install): use system wide fish completion path Signed-off-by: Kshitij-K-Singh * fix(cli,install): use cmd.OutOrStdout() and explicit kerno path Signed-off-by: Kshitij-K-Singh --------- Signed-off-by: Kshitij-K-Singh --- README.md | 46 +++++++++++++++++++ internal/cli/completion.go | 78 +++++++++++++++++++++++++++++++++ internal/cli/completion_test.go | 26 +++++++++++ internal/cli/doctor.go | 5 +++ internal/cli/root.go | 4 +- scripts/install.sh | 63 +++++++++++++++++++++++++- 6 files changed, 220 insertions(+), 2 deletions(-) create mode 100644 internal/cli/completion.go create mode 100644 internal/cli/completion_test.go diff --git a/README.md b/README.md index bc1ece7..4a90a42 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,52 @@ docker run --rm --privileged --pid=host \ Multi-arch (`linux/amd64`, `linux/arm64`) images published to GHCR on 
every release. +### Shell Completion + +Enable tab completion for your shell: + +**Bash:** + +```bash +# Load completions for current session +source <(kerno completion bash) + +# Persist across sessions +echo 'source <(kerno completion bash)' >> ~/.bashrc +``` + +**Zsh:** + +```bash +# Enable completions (add to ~/.zshrc if not already present) +echo 'autoload -U compinit; compinit' >> ~/.zshrc + +# Load completions for current session +autoload -U compinit && compinit +kerno completion zsh > "${fpath[1]}/_kerno" + +# Persist across sessions - run once, then start new shell +kerno completion zsh > "${fpath[1]}/_kerno" +``` + +**Fish:** + +```bash +# Load completions for current session +kerno completion fish | source + +# Persist across sessions +kerno completion fish > ~/.config/fish/completions/kerno.fish +``` + +**PowerShell:** + +```powershell +# Add to your PowerShell profile +kerno completion powershell > kerno.ps1 +. ./kerno.ps1 +``` + --- ## Kubernetes Deployment diff --git a/internal/cli/completion.go b/internal/cli/completion.go new file mode 100644 index 0000000..7a2067e --- /dev/null +++ b/internal/cli/completion.go @@ -0,0 +1,78 @@ +// Copyright 2026 Optiqor contributors +// SPDX-License-Identifier: Apache-2.0 + +package cli + +import ( + "github.com/spf13/cobra" +) + +func newCompletionCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "completion [bash|zsh|fish|powershell]", + Short: "Generate shell completion scripts", + Long: `Generate shell completion scripts for kerno. + +Kerno uses spf13/cobra's built-in completion generation which supports +bash, zsh, fish, and powershell. + +To load completions: + +Bash: + + $ source <(kerno completion bash) + + # To load completions for each session, execute once: + $ echo 'source <(kerno completion bash)' >> ~/.bashrc + +Zsh: + + # If shell completion is not already enabled in your zsh environment, + # you need to enable it. 
You can execute the following once: + + $ echo 'autoload -U compinit; compinit' >> ~/.zshrc + + # To load completions for each session, execute once: + $ kerno completion zsh > "${fpath[1]}/_kerno" + + # You will need to start a new shell for this setup to take effect. + +Fish: + + $ kerno completion fish | source + + # To load completions for each session, execute once: + $ kerno completion fish > ~/.config/fish/completions/kerno.fish + +PowerShell: + + PS> kerno completion powershell > kerno.ps1 + # and source this file from your PowerShell profile. + +Alternatively, specify the shell with the first argument: + + $ kerno completion bash + $ kerno completion zsh + $ kerno completion fish + $ kerno completion powershell +`, + DisableFlagsInUseLine: true, + ValidArgs: []string{"bash", "zsh", "fish", "powershell"}, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + switch args[0] { + case "bash": + return cmd.Root().GenBashCompletion(cmd.OutOrStdout()) + case "zsh": + return cmd.Root().GenZshCompletion(cmd.OutOrStdout()) + case "fish": + return cmd.Root().GenFishCompletion(cmd.OutOrStdout(), true) + case "powershell": + return cmd.Root().GenPowerShellCompletionWithDesc(cmd.OutOrStdout()) + } + return nil + }, + } + + return cmd +} diff --git a/internal/cli/completion_test.go b/internal/cli/completion_test.go new file mode 100644 index 0000000..a4d49a8 --- /dev/null +++ b/internal/cli/completion_test.go @@ -0,0 +1,26 @@ +// Copyright 2026 Optiqor contributors +// SPDX-License-Identifier: Apache-2.0 + +package cli + +import ( + "bytes" + "testing" +) + +func TestCompletionCmd(t *testing.T) { + for _, shell := range []string{"bash", "zsh", "fish", "powershell"} { + t.Run(shell, func(t *testing.T) { + cmd := New() + var buf bytes.Buffer + cmd.SetOut(&buf) + cmd.SetArgs([]string{"completion", shell}) + if err := cmd.Execute(); err != nil { + t.Fatalf("%s: %v", shell, err) + } + if buf.Len() == 0 { + t.Errorf("%s: empty completion 
output", shell) + } + }) + } +} diff --git a/internal/cli/doctor.go b/internal/cli/doctor.go index 8a0e3c6..3f794a6 100644 --- a/internal/cli/doctor.go +++ b/internal/cli/doctor.go @@ -97,6 +97,11 @@ Add --ai to enrich findings with AI-powered analysis (requires API key).`, flags.BoolVarP(&quiet, "quiet", "q", false, "only emit critical/warning findings (CI-friendly)") flags.BoolVar(&noBanner, "no-banner", false, "suppress the ASCII banner block") + //nolint:errcheck // RegisterFlagCompletionFunc only returns error on invalid flag name, which is static. + _ = cmd.RegisterFlagCompletionFunc("output", func(_ *cobra.Command, _ []string, _ string) ([]string, cobra.ShellCompDirective) { + return []string{"pretty", "json"}, cobra.ShellCompDirectiveNoFileComp + }) + return cmd } diff --git a/internal/cli/root.go b/internal/cli/root.go index 6db7b23..0a3f04e 100644 --- a/internal/cli/root.go +++ b/internal/cli/root.go @@ -83,6 +83,7 @@ and copy-paste fix steps.`, auditCmd := newAuditCmd() chaosCmd := newChaosCmd() versionCmd := newVersionCmd() + completionCmd := newCompletionCmd() root.AddGroup( &cobra.Group{ID: "diagnose", Title: "Incident diagnosis:"}, @@ -98,8 +99,9 @@ and copy-paste fix steps.`, startCmd.GroupID = "ops" chaosCmd.GroupID = "ops" versionCmd.GroupID = "ops" + completionCmd.GroupID = "ops" - root.AddCommand(doctorCmd, explainCmd, predictCmd, traceCmd, watchCmd, auditCmd, startCmd, chaosCmd, versionCmd) + root.AddCommand(doctorCmd, explainCmd, predictCmd, traceCmd, watchCmd, auditCmd, startCmd, chaosCmd, versionCmd, completionCmd) return root } diff --git a/scripts/install.sh b/scripts/install.sh index a417eb9..1c74863 100755 --- a/scripts/install.sh +++ b/scripts/install.sh @@ -30,17 +30,20 @@ CONFIG_DIR="/etc/kerno" # ── Parse arguments ────────────────────────────────────────────────── VERSION="" INSTALL_DAEMON=false +INSTALL_COMPLETION=true while [[ $# -gt 0 ]]; do case "$1" in --version) VERSION="$2"; shift 2 ;; --daemon) INSTALL_DAEMON=true; shift ;; 
+ --no-completion) INSTALL_COMPLETION=false; shift ;; --help|-h) echo "Usage: curl -sfL https://raw.githubusercontent.com/optiqor/kerno/main/scripts/install.sh | bash -s -- [OPTIONS]" echo "" echo "Options:" echo " --version VERSION Install a specific version (default: latest)" echo " --daemon Also install systemd service for daemon mode" + echo " --no-completion Skip shell completion installation" echo " --help Show this help" exit 0 ;; @@ -88,6 +91,59 @@ check_root() { fi } +# ── Shell completion installation ────────────────────────────────── +detect_shell() { + local shell + shell="${SHELL##*/}" + case "$shell" in + bash) echo "bash" ;; + zsh) echo "zsh" ;; + fish) echo "fish" ;; + *) echo "" ;; + esac +} + +install_completion() { + local shell + shell=$(detect_shell) + + if [ -z "$shell" ]; then + echo "==> Shell completion: could not detect shell (SHELL=$SHELL)" + echo " Manually enable completion: https://github.com/optiqor/kerno#shell-completion" + return + fi + + echo "" + echo "==> Installing shell completion for $shell..." 
+ + case "$shell" in + bash) + local bash_dir="/etc/bash_completion.d" + mkdir -p "$bash_dir" + "${INSTALL_DIR}/kerno" completion bash > "${bash_dir}/kerno" + chmod 644 "${bash_dir}/kerno" + echo " Installed to ${bash_dir}/kerno" + echo " Restart shell or run: source ${bash_dir}/kerno" + ;; + zsh) + local zsh_dir="/usr/local/share/zsh/site-functions" + mkdir -p "$zsh_dir" + "${INSTALL_DIR}/kerno" completion zsh > "${zsh_dir}/_kerno" + chmod 644 "${zsh_dir}/_kerno" + echo " Installed to ${zsh_dir}/_kerno" + echo " Restart shell or run: autoload -U compinit && compinit" + ;; + fish) + local fish_dir="/usr/share/fish/vendor_completions.d" + mkdir -p "$fish_dir" + "${INSTALL_DIR}/kerno" completion fish > "${fish_dir}/kerno.fish" + chmod 644 "${fish_dir}/kerno.fish" + echo " Installed to ${fish_dir}/kerno.fish" + echo " Restart fish to load the new completion" + ;; + esac +} + # ── Download ───────────────────────────────────────────────────────── get_latest_version() { curl -fsSL "https://api.github.com/repos/${REPO}/releases/latest" \ @@ -142,7 +198,12 @@ main() { echo "==> Installed: $(kerno version 2>/dev/null || echo "${INSTALL_DIR}/kerno")" - # ── Optional: systemd daemon ───────────────────────────────────── + # ── Optional: shell completion ──────────────────────────────────── + if [ "$INSTALL_COMPLETION" = true ]; then + install_completion + fi + + # ── Optional: systemd daemon ─────────────────────────────────────── if [ "$INSTALL_DAEMON" = true ]; then echo "" echo "==> Installing systemd service..." 
From c66b7b367a4b8958f29c9d7af140ef9a4ca83f51 Mon Sep 17 00:00:00 2001 From: Vidheendu Chaturvedi Date: Sat, 23 May 2026 06:00:35 +0530 Subject: [PATCH 08/14] feat(cli): add no-color support for CLI output (#57) * feat(cli): add no-color support for CLI output * fix: address PR review comments * refactor(cli): remove redundant NO_COLOR handling Signed-off-by: Vidheendu Chaturvedi * chore: rerun ci * fix(cli): format doctor renderer Signed-off-by: Vidheendu Chaturvedi --------- Signed-off-by: Vidheendu Chaturvedi --- internal/cli/doctor.go | 3 ++- internal/cli/root.go | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/internal/cli/doctor.go b/internal/cli/doctor.go index 3f794a6..aeecc8f 100644 --- a/internal/cli/doctor.go +++ b/internal/cli/doctor.go @@ -13,6 +13,7 @@ import ( "time" "github.com/spf13/cobra" + "github.com/spf13/viper" "github.com/optiqor/kerno/internal/adapter" "github.com/optiqor/kerno/internal/ai" @@ -155,7 +156,7 @@ func runDoctor(ctx context.Context, opts doctorOpts) error { renderer = &doctor.JSONRenderer{Pretty: true} default: renderer = &doctor.PrettyRenderer{ - NoColor: os.Getenv("NO_COLOR") != "" || !isTerminal(), + NoColor: viper.GetBool("no_color") || os.Getenv("NO_COLOR") != "" || !isTerminal(), NoBanner: opts.noBanner, } } diff --git a/internal/cli/root.go b/internal/cli/root.go index 0a3f04e..ae81213 100644 --- a/internal/cli/root.go +++ b/internal/cli/root.go @@ -135,10 +135,15 @@ func initConfig(cmd *cobra.Command) error { if err := v.BindPFlag("log_level", cmd.Root().PersistentFlags().Lookup("log-level")); err != nil { return fmt.Errorf("binding log-level flag: %w", err) } + if err := v.BindPFlag("log_format", cmd.Root().PersistentFlags().Lookup("log-format")); err != nil { return fmt.Errorf("binding log-format flag: %w", err) } + if err := v.BindPFlag("no_color", cmd.Root().PersistentFlags().Lookup("no-color")); err != nil { + return fmt.Errorf("binding no-color flag: %w", err) + } + // Read config file (not an 
error if it doesn't exist). if err := v.ReadInConfig(); err != nil { var notFound viper.ConfigFileNotFoundError From bffb27ad58e3d1cfa74e7658fee91734f900331e Mon Sep 17 00:00:00 2001 From: Kshitij Kumar Singh Date: Sat, 23 May 2026 06:04:50 +0530 Subject: [PATCH 09/14] feat(cli): add man page generation via make manpage (#73) * feat(cli): add man page generation via make manpage Signed-off-by: Kshitij-K-Singh * fix(manpages): removed dead code Signed-off-by: Kshitij-K-Singh --------- Signed-off-by: Kshitij-K-Singh --- .gitignore | 3 +++ Makefile | 9 ++++++++- README.md | 1 + cmd/kerno-mangen/main.go | 34 ++++++++++++++++++++++++++++++++++ go.mod | 2 ++ go.sum | 2 ++ 6 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 cmd/kerno-mangen/main.go diff --git a/.gitignore b/.gitignore index 6e3fdb9..853f035 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,9 @@ Thumbs.db # Debug __debug_bin* +# Man pages (generatable via make manpage) +docs/man/*.1 + # Environment .env .env.local diff --git a/Makefile b/Makefile index 3e64e1c..b284b89 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,7 @@ UI_DIST_DIR := internal/dashboard/dist/assets .PHONY: all build build-ebpf build-debug test test-cover test-race lint vet check \ fmt clean bpf generate docker help \ ui-fetch ui-dev install-tools setup precommit \ - verify demo demo-cast bpf-verify + verify demo demo-cast bpf-verify manpage .DEFAULT_GOAL := help @@ -167,6 +167,13 @@ docker: --build-arg DATE=$(DATE) \ . 
+# ─── Man Pages ──────────────────────────────────────────────────────────────── + +## manpage: Generate man pages for all CLI commands +manpage: + @mkdir -p docs/man + go run ./cmd/kerno-mangen/ + # ─── Utilities ─────────────────────────────────────────────────────────────── ## install-tools: Install Go-based development tools (golangci-lint, bpf2go) diff --git a/README.md b/README.md index 4a90a42..f71c00b 100644 --- a/README.md +++ b/README.md @@ -644,6 +644,7 @@ make test-race # Run with race detector make lint # golangci-lint make check # vet + test + lint make verify # Comprehensive 13-phase production-readiness check +make manpage # Generate man pages for all CLI commands make demo # Record demo.gif via vhs (needs vhs + ttyd + ffmpeg) make demo-cast # Record demo.cast via asciinema (alternative to vhs) make docker # Build Docker image diff --git a/cmd/kerno-mangen/main.go b/cmd/kerno-mangen/main.go new file mode 100644 index 0000000..9c91e58 --- /dev/null +++ b/cmd/kerno-mangen/main.go @@ -0,0 +1,34 @@ +// Copyright 2026 Optiqor contributors +// SPDX-License-Identifier: Apache-2.0 + +// Package main generates man pages for kerno CLI commands. 
+package main + +import ( + "log" + "os" + + "github.com/spf13/cobra/doc" + + "github.com/optiqor/kerno/internal/cli" +) + +func main() { + root := cli.New() + manDir := "docs/man" + + if err := os.MkdirAll(manDir, 0o750); err != nil { + log.Fatalf("creating man dir: %v", err) + } + + header := &doc.GenManHeader{ + Title: "KERNO", + Section: "1", + } + + if err := doc.GenManTree(root, header, manDir); err != nil { + log.Fatalf("generating man pages: %v", err) + } + + log.Printf("Generated man pages in %s", manDir) +} diff --git a/go.mod b/go.mod index 1820dc0..956a0dd 100644 --- a/go.mod +++ b/go.mod @@ -13,6 +13,7 @@ require ( require ( github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/go-viper/mapstructure/v2 v2.4.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect @@ -22,6 +23,7 @@ require ( github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.16.1 // indirect + github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/sagikazarmark/locafero v0.11.0 // indirect github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 // indirect github.com/spf13/afero v1.15.0 // indirect diff --git a/go.sum b/go.sum index dcecec5..1992f4a 100644 --- a/go.sum +++ b/go.sum @@ -4,6 +4,7 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cilium/ebpf v0.21.0 h1:4dpx1J/B/1apeTmWBH5BkVLayHTkFrMovVPnHEk+l3k= github.com/cilium/ebpf v0.21.0/go.mod h1:1kHKv6Kvh5a6TePP5vvvoMa1bclRyzUXELSs272fmIQ= +github.com/cpuguy83/go-md2man/v2 v2.0.6 h1:XJtiaUW6dEEqVuZiMTn1ldk455QWwEIsMIJlo5vtkx0= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= 
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -51,6 +52,7 @@ github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzM github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sagikazarmark/locafero v0.11.0 h1:1iurJgmM9G3PA/I+wWYIOw/5SyBtxapeHDcg+AAIFXc= github.com/sagikazarmark/locafero v0.11.0/go.mod h1:nVIGvgyzw595SUSUE6tvCp3YYTeHs15MvlmU87WwIik= From 27025a5f0c1500de22968ca4a3f1f32a46e925c4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 23 May 2026 06:08:49 +0530 Subject: [PATCH 10/14] build(deps): bump actions/github-script from 7 to 9 in the actions group (#88) Bumps the actions group with 1 update: [actions/github-script](https://github.com/actions/github-script). Updates `actions/github-script` from 7 to 9 - [Release notes](https://github.com/actions/github-script/releases) - [Commits](https://github.com/actions/github-script/compare/v7...v9) --- updated-dependencies: - dependency-name: actions/github-script dependency-version: '9' dependency-type: direct:production update-type: version-update:semver-major dependency-group: actions ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/enforce-issue-cap.yml | 2 +- .github/workflows/pr-commands.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/enforce-issue-cap.yml b/.github/workflows/enforce-issue-cap.yml index 45c58c1..f4ed4cc 100644 --- a/.github/workflows/enforce-issue-cap.yml +++ b/.github/workflows/enforce-issue-cap.yml @@ -35,7 +35,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Find over-cap contributors and trim - uses: actions/github-script@v7 + uses: actions/github-script@v9 with: script: | const repo = { owner: context.repo.owner, repo: context.repo.repo }; diff --git a/.github/workflows/pr-commands.yml b/.github/workflows/pr-commands.yml index a48903e..8e610fb 100644 --- a/.github/workflows/pr-commands.yml +++ b/.github/workflows/pr-commands.yml @@ -42,7 +42,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Parse and dispatch - uses: actions/github-script@v7 + uses: actions/github-script@v9 with: script: | const body = context.payload.comment.body || ''; From 57efe1fa7c2ea5e522df7c4cb17457e82edb85db Mon Sep 17 00:00:00 2001 From: Sanjaykumar-2005 Date: Sun, 24 May 2026 20:13:57 +0530 Subject: [PATCH 11/14] feat: add initial Helm chart for Kerno and configure Artifact Hub metadata --- artifacthub-repo.yml | 11 +++++++++-- deploy/helm/kerno/Chart.yaml | 3 +++ deploy/helm/kerno/README.md | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/artifacthub-repo.yml b/artifacthub-repo.yml index 30639f6..6c65b41 100644 --- a/artifacthub-repo.yml +++ b/artifacthub-repo.yml @@ -1,4 +1,11 @@ # Artifact Hub repository metadata # https://artifacthub.io/docs/topics/repositories/helm-charts/ -# repositoryID is generated by Artifact Hub after the repo is submitted via the UI. -# Add it here once the submission is complete. +# +# To register this chart on Artifact Hub: +# 1. 
Go to https://artifacthub.io and sign in. +# 2. Add a new Helm Charts repository with the URL: +# https://optiqor.github.io/kerno-charts +# 3. Artifact Hub will generate a repositoryID (UUID). +# 4. Uncomment and fill in the lines below, then open a PR. +# +# repositoryID: diff --git a/deploy/helm/kerno/Chart.yaml b/deploy/helm/kerno/Chart.yaml index cdc5d70..acb9530 100644 --- a/deploy/helm/kerno/Chart.yaml +++ b/deploy/helm/kerno/Chart.yaml @@ -21,3 +21,6 @@ annotations: artifacthub.io/signKey: "" artifacthub.io/containsSecurityUpdates: "false" artifacthub.io/prerelease: "false" + # artifacthub.io/maintainers annotation intentionally omitted: + # Artifact Hub reads the standard maintainers: field above automatically. + # icon: intentionally omitted until a proper static PNG/SVG logo is available. diff --git a/deploy/helm/kerno/README.md b/deploy/helm/kerno/README.md index 49409b5..ba56c31 100644 --- a/deploy/helm/kerno/README.md +++ b/deploy/helm/kerno/README.md @@ -11,7 +11,7 @@ Kerno is an eBPF-based kernel observability engine for Kubernetes. It diagnoses ## Installation ```bash -helm repo add kerno https://optiqor.github.io/kerno +helm repo add kerno https://optiqor.github.io/kerno-charts helm repo update helm install kerno kerno/kerno -n kerno-system --create-namespace ``` From ebbdff82694307cee37cb6ab507ee84834d0f2a9 Mon Sep 17 00:00:00 2001 From: Sanjaykumar-2005 Date: Sat, 16 May 2026 08:35:42 +0530 Subject: [PATCH 12/14] feat(release): Helm chart distribution to Artifact Hub and multi-arch builds - Added helm-release.yml workflow for automated chart publishing. - Added artifacthub-repo.yml and Chart.yaml annotations for Artifact Hub. - Updated ci.yml for multi-arch (amd64/arm64) Docker builds. - Added Helm chart README and updated main README instructions. 
Signed-off-by: Sanjaykumar.M --- .github/workflows/ci.yml | 4 +++ .github/workflows/helm-release.yml | 49 +++++++++++++++++++++++++++ README.md | 8 +++-- artifacthub-repo.yml | 6 ++++ deploy/helm/kerno/Chart.yaml | 9 +++++ deploy/helm/kerno/README.md | 54 ++++++++++++++++++++++++++++++ 6 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/helm-release.yml create mode 100644 artifacthub-repo.yml create mode 100644 deploy/helm/kerno/README.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cb96740..c4f5e2d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -152,6 +152,9 @@ jobs: steps: - uses: actions/checkout@v6 + - name: Set up QEMU + uses: docker/setup-qemu-action@v4 + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v4 @@ -160,6 +163,7 @@ jobs: with: context: . push: false + platforms: linux/amd64,linux/arm64 tags: ghcr.io/optiqor/kerno:ci cache-from: type=gha cache-to: type=gha,mode=max diff --git a/.github/workflows/helm-release.yml b/.github/workflows/helm-release.yml new file mode 100644 index 0000000..d1119ef --- /dev/null +++ b/.github/workflows/helm-release.yml @@ -0,0 +1,49 @@ +# Copyright 2026 Optiqor contributors +# SPDX-License-Identifier: Apache-2.0 + +name: Helm Chart Release + +on: + push: + tags: + - "v*" + +permissions: + contents: write + +jobs: + release-chart: + name: Release Chart + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Configure Git + run: | + git config user.name "$GITHUB_ACTOR" + git config user.email "$GITHUB_ACTOR@users.noreply.github.com" + + - name: Install Helm + uses: azure/setup-helm@v4 + + - name: Run chart-releaser + uses: helm/chart-releaser-action@v1 + with: + charts_dir: deploy/helm + env: + CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" + + # After chart-releaser finishes, it has updated gh-pages. + # We need to ensure artifacthub-repo.yml is in the root of gh-pages. 
+ - name: Push Artifact Hub metadata + run: | + git checkout gh-pages + cp .github/artifacthub-repo.yml ./artifacthub-repo.yml || cp artifacthub-repo.yml ./artifacthub-repo.yml || true + if [ -f artifacthub-repo.yml ]; then + git add artifacthub-repo.yml + git commit -m "chore: update artifacthub-repo.yml" || echo "No changes to commit" + git push origin gh-pages + fi diff --git a/README.md b/README.md index f71c00b..7593b0d 100644 --- a/README.md +++ b/README.md @@ -118,12 +118,16 @@ Kerno is the only eBPF tool in the Kubernetes ecosystem that produces a ranked, ### 1 · Kubernetes (primary) ```bash -helm install kerno ./deploy/helm/kerno \ +helm repo add kerno https://optiqor.github.io/kerno +helm repo update +helm install kerno kerno/kerno \ -n kerno-system --create-namespace ``` Within 30 seconds Kerno is running as a DaemonSet on every node, watching the kernel via eBPF, exposing `/metrics` for Prometheus, and ready for `kerno doctor`. +> **Tip:** If you prefer to install from a local clone: `helm install kerno ./deploy/helm/kerno -n kerno-system --create-namespace` + ```bash # Cluster-wide incident report - 30 seconds of real kernel data kubectl -n kerno-system exec ds/kerno -- kerno doctor @@ -167,7 +171,7 @@ docker run --rm --privileged --pid=host \ ghcr.io/optiqor/kerno:latest doctor ``` -Multi-arch (`linux/amd64`, `linux/arm64`) images published to GHCR on every release. +Multi-arch (`linux/amd64`, `linux/arm64`) images published to GHCR on every release. Graviton, Apple Silicon, and Raspberry Pi clusters work out of the box. 
### Shell Completion diff --git a/artifacthub-repo.yml b/artifacthub-repo.yml new file mode 100644 index 0000000..54bccf2 --- /dev/null +++ b/artifacthub-repo.yml @@ -0,0 +1,6 @@ +# Artifact Hub repository metadata +# https://artifacthub.io/docs/topics/repositories/helm-charts/ +repositoryID: 00000000-0000-0000-0000-000000000000 # Placeholder: update with real ID after manual submission +owners: + - name: Shivam Kumar + email: btwshivam@example.com # Replace with real maintainer email if known diff --git a/deploy/helm/kerno/Chart.yaml b/deploy/helm/kerno/Chart.yaml index 4e527b5..90ec66e 100644 --- a/deploy/helm/kerno/Chart.yaml +++ b/deploy/helm/kerno/Chart.yaml @@ -16,3 +16,12 @@ keywords: - kernel - prometheus - monitoring +annotations: + artifacthub.io/license: Apache-2.0 + artifacthub.io/signKey: "" + artifacthub.io/containsSecurityUpdates: "false" + artifacthub.io/prerelease: "false" + artifacthub.io/maintainers: | + - name: Shivam Kumar + github: btwshivam +icon: https://raw.githubusercontent.com/optiqor/kerno/main/demo.gif # Using demo.gif as a placeholder until a logo is available diff --git a/deploy/helm/kerno/README.md b/deploy/helm/kerno/README.md new file mode 100644 index 0000000..49409b5 --- /dev/null +++ b/deploy/helm/kerno/README.md @@ -0,0 +1,54 @@ +# Kerno Helm Chart + +Kerno is an eBPF-based kernel observability engine for Kubernetes. It diagnoses production incidents by watching kernel signals (disk, TCP, OOM, scheduler) and providing a ranked diagnostic report. + +## Prerequisites + +- Kubernetes 1.22+ +- Helm 3.8.0+ +- Linux kernel 5.8+ with BTF enabled (standard on EKS, GKE, AKS, etc.) + +## Installation + +```bash +helm repo add kerno https://optiqor.github.io/kerno +helm repo update +helm install kerno kerno/kerno -n kerno-system --create-namespace +``` + +## Configuration + +The following table lists the most common configurable parameters of the Kerno chart and their default values. 
+ +| Parameter | Description | Default | +|-----------|-------------|---------| +| `image.repository` | Image repository | `ghcr.io/optiqor/kerno` | +| `image.tag` | Image tag | `{{ .Chart.AppVersion }}` | +| `resources.requests.cpu` | CPU requests | `100m` | +| `resources.requests.memory` | Memory requests | `128Mi` | +| `prometheus.enabled` | Enable Prometheus metrics | `true` | +| `collectors.syscallLatency` | Enable syscall latency collector | `true` | +| `collectors.tcpMonitor` | Enable TCP monitor collector | `true` | +| `collectors.oomTrack` | Enable OOM tracker | `true` | +| `collectors.diskIO` | Enable Disk I/O collector | `true` | +| `collectors.schedDelay` | Enable scheduler delay collector | `true` | +| `collectors.fdTrack` | Enable file descriptor tracker | `true` | + +For a full list of parameters, see [values.yaml](values.yaml). + +## Examples + +### Enable AI Diagnosis +```bash +helm install kerno kerno/kerno \ + --set extraEnv[0].name=KERNO_AI_API_KEY \ + --set extraEnv[0].value=your-key \ + --set extraEnv[1].name=KERNO_AI_PROVIDER \ + --set extraEnv[1].value=anthropic +``` + +## Version Compatibility Matrix + +| Kerno Version | K8s Version | Kernel Version | +|---------------|-------------|----------------| +| v0.1.x | 1.22 - 1.31 | 5.8+ | From b9662e5997064e403683b684830ede6f36e87d2b Mon Sep 17 00:00:00 2001 From: Sanjaykumar-2005 Date: Sun, 17 May 2026 08:03:44 +0530 Subject: [PATCH 13/14] feat: implement Helm chart packaging and automated release workflow via GitHub Actions --- .github/workflows/helm-release.yml | 11 +++---- README.md | 49 +---------------------------- artifacthub-repo.yml | 6 ++-- deploy/helm/kerno/Chart.yaml | 50 ++++++++++++++---------------- 4 files changed, 31 insertions(+), 85 deletions(-) diff --git a/.github/workflows/helm-release.yml b/.github/workflows/helm-release.yml index d1119ef..91b973b 100644 --- a/.github/workflows/helm-release.yml +++ b/.github/workflows/helm-release.yml @@ -40,10 +40,9 @@ jobs: # We need to 
ensure artifacthub-repo.yml is in the root of gh-pages. - name: Push Artifact Hub metadata run: | + cp artifacthub-repo.yml /tmp/ahr.yml git checkout gh-pages - cp .github/artifacthub-repo.yml ./artifacthub-repo.yml || cp artifacthub-repo.yml ./artifacthub-repo.yml || true - if [ -f artifacthub-repo.yml ]; then - git add artifacthub-repo.yml - git commit -m "chore: update artifacthub-repo.yml" || echo "No changes to commit" - git push origin gh-pages - fi + cp /tmp/ahr.yml ./artifacthub-repo.yml + git add artifacthub-repo.yml + git commit -m "chore: update artifacthub-repo.yml" || echo "No changes to commit" + git push origin gh-pages diff --git a/README.md b/README.md index 7593b0d..87dc97a 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ Kerno is the only eBPF tool in the Kubernetes ecosystem that produces a ranked, ### 1 · Kubernetes (primary) ```bash -helm repo add kerno https://optiqor.github.io/kerno +helm repo add kerno https://optiqor.github.io/kerno-charts helm repo update helm install kerno kerno/kerno \ -n kerno-system --create-namespace @@ -173,52 +173,6 @@ docker run --rm --privileged --pid=host \ Multi-arch (`linux/amd64`, `linux/arm64`) images published to GHCR on every release. Graviton, Apple Silicon, and Raspberry Pi clusters work out of the box. 
-### Shell Completion - -Enable tab completion for your shell: - -**Bash:** - -```bash -# Load completions for current session -source <(kerno completion bash) - -# Persist across sessions -echo 'source <(kerno completion bash)' >> ~/.bashrc -``` - -**Zsh:** - -```bash -# Enable completions (add to ~/.zshrc if not already present) -echo 'autoload -U compinit; compinit' >> ~/.zshrc - -# Load completions for current session -autoload -U compinit && compinit -kerno completion zsh > "${fpath[1]}/_kerno" - -# Persist across sessions - run once, then start new shell -kerno completion zsh > "${fpath[1]}/_kerno" -``` - -**Fish:** - -```bash -# Load completions for current session -kerno completion fish | source - -# Persist across sessions -kerno completion fish > ~/.config/fish/completions/kerno.fish -``` - -**PowerShell:** - -```powershell -# Add to your PowerShell profile -kerno completion powershell > kerno.ps1 -. ./kerno.ps1 -``` - --- ## Kubernetes Deployment @@ -648,7 +602,6 @@ make test-race # Run with race detector make lint # golangci-lint make check # vet + test + lint make verify # Comprehensive 13-phase production-readiness check -make manpage # Generate man pages for all CLI commands make demo # Record demo.gif via vhs (needs vhs + ttyd + ffmpeg) make demo-cast # Record demo.cast via asciinema (alternative to vhs) make docker # Build Docker image diff --git a/artifacthub-repo.yml b/artifacthub-repo.yml index 54bccf2..30639f6 100644 --- a/artifacthub-repo.yml +++ b/artifacthub-repo.yml @@ -1,6 +1,4 @@ # Artifact Hub repository metadata # https://artifacthub.io/docs/topics/repositories/helm-charts/ -repositoryID: 00000000-0000-0000-0000-000000000000 # Placeholder: update with real ID after manual submission -owners: - - name: Shivam Kumar - email: btwshivam@example.com # Replace with real maintainer email if known +# repositoryID is generated by Artifact Hub after the repo is submitted via the UI. +# Add it here once the submission is complete. 
diff --git a/deploy/helm/kerno/Chart.yaml b/deploy/helm/kerno/Chart.yaml index 90ec66e..cdc5d70 100644 --- a/deploy/helm/kerno/Chart.yaml +++ b/deploy/helm/kerno/Chart.yaml @@ -1,27 +1,23 @@ -apiVersion: v2 -name: kerno -description: eBPF-based kernel observability engine for Kubernetes -type: application -version: 0.1.0 -appVersion: "0.1.0" -home: https://github.com/optiqor/kerno -sources: - - https://github.com/optiqor/kerno -maintainers: - - name: Shivam Kumar - url: https://github.com/btwshivam -keywords: - - ebpf - - observability - - kernel - - prometheus - - monitoring -annotations: - artifacthub.io/license: Apache-2.0 - artifacthub.io/signKey: "" - artifacthub.io/containsSecurityUpdates: "false" - artifacthub.io/prerelease: "false" - artifacthub.io/maintainers: | - - name: Shivam Kumar - github: btwshivam -icon: https://raw.githubusercontent.com/optiqor/kerno/main/demo.gif # Using demo.gif as a placeholder until a logo is available +apiVersion: v2 +name: kerno +description: eBPF-based kernel observability engine for Kubernetes +type: application +version: 0.1.0 +appVersion: "0.1.0" +home: https://github.com/optiqor/kerno +sources: + - https://github.com/optiqor/kerno +maintainers: + - name: Shivam Kumar + url: https://github.com/btwshivam +keywords: + - ebpf + - observability + - kernel + - prometheus + - monitoring +annotations: + artifacthub.io/license: Apache-2.0 + artifacthub.io/signKey: "" + artifacthub.io/containsSecurityUpdates: "false" + artifacthub.io/prerelease: "false" From ca08b9905ad444a5d9833dbcf8b13e284e8bf837 Mon Sep 17 00:00:00 2001 From: Sanjaykumar-2005 Date: Sun, 24 May 2026 20:13:57 +0530 Subject: [PATCH 14/14] feat: add initial Helm chart for Kerno and configure Artifact Hub metadata --- artifacthub-repo.yml | 11 +++++++++-- deploy/helm/kerno/Chart.yaml | 3 +++ deploy/helm/kerno/README.md | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/artifacthub-repo.yml b/artifacthub-repo.yml index 30639f6..6c65b41 100644 --- 
a/artifacthub-repo.yml +++ b/artifacthub-repo.yml @@ -1,4 +1,11 @@ # Artifact Hub repository metadata # https://artifacthub.io/docs/topics/repositories/helm-charts/ -# repositoryID is generated by Artifact Hub after the repo is submitted via the UI. -# Add it here once the submission is complete. +# +# To register this chart on Artifact Hub: +# 1. Go to https://artifacthub.io and sign in. +# 2. Add a new Helm Charts repository with the URL: +# https://optiqor.github.io/kerno-charts +# 3. Artifact Hub will generate a repositoryID (UUID). +# 4. Uncomment and fill in the lines below, then open a PR. +# +# repositoryID: diff --git a/deploy/helm/kerno/Chart.yaml b/deploy/helm/kerno/Chart.yaml index cdc5d70..acb9530 100644 --- a/deploy/helm/kerno/Chart.yaml +++ b/deploy/helm/kerno/Chart.yaml @@ -21,3 +21,6 @@ annotations: artifacthub.io/signKey: "" artifacthub.io/containsSecurityUpdates: "false" artifacthub.io/prerelease: "false" + # artifacthub.io/maintainers annotation intentionally omitted: + # Artifact Hub reads the standard maintainers: field above automatically. + # icon: intentionally omitted until a proper static PNG/SVG logo is available. diff --git a/deploy/helm/kerno/README.md b/deploy/helm/kerno/README.md index 49409b5..ba56c31 100644 --- a/deploy/helm/kerno/README.md +++ b/deploy/helm/kerno/README.md @@ -11,7 +11,7 @@ Kerno is an eBPF-based kernel observability engine for Kubernetes. It diagnoses ## Installation ```bash -helm repo add kerno https://optiqor.github.io/kerno +helm repo add kerno https://optiqor.github.io/kerno-charts helm repo update helm install kerno kerno/kerno -n kerno-system --create-namespace ```