diff --git a/README.md b/README.md index 3931482..bc35cc8 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ This repository contains a **working MVP**: - Spend dashboard today -![Spend dashboard today](screenshots/Spend_Dashboard_today.png) +![Spend dashboard today](screenshots/Spend_Dashboard_Today.png) - Spend dashboard 7 days @@ -211,7 +211,7 @@ See `ARCHITECTURE.md`. ## Prerequisites -- Node.js 20.x +- Node.js 20.x–24.x (Node 26+ not yet supported) - pnpm (Corepack is fine) ## Setup @@ -400,6 +400,44 @@ On session **stop** and/or **exit**, Agents Fleet can capture a git snapshot for - Storage: `session_artifacts` table. - Toggle: set `AGENTS_FLEET_CAPTURE_GIT_ON_END=0` to disable capture. +## Resource metrics + +Agents Fleet ships a `scripts/metrics` script that captures CPU, memory, swap, load, network I/O, disk I/O, open file descriptors, and SQLite DB size — scoped to AgentFleet processes only. + +```bash +./scripts/metrics # one-shot pretty print +./scripts/metrics --watch # live refresh every 3s +./scripts/metrics --log # continuous CSV log to data/metrics_.csv (every 5s) +``` + +To tail the log live while a session runs: +```bash +# Terminal 1 +./scripts/metrics --log + +# Terminal 2 +tail -f data/metrics_.csv +``` + +**Measured profile (Apple M4 Pro, 24GB RAM):** + +| Scenario | CPU% | Memory | +|---|---|---| +| Idle (server + vite only) | 0–1% | ~165MB | +| Claude Shell session active | 1–4% | ~788MB | +| Claude SDK tool calls firing | 3–17% | 600–665MB | +| Git diff capture on stop | 12–47% | spike, clears fast | +| LiteLLM / Spend Analytics | 0–3% | ~165MB | + +- Baseline footprint is tiny — 165MB, <1% CPU when no agent is running +- Memory is almost entirely the Claude/Codex process itself, not AgentFleet overhead +- Git diff capture is the heaviest single event (~12% typical, up to 47% if multiple sessions stop together) — lasts <10s and clears cleanly +- No memory leaks observed — memory returns to baseline after every session exits +- Swap usage unchanged throughout — AgentFleet does not add swap pressure +- Data dir grows ~3MB per SDK session — worth monitoring on frequent use + +> GPU utilization is not captured without `sudo`. On Apple Silicon (unified memory) run `sudo powermetrics --samplers gpu_power` separately if needed. + ## Scripts - `pnpm dev:one` installs deps if needed and runs dev for all workspaces (web + server). - `pnpm dev` runs dev for all workspaces (web + server) in parallel. @@ -414,6 +452,7 @@ COREPACK_HOME="$PWD/.corepack" pnpm -C apps/server test ## Notes - If you see Corepack cache permission errors, the `COREPACK_HOME="$PWD/.corepack"` prefix keeps Corepack’s cache inside the repo. +- **Node version:** Node 20–24 are supported. Node 26+ is blocked by `@homebridge/node-pty-prebuilt-multiarch` (`>=18 <25`). Node 22 and 24 work fine. ## Data location - SQLite DB: `data/agents_fleet.sqlite` (local only; do not commit). diff --git a/ROADMAP.md b/ROADMAP.md index e859cbd..faade18 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -10,8 +10,8 @@ ## Next (Agent mission control) 1. **Multiple sessions management** — Batch-stop, group by repo, launch parallel sessions from the UI. Core to the "fleet" value prop. 2. **Per-session artifacts UX** — View/export bundle (diff, changed files list, PTY replay export). -3. **Model pricing configurable** — env/JSON override instead of hardcoded table. -4. **Budget accuracy hardening** — Model-specific pricing and SDK-reported usage everywhere; add tests. +3. **Budget accuracy hardening** — Model-specific pricing and SDK-reported usage everywhere; add tests. +4. **System resource monitoring** — CPU, memory, and GPU usage live in the UI via `systeminformation`. Per-session PID tracking to show exactly what each agent consumes. ## Later - Paste/attachments in Claude (SDK) and LiteLLM chat (images, files via Anthropic Files API). diff --git a/scripts/metrics b/scripts/metrics new file mode 100755 index 0000000..a3e14cd --- /dev/null +++ b/scripts/metrics @@ -0,0 +1,161 @@ +#!/usr/bin/env bash +# Agents Fleet — resource metrics snapshot +# Usage: ./scripts/metrics (pretty print, one-shot) +# ./scripts/metrics --watch (refresh every 3s) +# ./scripts/metrics --csv (CSV output, one-shot) +# ./scripts/metrics --log (continuous CSV logging to data/metrics_.csv, every 5s) + +set -euo pipefail + +WATCH=false +CSV=false +LOG=false +for arg in "$@"; do + case $arg in + --watch) WATCH=true ;; + --csv) CSV=true ;; + --log) LOG=true; CSV=true ;; + esac +done + +REPO_ROOT=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd) +PATTERNS=( + "tsx.*${REPO_ROOT}/apps/server" + "node.*${REPO_ROOT}/apps/server" + "${REPO_ROOT}.*vite" + "claude" + "codex" +) + +header() { + printf "%-6s %-40s %6s %8s %8s\n" "PID" "PROCESS" "CPU%" "MEM(MB)" "THREADS" + printf "%-6s %-40s %6s %8s %8s\n" "------" "----------------------------------------" "------" "--------" "-------" +} + +snapshot() { + local timestamp + timestamp=$(date '+%Y-%m-%d %H:%M:%S') + + # Collect matching PIDs + local pids=() + for pattern in "${PATTERNS[@]}"; do + while IFS= read -r pid; do + [[ -n "$pid" ]] && pids+=("$pid") + done < <(pgrep -f "$pattern" 2>/dev/null || true) + done + + local unique_pids + unique_pids=$(printf '%s\n' "${pids[@]}" | sort -u) + + if [[ -z "$unique_pids" ]]; then + $CSV || echo "[$timestamp] No Agents Fleet processes found. Is it running? (pnpm dev:one)" + return + fi + + # Per-process stats + local total_cpu=0 total_mem=0 + local proc_lines="" + + while IFS= read -r pid; do + [[ -z "$pid" ]] && continue + local stats + stats=$(ps -p "$pid" -o pid=,pcpu=,rss=,comm= 2>/dev/null || true) + [[ -z "$stats" ]] && continue + local p_pid p_cpu p_rss p_comm p_threads p_mem p_name + read -r p_pid p_cpu p_rss p_comm <<< "$stats" + p_threads=$(ps -M "$pid" 2>/dev/null | tail -n +2 | wc -l | tr -d ' ' || echo "?") + p_mem=$(awk "BEGIN {printf \"%.1f\", $p_rss/1024}") + p_name=$(ps -p "$pid" -o args= 2>/dev/null | sed 's|.*/||' | cut -c1-40 || echo "$p_comm") + total_cpu=$(awk "BEGIN {printf \"%.1f\", $total_cpu + $p_cpu}") + total_mem=$(awk "BEGIN {printf \"%.1f\", $total_mem + $p_mem}") + proc_lines+=$(printf "%-6s %-40s %6s %8s %8s\n" "$p_pid" "$p_name" "$p_cpu" "$p_mem" "$p_threads")$'\n' + done <<< "$unique_pids" + + # System metrics + local sys_mem_total sys_mem_used swap_total swap_used load net_in net_out disk_read disk_write proc_total proc_running gpu_info + sys_mem_total=$(sysctl -n hw.memsize 2>/dev/null | awk '{printf "%.0f", $1/1024/1024}' || echo "?") + sys_mem_used=$(vm_stat 2>/dev/null | awk ' + /Pages active/ { active=$3 } + /Pages wired down/ { wired=$4 } + /Pages occupied by compressor/ { compressed=$5 } + END { printf "%.0f", (active+wired+compressed)*4096/1024/1024 } + ' | tr -d '.' || echo "?") + swap_total=$(sysctl -n vm.swapusage 2>/dev/null | awk '{print $3}' | tr -d 'M' || echo "?") + swap_used=$(sysctl -n vm.swapusage 2>/dev/null | awk '{print $6}' | tr -d 'M' || echo "?") + load=$(sysctl -n vm.loadavg 2>/dev/null | awk '{print $2, $3, $4}' || uptime | awk -F'load averages:' '{print $2}') + net_in=$(netstat -ib 2>/dev/null | awk 'NR>1 && $1!~/lo/ && $5~/^[0-9]+$/ {sum+=$7} END {printf "%.1f", sum/1024/1024}') + net_out=$(netstat -ib 2>/dev/null | awk 'NR>1 && $1!~/lo/ && $5~/^[0-9]+$/ {sum+=$10} END {printf "%.1f", sum/1024/1024}') + disk_read=$(iostat -d 2>/dev/null | awk 'NR==3{printf "%.1f", $3}' || echo "?") + disk_write=$(iostat -d 2>/dev/null | awk 'NR==3{printf "%.1f", $4}' || echo "?") + proc_total=$(ps aux 2>/dev/null | wc -l | tr -d ' ') + proc_running=$(ps aux 2>/dev/null | awk '$8=="R"' | wc -l | tr -d ' ') + gpu_info=$(system_profiler SPDisplaysDataType 2>/dev/null | awk '/Chipset Model/{print $3,$4,$5}' | head -1 || echo "unavailable") + + if $CSV; then + # One summary row per snapshot + local load1 load5 load15 + read -r load1 load5 load15 <<< "$load" + local db_mb data_mb + db_mb=$(du -sk "${REPO_ROOT}/data/agents_fleet.sqlite" 2>/dev/null | awk '{printf "%.1f", $1/1024}' || echo "0") + data_mb=$(du -sk "${REPO_ROOT}/data/" 2>/dev/null | awk '{printf "%.1f", $1/1024}' || echo "0") + echo "$timestamp,$total_cpu,$total_mem,$sys_mem_used,$sys_mem_total,$swap_used,$swap_total,$load1,$load5,$load15,$proc_total,$proc_running,$net_in,$net_out,$disk_read,$disk_write,$db_mb,$data_mb" + else + echo "[$timestamp]" + header + printf "%s" "$proc_lines" + printf "%-6s %-40s %6s %8s\n" "------" "----------------------------------------" "------" "--------" + printf "%-6s %-40s %6s %8s\n" "TOTAL" "" "$total_cpu%" "${total_mem}MB" + + echo "" + echo "━━━ Per-process detail ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + while IFS= read -r pid; do + [[ -z "$pid" ]] && continue + local p_cmd fd_count net_count + p_cmd=$(ps -p "$pid" -o comm= 2>/dev/null | sed 's|.*/||' || echo "?") + fd_count=$(lsof -p "$pid" 2>/dev/null | wc -l | tr -d ' ') + net_count=$(lsof -p "$pid" -i 2>/dev/null | grep -c "ESTABLISHED\|LISTEN" || echo "0") + printf "PID %-6s %-20s open FDs: %-6s network connections: %s\n" "$pid" "$p_cmd" "$fd_count" "$net_count" + done <<< "$unique_pids" + + echo "" + echo "━━━ AI Watchtower data ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + printf "SQLite DB: %s\n" "$(du -sh "${REPO_ROOT}/data/agents_fleet.sqlite" 2>/dev/null | awk '{print $1}' || echo 'not found')" + printf "Data dir: %s total\n" "$(du -sh "${REPO_ROOT}/data/" 2>/dev/null | awk '{print $1}' || echo 'not found')" + + echo "" + echo "━━━ System ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + printf "RAM: %sMB used / %sMB total\n" "$sys_mem_used" "$sys_mem_total" + printf "Swap: %sMB used / %sMB total\n" "$swap_used" "$swap_total" + printf "Load avg: %s (1m / 5m / 15m)\n" "$load" + printf "Processes: %s total, %s running\n" "$proc_total" "$proc_running" + + echo "" + echo "━━━ I/O (since boot) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + printf "Network: %.1fMB in / %.1fMB out\n" "$net_in" "$net_out" + printf "Disk: %sMB/s read / %sMB/s write\n" "$disk_read" "$disk_write" + + echo "" + echo "━━━ GPU ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + printf "Chip: %s (unified memory)\n" "$gpu_info" + printf "Note: GPU utilization requires: sudo powermetrics --samplers gpu_power\n" + fi +} + +if $LOG; then + LOG_FILE="${REPO_ROOT}/data/metrics_$(date +%Y%m%d_%H%M%S).csv" + echo "timestamp,total_cpu_pct,total_mem_mb,ram_used_mb,ram_total_mb,swap_used_mb,swap_total_mb,load_1m,load_5m,load_15m,procs_total,procs_running,net_in_mb,net_out_mb,disk_read_mbs,disk_write_mbs,db_size_mb,data_dir_mb" > "$LOG_FILE" + echo "Logging to $LOG_FILE (every 5s) — Ctrl+C to stop" + echo "Tip: tail -f $LOG_FILE" + while true; do + snapshot >> "$LOG_FILE" + sleep 5 + done +elif $WATCH; then + while true; do + clear + snapshot + sleep 3 + done +else + snapshot +fi