From bc559c7f524d6435a93fca0007a6b79c6d67a0c6 Mon Sep 17 00:00:00 2001
From: donghyeon shin <donghyun4591@gmail.com>
Date: Tue, 20 Jan 2026 20:21:22 +0900
Subject: [PATCH 1/2] feat: add Alertmanager service to Docker Compose
 configuration

---
 infra/docker/docker-compose.local.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/infra/docker/docker-compose.local.yml b/infra/docker/docker-compose.local.yml
index 594b633..d9035de 100644
--- a/infra/docker/docker-compose.local.yml
+++ b/infra/docker/docker-compose.local.yml
@@ -78,7 +78,7 @@ services:
       - '--config.file=/etc/alertmanager/alertmanager.yml'
       - '--storage.path=/alertmanager'
     volumes:
-      - ../alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
+      - ../alertmanager:/etc/alertmanager
       - alertmanager_data:/alertmanager
     ports:
       - "9093:9093"

From 409e7084897fe770b3ffb5dbb29fa066087d43af Mon Sep 17 00:00:00 2001
From: donghyeon shin <donghyun4591@gmail.com>
Date: Fri, 23 Jan 2026 19:07:09 +0900
Subject: [PATCH 2/2] feat: add initial CLAUDE.md documentation and settings
 configuration

---
 .claude/settings.local.json |  48 +++++
 .gitignore                  |   5 -
 CLAUDE.md                   | 348 ++++++++++++++++++++++++++++++++++++
 3 files changed, 396 insertions(+), 5 deletions(-)
 create mode 100644 .claude/settings.local.json
 create mode 100644 CLAUDE.md

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
new file mode 100644
index 0000000..4f7873c
--- /dev/null
+++ b/.claude/settings.local.json
@@ -0,0 +1,48 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(mkdir:*)",
+      "Bash(find:*)",
+      "Bash(pkill:*)",
+      "Bash(lsof:*)",
+      "Bash(curl:*)",
+      "Bash(netstat:*)",
+      "Bash(ss:*)",
+      "Bash(npm install:*)",
+      "Bash(sidebar_temp.tsx)",
+      "Bash(cat:*)",
+      "Bash(npm run dev:*)",
+      "Bash(docker compose:*)",
+      "Bash(docker exec:*)",
+      "Bash(git add:*)",
+      "Bash(xargs kill -9)",
+      "Bash(python3:*)",
+      "Bash(docker logs:*)",
+      "Bash(chmod:*)",
+      "Bash(docker volume rm:*)",
+      "Bash(docker stop:*)",
+      "Bash(docker rm:*)",
+      "Bash(docker volume:*)",
+      "Bash(xargs:*)",
+      "Bash(docker ps:*)",
+      "Bash(for:*)",
+      "Bash(do echo \"=== $tag ===\")",
+      "Bash(done)",
+      "Bash(ls:*)",
+      "Bash(do)",
+      "Bash(echo:*)",
+      "Bash(bash scripts/test-v0.6.0.sh:*)",
+      "Bash(dos2unix:*)",
+      "Bash(gh pr list:*)",
+      "Bash(git show-ref:*)",
+      "Bash(git tag:*)",
+      "Bash(git pull:*)",
+      "WebSearch",
+      "Bash(git checkout:*)",
+      "Bash(git reset:*)",
+      "Bash(git commit:*)"
+    ],
+    "deny": [],
+    "ask": []
+  }
+}
diff --git a/.gitignore b/.gitignore
index d135201..0b34e84 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,11 +28,6 @@ venv/
 # Local env configs
 configs/env/.env.local
 
-# Claude
-CLAUDE.md
-.claude/settings.local.json
-
-
 # node
 node_modules/
 .next/
\ No newline at end of file
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..36b9c10
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,348 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+LLM-Quality-Observer is a microservices-based MLOps platform for monitoring and evaluating LLM response quality. The system logs LLM interactions, evaluates them using rule-based and LLM-as-a-judge approaches, and provides dashboards for visualization and monitoring.
+
+Current status: v0.5.0 with Gateway API + Evaluator + Dashboard + Prometheus + Grafana operational.
+
+## Architecture
+
+Full-stack microservices architecture with monitoring:
+
+```
+Client → Gateway API → Postgres ← Evaluator Service
+            ↓             ↓            ↓
+         Dashboard    Prometheus → Grafana
+```
+
+- **Gateway API** (port 18000): FastAPI service that receives chat requests, calls OpenAI GPT-5 mini, logs to database, exposes Prometheus metrics
+- **Evaluator Service** (port 18001): Batch evaluation service that scores LLM outputs using rule-based and LLM-as-a-judge methods, sends notifications (Slack/Discord/Email), exposes Prometheus metrics
+- **Dashboard Service** (port 8501): Streamlit UI for visualizing quality metrics, latency distributions, and error rates
+- **Postgres** (port 5432): PostgreSQL 16 database with `llm_logs` and `llm_evaluations` tables
+- **Prometheus** (port 9090): Metrics collection and time-series database
+- **Grafana** (port 3000): Monitoring dashboards and visualization platform
+
+## Common Commands
+
+### Local Development (Docker)
+
+```bash
+# Start all services
+cd infra/docker
+docker compose -f docker-compose.local.yml up --build
+
+# Start specific service
+docker compose -f docker-compose.local.yml up gateway-api --build
+
+# Stop all services
+docker compose -f docker-compose.local.yml down
+
+# View logs
+docker compose -f docker-compose.local.yml logs -f gateway-api
+docker compose -f docker-compose.local.yml logs -f evaluator
+```
+
+### Database Operations
+
+```bash
+# Connect to Postgres
+docker exec -it llm-postgres psql -U llm_user -d llm_quality
+
+# View recent logs
+docker exec -it llm-postgres psql -U llm_user -d llm_quality -c "SELECT id, created_at, user_id, LEFT(prompt, 50) AS prompt_snippet, model_version, latency_ms, status FROM llm_logs ORDER BY id DESC LIMIT 10;"
+
+# View evaluations
+docker exec -it llm-postgres psql -U llm_user -d llm_quality -c "SELECT id, log_id, score_overall, score_instruction_following, score_truthfulness, judge_model FROM llm_evaluations ORDER BY id DESC LIMIT 10;"
+```
+
+### Testing Services
+
+```bash
+# Test Gateway API health
+curl http://localhost:18000/health
+
+# Test Gateway API chat endpoint
+curl -X POST "http://localhost:18000/chat" \
+  -H "Content-Type: application/json" \
+  -d '{"prompt": "What is Python?", "user_id": "test-user", "model_version": null}'
+
+# Test Evaluator health
+curl http://localhost:18001/health
+
+# Trigger evaluation batch
+curl -X POST "http://localhost:18001/evaluate-once?limit=5"
+
+# View Dashboard
+# Open browser to http://localhost:8501
+
+# View Prometheus metrics (Gateway)
+curl http://localhost:18000/metrics
+
+# View Prometheus metrics (Evaluator)
+curl http://localhost:18001/metrics
+
+# View Prometheus UI
+# Open browser to http://localhost:9090
+
+# View Grafana Dashboard
+# Open browser to http://localhost:3000 (admin/admin)
+```
+
+### Dependency Management
+
+Each service uses `uv` for dependency management:
+
+```bash
+# Install dependencies for a service
+cd services/gateway-api
+uv sync
+
+# Add a new dependency
+cd services/gateway-api
+uv add <package-name>
+
+# Update dependencies
+uv sync --upgrade
+```
+
+## Key Technical Details
+
+### Gateway API Service (services/gateway-api)
+
+**Entry point**: `app/main.py`
+- `/health`: Health check endpoint
+- `/chat`: Main LLM endpoint that accepts ChatRequest and returns ChatResponse
+- `/metrics`: Prometheus metrics endpoint
+
+**LLM Client** (`app/llm_client.py`):
+- Uses OpenAI Python SDK's `client.responses.create()` API (not the standard chat completions API)
+- Model resolution: Falls back to `OPENAI_MODEL_MAIN` env var if no model specified
+- Returns tuple of `(response_text, latency_ms)`
+- Timing measured using `time.perf_counter()`
+
+**Database** (`app/db.py`, `app/models.py`):
+- SQLAlchemy ORM with `LLMLog` model
+- Tables auto-created on startup via `Base.metadata.create_all(bind=engine)`
+- Fields: id, created_at, user_id, prompt, response, model_version, latency_ms, status
+
+**Configuration** (`app/config.py`):
+- Pydantic Settings loading from environment variables
+- Required: `DATABASE_URL`, `OPENAI_MODEL_MAIN`, `LLM_API_KEY`
+- Optional: `LLM_API_BASE_URL`, `LOG_LEVEL`, `APP_ENV`
+
+**Metrics** (`app/metrics.py`):
+- Prometheus client integration for observability
+- HTTP request metrics: rate, latency (p50/p95/p99), status codes
+- LLM request metrics: call rate, latency by model, success/error tracking
+- Database metrics: query rate, latency by operation and table
+- Middleware automatically captures HTTP request metrics
+
+### Evaluator Service (services/evaluator)
+
+**Entry point**: `app/main.py`
+- `/health`: Health check endpoint
+- `/evaluate-once`: Batch evaluation endpoint (processes up to N unevaluated logs)
+- `/metrics`: Prometheus metrics endpoint
+
+**Evaluation Logic**:
+- `app/rules.py`: Rule-based evaluation (length checks, keyword detection)
+- `app/llm_judge.py`: LLM-as-a-judge evaluation using the model configured in `OPENAI_MODEL_JUDGE` (e.g. `gpt-4o-mini`)
+- Combines scores from both approaches
+- Creates `LLMEvaluation` records with scores and comments
+
+**Scheduler** (`app/scheduler.py`):
+- APScheduler for automated batch evaluation
+- Configurable interval (default: 60 minutes)
+- Processes pending logs in batches
+- Records metrics for each evaluation run
+
+**Notifications** (`app/notifier.py`):
+- Multi-channel notification system: Slack, Discord, Email
+- Low-quality alerts when score falls below threshold
+- Batch evaluation summaries
+- SMTP integration for email (via aiosmtplib)
+
+**Database** (`app/models.py`):
+- Reuses `LLMLog` model from shared schema
+- New `LLMEvaluation` model: log_id (FK), score_overall, score_instruction_following, score_truthfulness, comments, judge_model, raw_judge_response
+
+**Configuration** (`app/config.py`):
+- Uses same `DATABASE_URL` as gateway
+- LLM Judge: `OPENAI_MODEL_JUDGE` for the judge model
+- Scheduler: `ENABLE_AUTO_EVALUATION`, `EVALUATION_INTERVAL_MINUTES`, `EVALUATION_BATCH_SIZE`, `EVALUATION_JUDGE_TYPE`
+- Notifications: `SLACK_WEBHOOK_URL`, `DISCORD_WEBHOOK_URL`, `NOTIFICATION_SCORE_THRESHOLD`
+- Email: `SMTP_HOST`, `SMTP_PORT`, `SMTP_USERNAME`, `SMTP_PASSWORD`, `SMTP_FROM_EMAIL`, `SMTP_TO_EMAILS`
+
+**Metrics** (`app/metrics.py`):
+- Evaluation metrics: rate, duration, score distribution
+- Batch evaluation metrics: runs, logs processed
+- Notification metrics: sent count by channel and status
+- Scheduler metrics: run count, pending logs gauge
+- LLM judge metrics: request rate and latency
+
+### Dashboard Service (services/dashboard)
+
+**Entry point**: `app/main.py`
+- Streamlit application
+- Connects to same Postgres database
+- Visualizes metrics from `llm_logs` and `llm_evaluations` tables
+
+**Pages/Views**:
+- Overview: Summary statistics, recent requests
+- Quality Metrics: Score distributions, trends over time
+- Latency Analysis: p50/p95/p99 latencies by model
+- Model Comparison: Side-by-side model performance
+
+### Prometheus (Monitoring)
+
+**Configuration**: `infra/prometheus/prometheus.yml`
+- Metrics collection from Gateway API and Evaluator services
+- Scrape interval: 15 seconds
+- Targets: gateway-api:8000, evaluator:8000 (note: these are not the host-exposed ports 18000/18001 used in the curl examples)
+- Web UI accessible at http://localhost:9090
+
+**Scrape Targets**:
+- `gateway-api`: Collects HTTP, LLM, and database metrics
+- `evaluator`: Collects evaluation, notification, and scheduler metrics
+- `prometheus`: Self-monitoring
+
+### Grafana (Visualization)
+
+**Configuration**: `infra/grafana/provisioning/`
+- Auto-provisioned Prometheus datasource
+- Pre-configured LLM Quality Observer dashboard
+- Dashboard JSON: `infra/grafana/dashboards/llm-quality-observer.json`
+- Web UI accessible at http://localhost:3000 (admin/admin)
+
+**Dashboard Panels** (14 panels total):
+- Overview stats: request rate, evaluation rate, pending logs, notification rate
+- HTTP performance: request distribution, latency percentiles
+- LLM metrics: requests by model, latency analysis
+- Quality scores: score distribution by judge type
+- Notifications: delivery rates, alert tracking
+- System health: scheduler runs, batch processing
+
+### Environment Configuration
+
+Environment variables are managed through the `.env.local` file in `configs/env/`:
+
+```env
+APP_ENV=local
+LOG_LEVEL=DEBUG
+OPENAI_MODEL_MAIN=gpt-5-mini
+OPENAI_MODEL_JUDGE=gpt-4o-mini
+LLM_API_BASE_URL=https://api.openai.com/v1
+LLM_API_KEY=sk-...
+DATABASE_URL=postgresql://llm_user:llm_password@postgres:5432/llm_quality
+
+# Batch Evaluation Scheduler (v0.4.0+)
+ENABLE_AUTO_EVALUATION=true
+EVALUATION_INTERVAL_MINUTES=60
+EVALUATION_BATCH_SIZE=10
+EVALUATION_JUDGE_TYPE=rule
+
+# Notification Settings (v0.4.0+)
+SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL
+DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/YOUR/WEBHOOK/URL
+NOTIFICATION_SCORE_THRESHOLD=3
+
+# Email Notification Settings (v0.5.0+)
+SMTP_HOST=smtp.gmail.com
+SMTP_PORT=587
+SMTP_USERNAME=your-email@gmail.com
+SMTP_PASSWORD=your-app-password
+SMTP_FROM_EMAIL=your-email@gmail.com
+SMTP_TO_EMAILS=recipient1@example.com,recipient2@example.com
+```
+
+**Important**: `.env.local` is gitignored. Each developer must create their own from the template.
+
+### Docker Configuration
+
+All application services (gateway-api, evaluator, dashboard) follow a similar Dockerfile pattern:
+1. Python 3.12-slim base image
+2. Install system dependencies (build-essential, libpq-dev for psycopg2)
+3. Install `uv` package manager
+4. Copy `pyproject.toml` and run `uv sync --no-dev`
+5. Copy application code
+6. Run with `uv run uvicorn` or `uv run streamlit`
+
+Docker Compose (`infra/docker/docker-compose.local.yml`):
+- Defines 6 core services: postgres, gateway-api, evaluator, dashboard, prometheus, grafana (the compose file also configures Alertmanager on port 9093)
+- Application services depend on postgres
+- Prometheus depends on gateway-api and evaluator
+- Grafana depends on prometheus
+- Application services load `configs/env/.env.local` via `env_file`
+- Exposed ports:
+  - Gateway API: 18000
+  - Evaluator: 18001
+  - Dashboard: 8501
+  - Prometheus: 9090
+  - Grafana: 3000
+  - Postgres: 5432
+
+## Development Workflow
+
+1. **Adding a new feature to Gateway API**:
+   - Modify `services/gateway-api/app/main.py` for new endpoints
+   - Update `app/schemas.py` for new request/response models
+   - Update `app/models.py` if database schema changes
+   - Rebuild: `docker compose -f docker-compose.local.yml up gateway-api --build`
+
+2. **Adding new evaluation criteria**:
+   - Update `services/evaluator/app/rules.py` for rule-based checks
+   - Update `services/evaluator/app/llm_judge.py` for judge prompt changes
+   - Modify `app/models.py` if new score fields needed
+   - Rebuild: `docker compose -f docker-compose.local.yml up evaluator --build`
+
+3. **Adding new dashboard visualizations**:
+   - Modify `services/dashboard/app/main.py`
+   - Use Streamlit components (st.metric, st.line_chart, st.dataframe)
+   - Query data from `llm_logs` or `llm_evaluations` tables
+   - Rebuild: `docker compose -f docker-compose.local.yml up dashboard --build`
+
+## Important Notes
+
+- **OpenAI API**: Gateway uses `client.responses.create()` (the OpenAI Responses API), not `client.chat.completions.create()` (the Chat Completions API). Keep this in mind when changing request or response handling.
+- **Model Resolution**: If a request doesn't specify a model, or uses the placeholder value "string", the gateway falls back to `OPENAI_MODEL_MAIN` from the environment.
+- **Database Initialization**: Tables are auto-created on first run via SQLAlchemy's `create_all()`. No manual migration needed for local development.
+- **Automated Evaluation** (v0.4.0+): Scheduler runs automatically when `ENABLE_AUTO_EVALUATION=true`. Configure interval with `EVALUATION_INTERVAL_MINUTES`.
+- **Notifications** (v0.4.0+): Low-quality alerts sent when score ≤ `NOTIFICATION_SCORE_THRESHOLD`. Supports Slack, Discord, and Email (v0.5.0+).
+- **Metrics** (v0.5.0+): The Gateway API and Evaluator services expose a `/metrics` endpoint for Prometheus scraping. The Grafana dashboard is auto-provisioned at startup.
+- **Error Handling**: LLM failures are logged with status="error" and are not evaluated. Only logs with status="success" are processed by the evaluator.
+
+## Service Dependencies
+
+```
+gateway-api:
+- fastapi, uvicorn
+- sqlalchemy, psycopg2-binary
+- pydantic, pydantic-settings
+- openai
+- httpx, python-dotenv
+- prometheus-client (v0.5.0+)
+
+evaluator:
+- fastapi, uvicorn
+- sqlalchemy, psycopg2-binary
+- pydantic-settings
+- openai
+- apscheduler (v0.4.0+)
+- httpx (v0.4.0+ for notifications)
+- prometheus-client (v0.5.0+)
+- aiosmtplib, email-validator (v0.5.0+ for email)
+
+dashboard:
+- streamlit
+- sqlalchemy, psycopg2-binary
+- pandas, plotly (for visualizations)
+```
+
+All managed via `uv` and defined in each service's `pyproject.toml`.
+
+**Infrastructure Services** (v0.5.0+):
+- **Prometheus**: Official prom/prometheus Docker image
+- **Grafana**: Official grafana/grafana Docker image