diff --git a/.github/workflows/sandbox-verification.yml b/.github/workflows/sandbox-verification.yml new file mode 100644 index 0000000..7c18979 --- /dev/null +++ b/.github/workflows/sandbox-verification.yml @@ -0,0 +1,308 @@ +name: Sandbox Masking Verification + +# Runs the full sandboxed Docker verification suite to prove that OpenDataMask +# correctly anonymises PII while preserving referential integrity. +# +# What it does +# ──────────── +# 1. Builds the backend image from source (with Docker layer caching). +# 2. Starts source_db, target_db, app_db, and backend via docker compose. +# 3. Orchestrates a masking job through the REST API (workspace → connections +# → table config → column generators → job → poll to completion). +# 4. Runs verify.py to perform four automated checks and writes a JUnit XML +# report that is published as a workflow check and uploaded as an artifact. +# 5. Always tears down containers and uploads Docker logs on failure. +# +# Triggers +# ──────── +# • Every push / PR to main. +# • Manual dispatch from the Actions UI (workflow_dispatch). + +on: + push: + branches: [main] + pull_request: + branches: [main] + workflow_dispatch: + +jobs: + sandbox-verification: + name: Sandbox PII Masking Verification + runs-on: ubuntu-latest + timeout-minutes: 30 + + permissions: + contents: read + checks: write # required by dorny/test-reporter to publish check results + + env: + # Sandbox-only secrets — safe to inline here; never reuse in production. 
+ ODM_JWT_SECRET: odm-verification-jwt-secret-sandbox-not-for-production-use-xyz + ODM_ENCRYPTION_KEY: odm-verify-enc-key-sandbox-only + SOURCE_DB_NAME: source_db + SOURCE_DB_USER: source_user + SOURCE_DB_PASS: source_pass + TARGET_DB_NAME: target_db + TARGET_DB_USER: target_user + TARGET_DB_PASS: target_pass + API_BASE: http://localhost:8080 + ODM_USER: verifier + ODM_PASS: "Verif1cation!Pass" + ODM_EMAIL: verifier@odm-sandbox.local + JUNIT_XML: verification-report.xml + + steps: + # ── Checkout ────────────────────────────────────────────────────────── + - name: Checkout + uses: actions/checkout@v4 + + # ── Python (for verify.py) ──────────────────────────────────────────── + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: pip + cache-dependency-path: verification/requirements.txt + + - name: Install Python dependencies + run: python3 -m pip install -q -r verification/requirements.txt + + # ── Docker build cache ──────────────────────────────────────────────── + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + # ── Start Docker Compose sandbox (no frontend needed for API tests) ─── + - name: Start sandbox services + working-directory: verification + run: | + docker compose up -d --build \ + source_db target_db app_db backend + + # ── Wait for backend to be healthy ──────────────────────────────────── + - name: Wait for backend health + timeout-minutes: 10 + run: | + echo "Waiting for backend to report UP..." + for i in $(seq 1 120); do + STATUS=$(curl -s "${API_BASE}/actuator/health" \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('status',''))" \ + 2>/dev/null || true) + if [ "${STATUS}" = "UP" ]; then + echo "✅ Backend is healthy." + exit 0 + fi + echo " Attempt ${i}/120: status='${STATUS}' — retrying in 5s..." + sleep 5 + done + echo "::error::Backend did not become healthy within 10 minutes." 
+ exit 1 + + # ── Register user ───────────────────────────────────────────────────── + - name: Register ODM user + run: | + curl -sf -X POST "${API_BASE}/api/auth/register" \ + -H "Content-Type: application/json" \ + -d "{\"username\":\"${ODM_USER}\",\"email\":\"${ODM_EMAIL}\",\"password\":\"${ODM_PASS}\"}" \ + > /dev/null 2>&1 || true # continue if user already exists + + # ── Login & capture token ───────────────────────────────────────────── + - name: Login and obtain JWT + run: | + LOGIN_RESP=$(curl -sf -X POST "${API_BASE}/api/auth/login" \ + -H "Content-Type: application/json" \ + -d "{\"username\":\"${ODM_USER}\",\"password\":\"${ODM_PASS}\"}") + TOKEN=$(echo "${LOGIN_RESP}" \ + | python3 -c "import sys,json; print(json.load(sys.stdin).get('token',''))") + if [ -z "${TOKEN}" ]; then + echo "::error::Failed to obtain JWT token." + exit 1 + fi + echo "TOKEN=${TOKEN}" >> "$GITHUB_ENV" + echo "✅ Authenticated." + + # ── Create workspace ────────────────────────────────────────────────── + - name: Create verification workspace + run: | + WS_RESP=$(curl -sf -X POST "${API_BASE}/api/workspaces" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${TOKEN}" \ + -d '{"name":"Verification Workspace","description":"Automated PII masking verification"}') + WS_ID=$(echo "${WS_RESP}" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])") + echo "WS_ID=${WS_ID}" >> "$GITHUB_ENV" + echo "✅ Workspace created: id=${WS_ID}" + + # ── Wire source connection ──────────────────────────────────────────── + - name: Create source connection + run: | + SRC_RESP=$(curl -sf -X POST "${API_BASE}/api/workspaces/${WS_ID}/connections" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${TOKEN}" \ + -d "{\"name\":\"source-db\",\"type\":\"POSTGRESQL\", + \"connectionString\":\"jdbc:postgresql://source_db:5432/${SOURCE_DB_NAME}\", + \"username\":\"${SOURCE_DB_USER}\",\"password\":\"${SOURCE_DB_PASS}\", + 
\"isSource\":true,\"isDestination\":false}") + SRC_CONN_ID=$(echo "${SRC_RESP}" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])") + echo "SRC_CONN_ID=${SRC_CONN_ID}" >> "$GITHUB_ENV" + echo "✅ Source connection: id=${SRC_CONN_ID}" + + # ── Wire destination connection ─────────────────────────────────────── + - name: Create destination connection + run: | + DST_RESP=$(curl -sf -X POST "${API_BASE}/api/workspaces/${WS_ID}/connections" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${TOKEN}" \ + -d "{\"name\":\"target-db\",\"type\":\"POSTGRESQL\", + \"connectionString\":\"jdbc:postgresql://target_db:5432/${TARGET_DB_NAME}\", + \"username\":\"${TARGET_DB_USER}\",\"password\":\"${TARGET_DB_PASS}\", + \"isSource\":false,\"isDestination\":true}") + DST_CONN_ID=$(echo "${DST_RESP}" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])") + echo "DST_CONN_ID=${DST_CONN_ID}" >> "$GITHUB_ENV" + echo "✅ Destination connection: id=${DST_CONN_ID}" + + # ── Configure table in MASK mode ────────────────────────────────────── + - name: Configure users table (MASK mode) + run: | + TABLE_RESP=$(curl -sf -X POST "${API_BASE}/api/workspaces/${WS_ID}/tables" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${TOKEN}" \ + -d '{"tableName":"users","mode":"MASK"}') + TABLE_ID=$(echo "${TABLE_RESP}" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])") + echo "TABLE_ID=${TABLE_ID}" >> "$GITHUB_ENV" + echo "✅ Table config: id=${TABLE_ID}" + + # ── Add column generators ───────────────────────────────────────────── + - name: Add column generators + run: | + add_generator() { + local col="$1" gtype="$2" params="${3:-}" + # Build JSON via Python so generatorParams is a JSON *string* value + # (the backend field is String?, not an embedded object). + # sys.argv avoids shell-quoting issues with special characters. 
+ if [ -z "${params}" ]; then + BODY=$(python3 -c " +import json, sys +print(json.dumps({'columnName': sys.argv[1], 'generatorType': sys.argv[2]})) +" -- "${col}" "${gtype}") + else + BODY=$(python3 -c " +import json, sys +print(json.dumps({'columnName': sys.argv[1], 'generatorType': sys.argv[2], 'generatorParams': sys.argv[3]})) +" -- "${col}" "${gtype}" "${params}") + fi + curl -sf -X POST "${API_BASE}/api/workspaces/${WS_ID}/tables/${TABLE_ID}/generators" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${TOKEN}" \ + -d "${BODY}" > /dev/null + echo " ✅ ${col} → ${gtype}" + } + add_generator "full_name" "FULL_NAME" + add_generator "email" "EMAIL" + add_generator "phone_number" "PHONE" + add_generator "date_of_birth" "BIRTH_DATE" + add_generator "salary" "RANDOM_INT" '{"min":"30000","max":"200000"}' + + # ── Trigger masking job ─────────────────────────────────────────────── + - name: Trigger masking job + run: | + JOB_RESP=$(curl -sf -X POST "${API_BASE}/api/workspaces/${WS_ID}/jobs" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${TOKEN}" \ + -d '{}') + JOB_ID=$(echo "${JOB_RESP}" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])") + echo "JOB_ID=${JOB_ID}" >> "$GITHUB_ENV" + echo "✅ Job started: id=${JOB_ID}" + + # ── Poll until job completes ────────────────────────────────────────── + - name: Wait for masking job to complete + timeout-minutes: 5 + run: | + echo "Polling job ${JOB_ID}..." + for i in $(seq 1 60); do + STATUS=$(curl -sf "${API_BASE}/api/workspaces/${WS_ID}/jobs/${JOB_ID}" \ + -H "Authorization: Bearer ${TOKEN}" \ + | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])") + echo " [${i}/60] status=${STATUS}" + if [ "${STATUS}" = "COMPLETED" ]; then + echo "✅ Job completed." 
+ exit 0 + elif [ "${STATUS}" = "FAILED" ] || [ "${STATUS}" = "CANCELLED" ]; then + echo "::error::Masking job ended with status=${STATUS}" + # Fetch job logs for debugging + curl -sf "${API_BASE}/api/workspaces/${WS_ID}/jobs/${JOB_ID}/logs" \ + -H "Authorization: Bearer ${TOKEN}" \ + | python3 -c " + import sys, json + for l in json.load(sys.stdin): + print(f'[{l[\"level\"]}] {l[\"message\"]}') + " || true + exit 1 + fi + sleep 5 + done + echo "::error::Job did not complete within the timeout." + exit 1 + + # ── Run verification checks (produces JUnit XML) ─────────────────────── + - name: Run verify.py + id: verify + run: | + python3 verification/verify.py --junit-xml "${JUNIT_XML}" + env: + SOURCE_DB_HOST: localhost + SOURCE_DB_PORT: "5433" + TARGET_DB_HOST: localhost + TARGET_DB_PORT: "5434" + + # ── Publish test report as a workflow check ─────────────────────────── + - name: Publish verification report + uses: dorny/test-reporter@v1 + if: always() + with: + name: Sandbox Masking Verification Results + path: ${{ env.JUNIT_XML }} + reporter: java-junit + fail-on-error: false + + # ── Upload JUnit XML as a downloadable artifact ─────────────────────── + - name: Upload verification report artifact + uses: actions/upload-artifact@v4 + if: always() + with: + name: sandbox-verification-report + path: ${{ env.JUNIT_XML }} + retention-days: 30 + + # ── Write job summary ───────────────────────────────────────────────── + - name: Write job summary + if: always() + run: | + echo "## Sandbox Masking Verification" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + if [ "${{ steps.verify.outcome }}" = "success" ]; then + echo "✅ **All verification checks passed.**" >> "$GITHUB_STEP_SUMMARY" + else + echo "❌ **One or more verification checks failed.** See the report for details." 
>> "$GITHUB_STEP_SUMMARY" + fi + echo "" >> "$GITHUB_STEP_SUMMARY" + echo "| Check | What it validates |" >> "$GITHUB_STEP_SUMMARY" + echo "|---|---|" >> "$GITHUB_STEP_SUMMARY" + echo "| Record Integrity | \`COUNT(*)\` matches across source and target |" >> "$GITHUB_STEP_SUMMARY" + echo "| Key Persistence | Every source UUID exists unchanged in target |" >> "$GITHUB_STEP_SUMMARY" + echo "| Masking Effectiveness | \`full_name\` and \`email\` differ for every matched row |" >> "$GITHUB_STEP_SUMMARY" + echo "| Human Readability | 5-record sample + heuristics (name has space, email has \`@\`) |" >> "$GITHUB_STEP_SUMMARY" + + # ── Collect container logs on failure (always run) ──────────────────── + - name: Collect Docker logs on failure + if: failure() + working-directory: verification + run: | + echo "=== backend logs ===" && docker compose logs backend || true + echo "=== app_db logs ===" && docker compose logs app_db || true + echo "=== source_db logs ===" && docker compose logs source_db || true + echo "=== target_db logs ===" && docker compose logs target_db || true + + # ── Tear down sandbox ───────────────────────────────────────────────── + - name: Tear down sandbox + if: always() + working-directory: verification + run: docker compose down --volumes --remove-orphans diff --git a/README.md b/README.md index 7385144..f9e8d4e 100644 --- a/README.md +++ b/README.md @@ -287,10 +287,36 @@ See [Deployment Guide](docs/user-guide.md#infrastructure--terraform-deployment) | Doc | Description | |-----|-------------| | [User Guide](docs/user-guide.md) | Setup, configuration, core concepts, CLI usage | +| [Verification Guide](verification/README.md) | Sandboxed end-to-end verification of masking correctness | | [Website](docs/website/index.html) | Static HTML/CSS project website | | [API Reference](docs/website/api.html) | Full REST API endpoint reference | | [Deployment Guide](docs/website/deployment.html) | Docker, Kubernetes, CI/CD, security | +## Sandbox Verification + 
+OpenDataMask ships with a self-contained Docker-based verification suite that proves the masking pipeline correctly anonymises PII while preserving referential integrity. + +```bash +cd verification/ +./run_verification.sh # build → start → configure → mask → verify + +# With JUnit XML output: +VERIFY_JUNIT_XML=report.xml ./run_verification.sh +``` + +Four automated checks are performed: + +| Check | What it validates | +|---|---| +| **Record Integrity** | `COUNT(*)` matches across source and target (fails if source is empty) | +| **Key Persistence** | Every source UUID exists unchanged in target | +| **Masking Effectiveness** | `full_name` and `email` differ for every matched row | +| **Human Readability** | 5-record sample + format heuristics; skipped (not failed) if masking didn't pass | + +The GitHub Actions workflow `.github/workflows/sandbox-verification.yml` runs this suite on every push/PR to `main` and publishes a JUnit report as a workflow check and downloadable artifact. + +See [verification/README.md](verification/README.md) for full details. + ## License Open source — see [LICENSE](LICENSE) for details. diff --git a/docs/user-guide.md b/docs/user-guide.md index 8a0b1dd..0dd71f1 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -358,6 +358,7 @@ CI (tests pass) | `.github/workflows/docker.yml` | Build and push images to GHCR | | `.github/workflows/deploy.yml` | **Full deploy pipeline** (terraform → docker → deploy → verify) | | `.github/workflows/verify-deployment.yml` | Spring Boot smoke tests + optional live server health check | +| `.github/workflows/sandbox-verification.yml` | **End-to-end masking verification** — proves PII masking correctness; publishes JUnit report | | `.github/workflows/codeql.yml` | Weekly security analysis | GitHub **Environments** (`staging`, `production`) are used for deployment tracking, enabling Copilot and the GitHub UI to display live deployment status, history, and URL. 
@@ -374,6 +375,60 @@ docker build -t opendatamask-frontend ./frontend --- +## Sandbox Verification Environment + +The `verification/` directory contains a self-contained Docker-based environment that automatically proves OpenDataMask correctly masks PII while preserving referential integrity. + +### Quick Start + +```bash +cd verification/ +chmod +x run_verification.sh +./run_verification.sh +``` + +The script builds images, starts all services, configures a masking job via the REST API, runs the job, and then validates the output. + +### What Gets Verified + +| Check | Description | +|---|---| +| **Record Integrity** | Source and target row counts must match; fails if source is empty | +| **Key Persistence** | Every source UUID primary key must exist unchanged in the target | +| **Masking Effectiveness** | `full_name` and `email` must differ for every matched row; fails if no rows were compared | +| **Human Readability** | Samples 5 masked records (ordered by `id`) and checks format heuristics; skipped (not failed) when masking didn't pass to avoid exposing potential PII | + +### JUnit XML Reports + +Both the script and the standalone Python verifier support JUnit XML output: + +```bash +# Via the orchestration script: +VERIFY_JUNIT_XML=report.xml ./run_verification.sh + +# Directly (when environment is already running): +python3 -m pip install -r requirements.txt +python3 verify.py --junit-xml report.xml +``` + +### GitHub Actions Integration + +`.github/workflows/sandbox-verification.yml` runs the full suite on every push and pull request to `main`. 
It publishes: +- A **workflow check** (per-check pass/fail annotations via `dorny/test-reporter`) +- A **downloadable artifact** (`sandbox-verification-report`, 30-day retention) +- A **markdown job summary** with overall pass/fail status + +### Teardown + +```bash +cd verification/ +docker compose -f docker-compose.yml down -v +``` + +See [verification/README.md](../verification/README.md) for the full reference, including all environment variable overrides. + +--- + ## Troubleshooting | Symptom | Likely Cause | Fix | diff --git a/verification/.env.example b/verification/.env.example new file mode 100644 index 0000000..bfc7fea --- /dev/null +++ b/verification/.env.example @@ -0,0 +1,23 @@ +# OpenDataMask Verification Sandbox — Environment Variables +# +# Copy this file to .env and edit as needed before running docker compose. +# These values are for the SANDBOX verification environment ONLY. +# Never reuse them in any non-sandbox environment. +# +# cp .env.example .env +# docker compose -f docker-compose.yml up -d + +# ── OpenDataMask backend secrets ───────────────────────────────────────────── +# Generate fresh values with: openssl rand -base64 32 +ODM_JWT_SECRET=odm-verification-jwt-secret-sandbox-not-for-production-use-xyz +ODM_ENCRYPTION_KEY=odm-verify-enc-key-sandbox-only + +# ── Source database credentials ─────────────────────────────────────────────── +SOURCE_DB_NAME=source_db +SOURCE_DB_USER=source_user +SOURCE_DB_PASS=source_pass + +# ── Target database credentials ─────────────────────────────────────────────── +TARGET_DB_NAME=target_db +TARGET_DB_USER=target_user +TARGET_DB_PASS=target_pass diff --git a/verification/.gitignore b/verification/.gitignore new file mode 100644 index 0000000..4405a68 --- /dev/null +++ b/verification/.gitignore @@ -0,0 +1,8 @@ +# Python byte-code cache +__pycache__/ +*.pyc +*.pyo + +# Runtime artefacts +*.xml +.env diff --git a/verification/README.md b/verification/README.md new file mode 100644 index 0000000..cb6d050 --- 
/dev/null +++ b/verification/README.md @@ -0,0 +1,203 @@ +# OpenDataMask — Sandboxed Verification Environment + +This directory contains a self-contained, Docker-based environment for +proving that OpenDataMask correctly masks sensitive PII data while preserving +referential integrity. + +## What It Does + +| Step | Description | +|------|-------------| +| **SOURCE_DB** | PostgreSQL database pre-seeded with 50 realistic user records (UUID PK, full_name, email, phone_number, date_of_birth, salary). | +| **TARGET_DB** | Empty PostgreSQL database that receives the masked data. | +| **Masking job** | OpenDataMask reads every row from SOURCE_DB, applies Datafaker-powered generators to all PII columns, and writes the anonymised rows to TARGET_DB — keeping the original UUID primary keys intact. | +| **Verification** | A Python script connects to both databases and validates: row counts, key persistence, masking effectiveness, and human-readability of the output. | + +## Directory Layout + +``` +verification/ +├── docker-compose.yml # SOURCE_DB, TARGET_DB, app_db, backend, frontend +├── init/ +│ └── source_db.sql # DDL + 50 seed records for SOURCE_DB +├── run_verification.sh # Full end-to-end orchestration script +├── verify.py # Python validation script +├── requirements.txt # Python dependencies (psycopg2-binary) +└── README.md # This file +``` + +## Prerequisites + +| Tool | Version | +|------|---------| +| Docker Engine | ≥ 24 | +| Docker Compose | v2 (`docker compose`) or v1 (`docker-compose`) | +| curl | any | +| Python 3 | ≥ 3.10 (must include `pip` module — standard in most distributions) | + +## Quick Start + +```bash +# Run from the repository root or the verification/ directory: +cd verification/ +chmod +x run_verification.sh +./run_verification.sh +``` + +The script will: + +1. Build the backend and frontend Docker images. +2. Start all services and wait for them to be healthy. +3. Register a user and authenticate with the OpenDataMask API. +4. 
Create a workspace, source & destination connections, table configuration, + and per-column masking generators. +5. Trigger a masking job and poll until it completes. +6. Run `verify.py` and print a Verification Report. + +## Running Only the Verification Script + +If the environment is already running and the masking job has already completed: + +```bash +python3 -m pip install -r requirements.txt +python3 verify.py +``` + +### JUnit XML Output + +Both the orchestration script and the standalone script support a JUnit-compatible XML report (no external dependencies — uses stdlib `xml.etree.ElementTree`): + +```bash +# Via the orchestration script (sets --junit-xml automatically): +VERIFY_JUNIT_XML=report.xml ./run_verification.sh + +# Directly against an already-running environment: +python3 verify.py --junit-xml report.xml +``` + +The XML report contains one `<testcase>` per check. Skipped checks (e.g., Human Readability when masking didn't pass) are written as `<skipped>` rather than `<failure>` so CI tools count them correctly. + +### Environment Variables (optional overrides) + +| Variable | Default | Description | +|----------|---------|-------------| +| `SOURCE_DB_HOST` | `localhost` | Source DB hostname | +| `SOURCE_DB_PORT` | `5433` | Source DB port (host-mapped) | +| `SOURCE_DB_NAME` | `source_db` | Source DB database name | +| `SOURCE_DB_USER` | `source_user` | Source DB username | +| `SOURCE_DB_PASS` | `source_pass` | Source DB password | +| `TARGET_DB_HOST` | `localhost` | Target DB hostname | +| `TARGET_DB_PORT` | `5434` | Target DB port (host-mapped) | +| `TARGET_DB_NAME` | `target_db` | Target DB database name | +| `TARGET_DB_USER` | `target_user` | Target DB username | +| `TARGET_DB_PASS` | `target_pass` | Target DB password | +| `VERIFY_JUNIT_XML` | *(unset)* | If set, `run_verification.sh` writes a JUnit XML report to this path | + +## Verification Checks + +### 1 · Record Integrity +Confirms the row count in SOURCE_DB matches TARGET_DB (both should be **50**).
+ +### 2 · Key Persistence +For every `id` (UUID) in SOURCE_DB, verifies the exact same `id` exists in +TARGET_DB. This proves the tool does **not** hash or alter primary keys. + +### 3 · Masking Effectiveness +Compares `full_name` and `email` for every matching `id`. The check **passes** +only if: + +``` +source.id == target.id AND +source.full_name != target.full_name AND +source.email != target.email +``` + +### 4 · Human Readability +Prints a sample of 5 masked records (ordered by `id`, for deterministic output) so a human can visually confirm the output looks realistic (e.g., a real-looking name and a valid e-mail address rather than random strings like `asdfghjkl`). + +The sample is only printed when Masking Effectiveness has already passed. If masking failed, this check is reported as **SKIP** (not FAIL) to avoid exposing potential real PII and to prevent it inflating the failure count in CI reports. + +### Sample Report Output + +``` +════════════════════════════════════════════════════════════ + OpenDataMask — Verification Report +════════════════════════════════════════════════════════════ + +Connecting to SOURCE_DB (localhost:5433/source_db)… +Connecting to TARGET_DB (localhost:5434/target_db)… + + -- Masked Record Sample (TARGET_DB) ---------------------------------- + [1] id : a1b2c3d4-0001-4000-8000-000000000001 + full_name : Johnathan Mraz + email : cordell.okon@yahoo.com + phone_number : 1-541-388-3947 + date_of_birth : Mon Jan 15 00:00:00 UTC 1990 + salary : 97432 + +------------------------------------------------------------ + Results +------------------------------------------------------------ + [✓] Record Integrity (row count matches): PASS + Source row count : 50 + Target row count : 50 + [✓] Key Persistence (all source IDs present in target): PASS + Source IDs : 50 + Target IDs : 50 + [✓] Masking Effectiveness (PII fields differ between source and target): PASS + Rows compared : 50 + Name unchanged (should be 0) : 0 + Email unchanged (should 
be 0) : 0 + [✓] Human Readability (sample of 5 masked records): PASS + +============================================================ + OK 4/4 CHECKS PASSED +============================================================ +``` + +When Masking Effectiveness fails the Human Readability check is skipped instead: + +``` + [–] Human Readability (sample of 5 masked records): SKIP + Sample skipped: masking effectiveness check did not pass. Printing TARGET_DB rows could expose real PII. + +============================================================ + FAIL 1/4 CHECK(S) FAILED (2 passed, 1 skipped) +============================================================ +``` + +## Masking Rules Applied + +| Column | Generator | Behaviour | +|--------|-----------|-----------| +| `id` | *(none — passthrough)* | UUID primary key is preserved exactly. | +| `full_name` | `FULL_NAME` | Replaced with a random realistic full name via Datafaker. | +| `email` | `EMAIL` | Replaced with a random realistic e-mail address. | +| `phone_number` | `PHONE` | Replaced with a random phone number. | +| `date_of_birth` | `BIRTH_DATE` | Replaced with a random birthday. | +| `salary` | `RANDOM_INT` (30 000–200 000) | Replaced with a random integer in range. | + +## Tearing Down + +```bash +cd verification/ +docker compose -f docker-compose.yml down -v +``` + +The `-v` flag also removes the named volume (`app_db_data`) so the next run +starts with a clean OpenDataMask application database. + +## GitHub Actions + +The workflow `.github/workflows/sandbox-verification.yml` runs this full verification suite automatically on every push and pull request to `main`, and can be triggered on demand via `workflow_dispatch`. + +It: + +1. Builds the backend Docker image from source (with layer caching). +2. Starts `source_db`, `target_db`, `app_db`, and `backend` via `docker compose`. +3. 
Orchestrates the masking job through the REST API (register → login → workspace → connections → table config → generators → trigger → poll). +4. Runs `verify.py --junit-xml` to produce a structured test report. +5. Publishes the report as a **workflow check** via `dorny/test-reporter` (per-check annotations on PRs). +6. Uploads the JUnit XML as a **downloadable artifact** (`sandbox-verification-report`, 30-day retention). +7. Writes a **markdown job summary** with overall pass/fail status. +8. Always tears down the sandbox; collects Docker container logs on failure. diff --git a/verification/docker-compose.yml b/verification/docker-compose.yml new file mode 100644 index 0000000..d9ea94e --- /dev/null +++ b/verification/docker-compose.yml @@ -0,0 +1,123 @@ +version: '3.8' + +# ───────────────────────────────────────────────────────────────────────────── +# OpenDataMask — Sandboxed Verification Environment +# +# Services +# source_db PostgreSQL database pre-seeded with 50 sensitive user records. +# target_db Empty PostgreSQL database; receives the masked data. +# app_db PostgreSQL database used by the OpenDataMask backend for its +# own application state (workspaces, jobs, configs, etc.). +# backend OpenDataMask Spring Boot backend. +# frontend OpenDataMask Vue.js frontend (optional for visual inspection). +# +# Secrets are loaded from a .env file (copy .env.example → .env). +# The defaults shown are for the sandbox ONLY — never reuse in production. 
+# ───────────────────────────────────────────────────────────────────────────── + +services: + + # ── Source database ──────────────────────────────────────────────────────── + source_db: + image: postgres:16-alpine + container_name: odm_source_db + environment: + POSTGRES_DB: ${SOURCE_DB_NAME:-source_db} + POSTGRES_USER: ${SOURCE_DB_USER:-source_user} + POSTGRES_PASSWORD: ${SOURCE_DB_PASS:-source_pass} + ports: + - "5433:5432" + volumes: + - ./init:/docker-entrypoint-initdb.d # runs source_db.sql on first start + networks: + - odm_net + healthcheck: + test: ["CMD-SHELL", "pg_isready -U $${POSTGRES_USER} -d $${POSTGRES_DB}"] + interval: 5s + timeout: 5s + retries: 10 + + # ── Target (destination) database ───────────────────────────────────────── + target_db: + image: postgres:16-alpine + container_name: odm_target_db + environment: + POSTGRES_DB: ${TARGET_DB_NAME:-target_db} + POSTGRES_USER: ${TARGET_DB_USER:-target_user} + POSTGRES_PASSWORD: ${TARGET_DB_PASS:-target_pass} + ports: + - "5434:5432" + networks: + - odm_net + healthcheck: + test: ["CMD-SHELL", "pg_isready -U $${POSTGRES_USER} -d $${POSTGRES_DB}"] + interval: 5s + timeout: 5s + retries: 10 + + # ── OpenDataMask application database ───────────────────────────────────── + app_db: + image: postgres:16-alpine + container_name: odm_app_db + environment: + POSTGRES_DB: opendatamask + POSTGRES_USER: opendatamask + POSTGRES_PASSWORD: opendatamask + volumes: + - app_db_data:/var/lib/postgresql/data + networks: + - odm_net + healthcheck: + test: ["CMD-SHELL", "pg_isready -U opendatamask"] + interval: 5s + timeout: 5s + retries: 10 + + # ── OpenDataMask backend ─────────────────────────────────────────────────── + backend: + build: + context: ../backend + dockerfile: Dockerfile + container_name: odm_backend + ports: + - "8080:8080" + environment: + DATABASE_URL: jdbc:postgresql://app_db:5432/opendatamask + DATABASE_USERNAME: opendatamask + DATABASE_PASSWORD: opendatamask + # NOTE: these secrets are loaded from 
.env (see .env.example). + # The fallback values are for the verification sandbox ONLY — never reuse in production. + JWT_SECRET: ${ODM_JWT_SECRET:-odm-verification-jwt-secret-sandbox-not-for-production-use-xyz} + ENCRYPTION_KEY: ${ODM_ENCRYPTION_KEY:-odm-verify-enc-key-sandbox-only} + SERVER_PORT: "8080" + depends_on: + app_db: + condition: service_healthy + networks: + - odm_net + healthcheck: + test: ["CMD-SHELL", "wget -qO- http://localhost:8080/actuator/health || exit 1"] + interval: 15s + timeout: 10s + retries: 12 + start_period: 90s + + # ── OpenDataMask frontend (optional) ────────────────────────────────────── + frontend: + build: + context: ../frontend + dockerfile: Dockerfile + container_name: odm_frontend + ports: + - "80:80" + depends_on: + - backend + networks: + - odm_net + +networks: + odm_net: + driver: bridge + +volumes: + app_db_data: diff --git a/verification/init/source_db.sql b/verification/init/source_db.sql new file mode 100644 index 0000000..40db1b9 --- /dev/null +++ b/verification/init/source_db.sql @@ -0,0 +1,64 @@ +-- Initialise SOURCE_DB: create the users table and insert 50 realistic records. +-- This script is executed automatically by the postgres:16-alpine container +-- via the /docker-entrypoint-initdb.d/ mount. 
+ +CREATE TABLE IF NOT EXISTS users ( + id UUID PRIMARY KEY, + full_name TEXT NOT NULL, + email TEXT NOT NULL UNIQUE, + phone_number TEXT NOT NULL, + date_of_birth DATE NOT NULL, + salary NUMERIC(10,2) NOT NULL +); + +INSERT INTO users (id, full_name, email, phone_number, date_of_birth, salary) VALUES + ('a1b2c3d4-0001-4000-8000-000000000001','Alice Johnson', 'alice.johnson@example.com', '+1-555-0101','1985-03-15', 72500.00), + ('a1b2c3d4-0002-4000-8000-000000000002','Bob Martinez', 'bob.martinez@corp.net', '+1-555-0102','1990-07-22', 55000.00), + ('a1b2c3d4-0003-4000-8000-000000000003','Carol Williams', 'carol.w@techfirm.io', '+1-555-0103','1978-11-30', 98000.00), + ('a1b2c3d4-0004-4000-8000-000000000004','David Lee', 'david.lee@startup.co', '+1-555-0104','1995-02-08', 47000.00), + ('a1b2c3d4-0005-4000-8000-000000000005','Eva Brown', 'eva.brown@example.com', '+1-555-0105','1988-09-14', 63000.00), + ('a1b2c3d4-0006-4000-8000-000000000006','Frank Davis', 'frank.davis@enterprise.org', '+1-555-0106','1975-06-01', 112000.00), + ('a1b2c3d4-0007-4000-8000-000000000007','Grace Wilson', 'grace.wilson@agency.com', '+1-555-0107','1993-12-20', 58500.00), + ('a1b2c3d4-0008-4000-8000-000000000008','Henry Moore', 'henry.moore@corp.net', '+1-555-0108','1982-04-05', 87000.00), + ('a1b2c3d4-0009-4000-8000-000000000009','Irene Taylor', 'irene.taylor@techfirm.io', '+1-555-0109','1970-08-18', 134000.00), + ('a1b2c3d4-0010-4000-8000-000000000010','James Anderson', 'james.anderson@startup.co', '+1-555-0110','1998-01-27', 42000.00), + ('a1b2c3d4-0011-4000-8000-000000000011','Karen Thomas', 'karen.thomas@example.com', '+1-555-0111','1987-10-11', 76000.00), + ('a1b2c3d4-0012-4000-8000-000000000012','Liam Jackson', 'liam.jackson@enterprise.org', '+1-555-0112','1991-03-29', 53000.00), + ('a1b2c3d4-0013-4000-8000-000000000013','Mia White', 'mia.white@agency.com', '+1-555-0113','1984-07-17', 69000.00), + ('a1b2c3d4-0014-4000-8000-000000000014','Noah Harris', 'noah.harris@corp.net', 
'+1-555-0114','1996-11-03', 49500.00), + ('a1b2c3d4-0015-4000-8000-000000000015','Olivia Martin', 'olivia.martin@techfirm.io', '+1-555-0115','1979-05-22', 105000.00), + ('a1b2c3d4-0016-4000-8000-000000000016','Peter Garcia', 'peter.garcia@startup.co', '+1-555-0116','1994-09-06', 44000.00), + ('a1b2c3d4-0017-4000-8000-000000000017','Quinn Rodriguez', 'quinn.rodriguez@example.com', '+1-555-0117','1986-02-14', 82000.00), + ('a1b2c3d4-0018-4000-8000-000000000018','Rachel Lewis', 'rachel.lewis@enterprise.org', '+1-555-0118','1973-06-30', 118000.00), + ('a1b2c3d4-0019-4000-8000-000000000019','Samuel Lee', 'samuel.lee@agency.com', '+1-555-0119','1999-10-19', 38000.00), + ('a1b2c3d4-0020-4000-8000-000000000020','Tina Walker', 'tina.walker@corp.net', '+1-555-0120','1983-04-08', 91000.00), + ('a1b2c3d4-0021-4000-8000-000000000021','Umar Hall', 'umar.hall@techfirm.io', '+1-555-0121','1997-08-25', 46500.00), + ('a1b2c3d4-0022-4000-8000-000000000022','Vera Allen', 'vera.allen@startup.co', '+1-555-0122','1981-12-13', 77000.00), + ('a1b2c3d4-0023-4000-8000-000000000023','Walter Young', 'walter.young@example.com', '+1-555-0123','1968-03-02', 145000.00), + ('a1b2c3d4-0024-4000-8000-000000000024','Xena Hernandez', 'xena.hernandez@enterprise.org', '+1-555-0124','1992-07-21', 61000.00), + ('a1b2c3d4-0025-4000-8000-000000000025','Yusuf King', 'yusuf.king@agency.com', '+1-555-0125','1976-11-09', 127000.00), + ('a1b2c3d4-0026-4000-8000-000000000026','Zoe Wright', 'zoe.wright@corp.net', '+1-555-0126','1989-05-28', 74000.00), + ('a1b2c3d4-0027-4000-8000-000000000027','Aaron Scott', 'aaron.scott@techfirm.io', '+1-555-0127','1993-09-16', 57000.00), + ('a1b2c3d4-0028-4000-8000-000000000028','Bella Torres', 'bella.torres@startup.co', '+1-555-0128','1980-01-04', 99000.00), + ('a1b2c3d4-0029-4000-8000-000000000029','Carlos Nguyen', 'carlos.nguyen@example.com', '+1-555-0129','1995-06-23', 43000.00), + ('a1b2c3d4-0030-4000-8000-000000000030','Diana Hill', 'diana.hill@enterprise.org', 
'+1-555-0130','1972-10-12', 138000.00), + ('a1b2c3d4-0031-4000-8000-000000000031','Ethan Flores', 'ethan.flores@agency.com', '+1-555-0131','1988-02-01', 66000.00), + ('a1b2c3d4-0032-4000-8000-000000000032','Fiona Green', 'fiona.green@corp.net', '+1-555-0132','1977-05-19', 121000.00), + ('a1b2c3d4-0033-4000-8000-000000000033','George Adams', 'george.adams@techfirm.io', '+1-555-0133','1991-09-07', 51000.00), + ('a1b2c3d4-0034-4000-8000-000000000034','Hannah Nelson', 'hannah.nelson@startup.co', '+1-555-0134','1984-12-26', 84000.00), + ('a1b2c3d4-0035-4000-8000-000000000035','Isaac Carter', 'isaac.carter@example.com', '+1-555-0135','1998-04-14', 40000.00), + ('a1b2c3d4-0036-4000-8000-000000000036','Julia Mitchell', 'julia.mitchell@enterprise.org', '+1-555-0136','1971-08-03', 152000.00), + ('a1b2c3d4-0037-4000-8000-000000000037','Kevin Perez', 'kevin.perez@agency.com', '+1-555-0137','1994-12-22', 48000.00), + ('a1b2c3d4-0038-4000-8000-000000000038','Laura Roberts', 'laura.roberts@corp.net', '+1-555-0138','1986-04-10', 79000.00), + ('a1b2c3d4-0039-4000-8000-000000000039','Marcus Turner', 'marcus.turner@techfirm.io', '+1-555-0139','1979-08-29', 107000.00), + ('a1b2c3d4-0040-4000-8000-000000000040','Natalie Phillips', 'natalie.phillips@startup.co', '+1-555-0140','1996-01-17', 45000.00), + ('a1b2c3d4-0041-4000-8000-000000000041','Oscar Campbell', 'oscar.campbell@example.com', '+1-555-0141','1983-05-06', 93000.00), + ('a1b2c3d4-0042-4000-8000-000000000042','Penelope Parker', 'penelope.parker@enterprise.org', '+1-555-0142','1975-09-24', 114000.00), + ('a1b2c3d4-0043-4000-8000-000000000043','Quincy Evans', 'quincy.evans@agency.com', '+1-555-0143','1990-02-12', 60000.00), + ('a1b2c3d4-0044-4000-8000-000000000044','Rebecca Edwards', 'rebecca.edwards@corp.net', '+1-555-0144','1967-06-01', 167000.00), + ('a1b2c3d4-0045-4000-8000-000000000045','Simon Collins', 'simon.collins@techfirm.io', '+1-555-0145','1992-09-20', 55500.00), + ('a1b2c3d4-0046-4000-8000-000000000046','Teresa 
Stewart', 'teresa.stewart@startup.co', '+1-555-0146','1985-01-08', 71000.00), + ('a1b2c3d4-0047-4000-8000-000000000047','Ursula Sanchez', 'ursula.sanchez@example.com', '+1-555-0147','1997-05-27', 41500.00), + ('a1b2c3d4-0048-4000-8000-000000000048','Vincent Morris', 'vincent.morris@enterprise.org', '+1-555-0148','1974-09-15', 129000.00), + ('a1b2c3d4-0049-4000-8000-000000000049','Wendy Rogers', 'wendy.rogers@agency.com', '+1-555-0149','1988-01-03', 68000.00), + ('a1b2c3d4-0050-4000-8000-000000000050','Xavier Reed', 'xavier.reed@corp.net', '+1-555-0150','1982-04-22', 95000.00); diff --git a/verification/requirements.txt b/verification/requirements.txt new file mode 100644 index 0000000..04b95e4 --- /dev/null +++ b/verification/requirements.txt @@ -0,0 +1 @@ +psycopg2-binary>=2.9.0 diff --git a/verification/run_verification.sh b/verification/run_verification.sh new file mode 100755 index 0000000..ab08044 --- /dev/null +++ b/verification/run_verification.sh @@ -0,0 +1,254 @@ +#!/usr/bin/env bash +# run_verification.sh — End-to-end verification runner for OpenDataMask. +# +# This script: +# 1. Starts the sandboxed Docker environment (source_db, target_db, app_db, backend). +# 2. Waits for the backend service API to become healthy. +# 3. Configures OpenDataMask via its REST API (workspace, connections, +# table configuration, column generators). +# 4. Triggers a masking job and waits for it to complete. +# 5. Invokes verify.py to validate masking results. +# +# Prerequisites: docker compose (v2), curl, python3 (with pip module). +# Run from the repository root or from the verification/ directory. 
+ +set -euo pipefail + +# ── Resolve paths ───────────────────────────────────────────────────────────── +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# ── Colour helpers ──────────────────────────────────────────────────────────── +RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m' +info() { echo -e "${GREEN}[INFO]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +error() { echo -e "${RED}[ERROR]${NC} $*" >&2; } +die() { error "$*"; exit 1; } + +# backend_is_healthy — returns 0 when /actuator/health reports status=UP, else 1. +backend_is_healthy() { + curl -sf "${API_BASE}/actuator/health" \ + | python3 -c ' +import json, sys +try: + d = json.load(sys.stdin) + sys.exit(0 if isinstance(d, dict) and d.get("status") == "UP" else 1) +except Exception: + sys.exit(1) +' +} + +# ── Configuration ───────────────────────────────────────────────────────────── +API_BASE="http://localhost:8080" +ODM_USER="verifier" +ODM_PASS="Verif1cation!Pass" +ODM_EMAIL="verifier@odm-sandbox.local" + +# ── DB credentials — read from env with same defaults as docker-compose.yml ── +SOURCE_DB_NAME="${SOURCE_DB_NAME:-source_db}" +SOURCE_DB_USER="${SOURCE_DB_USER:-source_user}" +SOURCE_DB_PASS="${SOURCE_DB_PASS:-source_pass}" +TARGET_DB_NAME="${TARGET_DB_NAME:-target_db}" +TARGET_DB_USER="${TARGET_DB_USER:-target_user}" +TARGET_DB_PASS="${TARGET_DB_PASS:-target_pass}" + +# ── Prerequisites check ─────────────────────────────────────────────────────── +info "Checking prerequisites…" +command -v docker >/dev/null 2>&1 || die "docker is required but not installed." +command -v curl >/dev/null 2>&1 || die "curl is required but not installed." +command -v python3 >/dev/null 2>&1 || die "python3 is required but not installed." 
+ +# Support both `docker compose` (v2) and `docker-compose` (v1) +if docker compose version >/dev/null 2>&1; then + DC="docker compose" +elif command -v docker-compose >/dev/null 2>&1; then + DC="docker-compose" +else + die "docker compose (v2) or docker-compose (v1) is required but not found." +fi + +# ── Install Python dependencies ─────────────────────────────────────────────── +# Use `python3 -m pip` to avoid a hard dependency on a separately-installed pip3. +info "Installing Python dependencies…" +python3 -m pip install -q -r requirements.txt + +# ── Start Docker environment ────────────────────────────────────────────────── +info "Starting Docker environment…" +$DC -f docker-compose.yml up -d --build + +# ── Wait for backend health ─────────────────────────────────────────────────── +info "Waiting for OpenDataMask backend to become healthy (up to 3 min)…" +MAX_WAIT=180 +ELAPSED=0 +until backend_is_healthy; do + if [ $ELAPSED -ge $MAX_WAIT ]; then + die "Backend did not become healthy within ${MAX_WAIT}s." + fi + sleep 5 + ELAPSED=$((ELAPSED + 5)) + echo -n "." +done +echo "" +info "Backend is healthy." 
+ +# ── Helper: call the API ────────────────────────────────────────────────────── +# api_post → response body +api_post() { + local path="$1" body="$2" + curl -sf -X POST "${API_BASE}${path}" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${TOKEN:-}" \ + -d "$body" +} + +# api_get → response body +api_get() { + local path="$1" + curl -sf -X GET "${API_BASE}${path}" \ + -H "Authorization: Bearer ${TOKEN:-}" +} + +# ── Register user (ignore error if already exists) ──────────────────────────── +info "Registering user '${ODM_USER}'…" +curl -sf -X POST "${API_BASE}/api/auth/register" \ + -H "Content-Type: application/json" \ + -d "{\"username\":\"${ODM_USER}\",\"email\":\"${ODM_EMAIL}\",\"password\":\"${ODM_PASS}\"}" \ + > /dev/null 2>&1 || true # silently continue if user already exists + +# ── Login ───────────────────────────────────────────────────────────────────── +info "Logging in…" +LOGIN_RESP=$(curl -sf -X POST "${API_BASE}/api/auth/login" \ + -H "Content-Type: application/json" \ + -d "{\"username\":\"${ODM_USER}\",\"password\":\"${ODM_PASS}\"}" \ + || die "Login request failed. Check that the backend is running and reachable at ${API_BASE}.") + +TOKEN=$(echo "$LOGIN_RESP" \ + | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('token',''))" \ + 2>/dev/null || true) +[ -n "$TOKEN" ] || die "Failed to obtain JWT token. Login response: ${LOGIN_RESP}" +info "Authenticated successfully." 
+ +# ── Create workspace ────────────────────────────────────────────────────────── +info "Creating workspace…" +WS_RESP=$(api_post "/api/workspaces" \ + '{"name":"Verification Workspace","description":"Automated PII masking verification"}') +WS_ID=$(echo "$WS_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])") +info "Workspace created: id=${WS_ID}" + +# ── Create source connection ────────────────────────────────────────────────── +info "Creating source data connection (SOURCE_DB)…" +SRC_RESP=$(api_post "/api/workspaces/${WS_ID}/connections" \ + "{\"name\":\"source-db\",\"type\":\"POSTGRESQL\", + \"connectionString\":\"jdbc:postgresql://source_db:5432/${SOURCE_DB_NAME}\", + \"username\":\"${SOURCE_DB_USER}\",\"password\":\"${SOURCE_DB_PASS}\", + \"isSource\":true,\"isDestination\":false}") +SRC_CONN_ID=$(echo "$SRC_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])") +info "Source connection created: id=${SRC_CONN_ID}" + +# ── Create destination connection ───────────────────────────────────────────── +info "Creating destination data connection (TARGET_DB)…" +DST_RESP=$(api_post "/api/workspaces/${WS_ID}/connections" \ + "{\"name\":\"target-db\",\"type\":\"POSTGRESQL\", + \"connectionString\":\"jdbc:postgresql://target_db:5432/${TARGET_DB_NAME}\", + \"username\":\"${TARGET_DB_USER}\",\"password\":\"${TARGET_DB_PASS}\", + \"isSource\":false,\"isDestination\":true}") +DST_CONN_ID=$(echo "$DST_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])") +info "Destination connection created: id=${DST_CONN_ID}" + +# ── Create table configuration (MASK mode) ──────────────────────────────────── +info "Creating table configuration for 'users' (MASK mode)…" +TABLE_RESP=$(api_post "/api/workspaces/${WS_ID}/tables" \ + '{"tableName":"users","mode":"MASK"}') +TABLE_ID=$(echo "$TABLE_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])") +info "Table configuration created: id=${TABLE_ID}" + +# ── Add 
column generators ─────────────────────────────────────────────────────
+# The 'id' column has no generator → it is passed through unchanged (PK preserved).
+
+add_generator() {
+  local col="$1" gtype="$2" params="${3:-}"
+  # Build JSON payload via Python so that generatorParams is properly serialised
+  # as a JSON *string* value (the backend field is String?, not an embedded object).
+  # sys.argv avoids any shell-quoting issues with special characters in params.
+  # NOTE: no '--' separator here — with `python3 -c`, option processing already
+  # stops at the command, so a literal '--' would land in sys.argv[1] and
+  # corrupt the payload (columnName would become "--").
+  if [ -z "$params" ]; then
+    BODY=$(python3 -c "
+import json, sys
+print(json.dumps({'columnName': sys.argv[1], 'generatorType': sys.argv[2]}))
+" "$col" "$gtype")
+  else
+    BODY=$(python3 -c "
+import json, sys
+print(json.dumps({'columnName': sys.argv[1], 'generatorType': sys.argv[2], 'generatorParams': sys.argv[3]}))
+" "$col" "$gtype" "$params")
+  fi
+  api_post "/api/workspaces/${WS_ID}/tables/${TABLE_ID}/generators" "$BODY" > /dev/null
+  info "  Generator added: ${col} → ${gtype}"
+}
+
+info "Configuring column generators…"
+add_generator "full_name" "FULL_NAME"
+add_generator "email" "EMAIL"
+add_generator "phone_number" "PHONE"
+add_generator "date_of_birth" "BIRTH_DATE"
+add_generator "salary" "RANDOM_INT" '{"min":"30000","max":"200000"}'
+
+# ── Run masking job ─────────────────────────────────────────────────────────
+info "Triggering masking job…"
+JOB_RESP=$(api_post "/api/workspaces/${WS_ID}/jobs" '{}')
+JOB_ID=$(echo "$JOB_RESP" | python3 -c "import sys,json; print(json.load(sys.stdin)['id'])")
+info "Job started: id=${JOB_ID}"
+
+# ── Poll until job completes ────────────────────────────────────────────────
+info "Waiting for job ${JOB_ID} to complete…"
+MAX_WAIT=120
+ELAPSED=0
+while true; do
+  STATUS=$(api_get "/api/workspaces/${WS_ID}/jobs/${JOB_ID}" \
+    | python3 -c "import sys,json; print(json.load(sys.stdin)['status'])")
+  if [ "$STATUS" = "COMPLETED" ]; then
+    info "Job completed successfully."
+ break + elif [ "$STATUS" = "FAILED" ] || [ "$STATUS" = "CANCELLED" ]; then + # Print job logs for debugging + warn "Job ended with status: ${STATUS}. Fetching logs…" + api_get "/api/workspaces/${WS_ID}/jobs/${JOB_ID}/logs" \ + | python3 -c " +import sys, json +logs = json.load(sys.stdin) +for l in logs: + print(f'[{l[\"level\"]}] {l[\"message\"]}') +" + die "Masking job ${JOB_ID} did not complete successfully (status=${STATUS})." + fi + if [ $ELAPSED -ge $MAX_WAIT ]; then + die "Job did not complete within ${MAX_WAIT}s." + fi + sleep 5 + ELAPSED=$((ELAPSED + 5)) + echo -n "." +done +echo "" + +# ── Run Python verification ─────────────────────────────────────────────────── +# Use `if/else` so that a non-zero exit from verify.py is caught by our +# explicit handler — not by `set -e` — ensuring the result banner always prints. +# +# Set VERIFY_JUNIT_XML to a file path to also write a JUnit XML report, e.g.: +# VERIFY_JUNIT_XML=/tmp/report.xml ./run_verification.sh +info "Running verification script…" +JUNIT_ARGS=() +if [ -n "${VERIFY_JUNIT_XML:-}" ]; then + JUNIT_ARGS=(--junit-xml "${VERIFY_JUNIT_XML}") +fi +if python3 verify.py "${JUNIT_ARGS[@]}"; then + echo "" + echo -e "${GREEN}════════════════════════════════════════${NC}" + echo -e "${GREEN} ✓ ALL VERIFICATION CHECKS PASSED ${NC}" + echo -e "${GREEN}════════════════════════════════════════${NC}" +else + echo "" + echo -e "${RED}════════════════════════════════════════${NC}" + echo -e "${RED} ✗ ONE OR MORE VERIFICATION CHECKS FAILED ${NC}" + echo -e "${RED}════════════════════════════════════════${NC}" + exit 1 +fi diff --git a/verification/verify.py b/verification/verify.py new file mode 100644 index 0000000..30919ca --- /dev/null +++ b/verification/verify.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 +""" +verify.py — OpenDataMask Sandboxed Verification Script +======================================================= +Connects to SOURCE_DB and TARGET_DB after a masking job has run and +performs the following 
automated checks: + + 1. Record Integrity — row count in SOURCE matches TARGET. + 2. Key Persistence — every id present in SOURCE exists in TARGET. + 3. Masking Effectiveness — full_name and email are different between + source and target for every row. + 4. Human Readability — logs a sample of 5 masked records so a human + can visually confirm the data looks realistic. + +Outputs a "Verification Report" summarising pass / fail status. + +Environment variables (with defaults matching docker-compose.yml): + SOURCE_DB_HOST / SOURCE_DB_PORT / SOURCE_DB_NAME + SOURCE_DB_USER / SOURCE_DB_PASS + TARGET_DB_HOST / TARGET_DB_PORT / TARGET_DB_NAME + TARGET_DB_USER / TARGET_DB_PASS +""" + +import argparse +import os +import sys +import time +import xml.etree.ElementTree as ET + +try: + import psycopg2 + import psycopg2.extras + from psycopg2 import sql as pgsql +except ImportError: + print("ERROR: psycopg2 is not installed. Run: pip install psycopg2-binary") + sys.exit(1) + + +# ── Database connection parameters ────────────────────────────────────────── + +SOURCE = dict( + host=os.getenv("SOURCE_DB_HOST", "localhost"), + port=int(os.getenv("SOURCE_DB_PORT", "5433")), + dbname=os.getenv("SOURCE_DB_NAME", "source_db"), + user=os.getenv("SOURCE_DB_USER", "source_user"), + password=os.getenv("SOURCE_DB_PASS", "source_pass"), +) + +TARGET = dict( + host=os.getenv("TARGET_DB_HOST", "localhost"), + port=int(os.getenv("TARGET_DB_PORT", "5434")), + dbname=os.getenv("TARGET_DB_NAME", "target_db"), + user=os.getenv("TARGET_DB_USER", "target_user"), + password=os.getenv("TARGET_DB_PASS", "target_pass"), +) + +TABLE = "users" + + +# ── Helpers ────────────────────────────────────────────────────────────────── + +class Check: + PASS = "PASS" + FAIL = "FAIL" + SKIP = "SKIP" + + def __init__(self, name: str): + self.name = name + self.status = Check.PASS + self.messages: list[str] = [] + + def fail(self, msg: str) -> None: + self.status = Check.FAIL + self.messages.append(msg) + + def 
skip(self, msg: str) -> None: + self.status = Check.SKIP + self.messages.append(msg) + + def info(self, msg: str) -> None: + self.messages.append(msg) + + @property + def info_messages(self) -> list[str]: + """Return messages that are not status-constant strings.""" + _statuses = {Check.PASS, Check.FAIL, Check.SKIP} + return [m for m in self.messages if m not in _statuses] + + def __str__(self) -> str: + icon = "✓" if self.status == Check.PASS else ("–" if self.status == Check.SKIP else "✗") + lines = [f" [{icon}] {self.name}: {self.status}"] + for m in self.messages: + lines.append(f" {m}") + return "\n".join(lines) + + +def connect(params: dict): + try: + conn = psycopg2.connect(**params, cursor_factory=psycopg2.extras.RealDictCursor) + conn.autocommit = True + return conn + except psycopg2.OperationalError as exc: + print(f"ERROR: Cannot connect to database {params['dbname']}@{params['host']}:{params['port']}") + print(f" {exc}") + sys.exit(1) + + +def fetch_all(conn, query, params=None) -> list[dict]: + with conn.cursor() as cur: + cur.execute(query, params) + return cur.fetchall() + + +def count_rows(conn, table: str) -> int: + # Use pgsql.Identifier to safely quote the table name and prevent SQL injection. 
+ query = pgsql.SQL("SELECT COUNT(*) AS cnt FROM {}").format(pgsql.Identifier(table)) + rows = fetch_all(conn, query) + return rows[0]["cnt"] + + +# ── Verification checks ─────────────────────────────────────────────────────── + +def check_record_integrity(src_conn, tgt_conn) -> Check: + chk = Check("Record Integrity (row count matches)") + src_count = count_rows(src_conn, TABLE) + tgt_count = count_rows(tgt_conn, TABLE) + chk.info(f"Source row count : {src_count}") + chk.info(f"Target row count : {tgt_count}") + if src_count == 0: + chk.fail( + f"Source table '{TABLE}' is empty; verification cannot pass with 0 source rows" + ) + elif src_count != tgt_count: + chk.fail( + f"Row count mismatch: source={src_count}, target={tgt_count}" + ) + return chk + + +def check_key_persistence(src_conn, tgt_conn) -> Check: + chk = Check("Key Persistence (all source IDs present in target)") + id_query = pgsql.SQL("SELECT id FROM {}").format(pgsql.Identifier(TABLE)) + src_ids = {str(r["id"]) for r in fetch_all(src_conn, id_query)} + tgt_ids = {str(r["id"]) for r in fetch_all(tgt_conn, id_query)} + + missing = src_ids - tgt_ids + extra = tgt_ids - src_ids + chk.info(f"Source IDs : {len(src_ids)}") + chk.info(f"Target IDs : {len(tgt_ids)}") + + if missing: + chk.fail( + f"{len(missing)} source ID(s) missing from target: " + f"{sorted(missing)[:5]}{'...' if len(missing) > 5 else ''}" + ) + if extra: + chk.fail( + f"{len(extra)} unexpected ID(s) found only in target: " + f"{sorted(extra)[:5]}{'...' 
if len(extra) > 5 else ''}" + ) + return chk + + +def check_masking_effectiveness(src_conn, tgt_conn) -> Check: + chk = Check("Masking Effectiveness (PII fields differ between source and target)") + + pii_query = pgsql.SQL("SELECT id, full_name, email FROM {}").format( + pgsql.Identifier(TABLE) + ) + src_rows = {str(r["id"]): r for r in fetch_all(src_conn, pii_query)} + tgt_rows = {str(r["id"]): r for r in fetch_all(tgt_conn, pii_query)} + + unmasked_name = 0 + unmasked_email = 0 + checked = 0 + + for uid, src in src_rows.items(): + tgt = tgt_rows.get(uid) + if tgt is None: + continue + checked += 1 + if src["full_name"] == tgt["full_name"]: + unmasked_name += 1 + if src["email"] == tgt["email"]: + unmasked_email += 1 + + chk.info(f"Rows compared : {checked}") + chk.info(f"Name unchanged (should be 0) : {unmasked_name}") + chk.info(f"Email unchanged (should be 0) : {unmasked_email}") + + if checked == 0: + chk.fail("No rows could be compared (source or target may be empty).") + else: + if unmasked_name > 0: + chk.fail(f"{unmasked_name} row(s) have the same full_name in source and target.") + if unmasked_email > 0: + chk.fail(f"{unmasked_email} row(s) have the same email in source and target.") + + return chk + + +def check_human_readability(tgt_conn, masking_passed: bool = True) -> Check: + """ + Print a sample of masked records for visual human inspection. + + The sample is only printed when *masking_passed* is True. If masking + effectiveness failed, the target may still contain real source data, so + printing it here could expose genuine PII — in that case we skip the + sample and report the reason. + + When masking has passed, the values printed are the anonymised (fake) + output produced by OpenDataMask's Datafaker-powered generators. + """ + chk = Check("Human Readability (sample of 5 masked records)") + + if not masking_passed: + chk.skip( + "Sample skipped: masking effectiveness check did not pass. " + "Printing TARGET_DB rows could expose real PII." 
+ ) + return chk + + # ORDER BY id gives a stable, deterministic sample across runs. + sample_query = pgsql.SQL( + "SELECT id, full_name, email, phone_number, date_of_birth, salary " + "FROM {} ORDER BY id LIMIT 5" + ).format(pgsql.Identifier(TABLE)) + # Values retrieved here are already-anonymised fakes, not real sensitive data. + sample = fetch_all(tgt_conn, sample_query) + + print("\n -- Masked Record Sample (TARGET_DB) ----------------------------------") + for i, row in enumerate(sample, 1): + # All fields below are Datafaker-generated fakes. + print(f" [{i}] id : {row['id']}") + print(f" full_name : {row['full_name']}") + print(f" email : {row['email']}") + print(f" phone_number : {row['phone_number']}") + print(f" date_of_birth : {row['date_of_birth']}") + print(f" salary : {row['salary']}") + print() + + # Heuristic: Faker-generated full names always contain at least one space + # (first name + last name). A missing space suggests the generator may not + # be producing realistic output. + suspicious_names = [ + str(row["full_name"]) + for row in sample + if " " not in str(row["full_name"]) + ] + if suspicious_names: + chk.fail( + f"The following masked names do not look like realistic full names " + f"(no space found): {suspicious_names}" + ) + + # Masked emails must contain '@' to be valid e-mail addresses. 
+    bad_emails = [
+        str(row["email"])
+        for row in sample
+        if "@" not in str(row["email"])
+    ]
+    if bad_emails:
+        chk.fail(f"The following masked emails are missing '@': {bad_emails}")
+
+    return chk
+
+# ── JUnit XML writer ──────────────────────────────────────────────────────────
+
+def write_junit_xml(checks: list, elapsed: float, path: str) -> None:
+    """Write a JUnit-compatible XML report to *path* for CI consumption."""
+    failures = sum(1 for c in checks if c.status == Check.FAIL)
+    skipped = sum(1 for c in checks if c.status == Check.SKIP)
+    suite = ET.Element(
+        "testsuite",
+        name="OpenDataMask Sandbox Verification",
+        tests=str(len(checks)),
+        failures=str(failures),
+        skipped=str(skipped),
+        errors="0",
+        time=f"{elapsed:.3f}",
+    )
+    for chk in checks:
+        tc = ET.SubElement(
+            suite,
+            "testcase",
+            name=chk.name,
+            classname="verify",
+        )
+        if chk.status == Check.FAIL:
+            failure_msg = "; ".join(chk.info_messages)
+            ET.SubElement(tc, "failure", message=failure_msg).text = failure_msg
+        elif chk.status == Check.SKIP:
+            skip_msg = "; ".join(chk.info_messages)
+            ET.SubElement(tc, "skipped", message=skip_msg)
+        # Attach informational messages as system-out so they appear in the report.
+        if chk.info_messages:
+            ET.SubElement(tc, "system-out").text = "\n".join(chk.info_messages)
+
+    # Indent for readability (Python ≥ 3.9).
+    if hasattr(ET, "indent"):
+        ET.indent(suite, space=" ")
+
+    with open(path, "w", encoding="utf-8") as fh:
+        # ET.tostring(encoding="unicode") emits no XML declaration, so write it
+        # explicitly to produce a well-formed standalone JUnit XML document.
+        fh.write('<?xml version="1.0" encoding="UTF-8"?>\n')
+        fh.write(ET.tostring(suite, encoding="unicode"))
+        fh.write("\n")
+
+    print(f"  JUnit XML report written to: {path}")
+
+
+# ── Main ────────────────────────────────────────────────────────────────────
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="OpenDataMask sandboxed verification script."
+ ) + parser.add_argument( + "--junit-xml", + metavar="PATH", + default=None, + help="Write a JUnit-compatible XML report to PATH for CI consumption.", + ) + args = parser.parse_args() + + print("\n" + "=" * 60) + print(" OpenDataMask -- Verification Report") + print("=" * 60) + + print( + f"\nConnecting to SOURCE_DB " + f"({SOURCE['host']}:{SOURCE['port']}/{SOURCE['dbname']})..." + ) + src_conn = connect(SOURCE) + + print( + f"Connecting to TARGET_DB " + f"({TARGET['host']}:{TARGET['port']}/{TARGET['dbname']})..." + ) + tgt_conn = connect(TARGET) + + t_start = time.monotonic() + + checks = [ + check_record_integrity(src_conn, tgt_conn), + check_key_persistence(src_conn, tgt_conn), + ] + + masking_chk = check_masking_effectiveness(src_conn, tgt_conn) + checks.append(masking_chk) + + # Only print TARGET_DB sample when masking has been confirmed effective — + # if masking failed the target may still hold real source data. + checks.append( + check_human_readability( + tgt_conn, masking_passed=(masking_chk.status == Check.PASS) + ) + ) + + elapsed = time.monotonic() - t_start + + src_conn.close() + tgt_conn.close() + + print("\n" + "-" * 60) + print(" Results") + print("-" * 60) + for chk in checks: + print(chk) + + passed = sum(1 for c in checks if c.status == Check.PASS) + failed = sum(1 for c in checks if c.status == Check.FAIL) + skipped = sum(1 for c in checks if c.status == Check.SKIP) + + print("\n" + "=" * 60) + if failed == 0: + summary = f" OK {passed}/{len(checks)} CHECKS PASSED" + if skipped: + summary += f" ({skipped} skipped)" + print(summary) + else: + print(f" FAIL {failed}/{len(checks)} CHECK(S) FAILED ({passed} passed, {skipped} skipped)") + print("=" * 60 + "\n") + + if args.junit_xml: + write_junit_xml(checks, elapsed, args.junit_xml) + + return 0 if failed == 0 else 1 + + +if __name__ == "__main__": + sys.exit(main())