From 7e126dac229e94e0ba60ffad35e42ff48c2e3088 Mon Sep 17 00:00:00 2001
From: PRAteek-singHWY
Date: Mon, 6 Apr 2026 03:45:50 +0530
Subject: [PATCH 1/2] feat: add automated Heroku data health equivalency checks

---
NOTE(review): reconstructed from a whitespace-mangled paste. Additionally,
scripts/check_data_health.py now prepends the repository root to sys.path so
"python scripts/check_data_health.py" (as run by the workflow) can import the
"application" package — sys.path[0] is otherwise scripts/, not the repo root.
Hunk and diffstat counts updated accordingly; blob index hashes are stale.

 .github/workflows/data-health-check.yml |  97 ++++++++++++++++++
 application/tests/data_health_test.py   |  90 +++++++++++++++++
 application/utils/data_health.py        | 127 ++++++++++++++++++++++++
 scripts/check_data_health.py            |  90 ++++++++++++++++
 4 files changed, 404 insertions(+)
 create mode 100644 .github/workflows/data-health-check.yml
 create mode 100644 application/tests/data_health_test.py
 create mode 100644 application/utils/data_health.py
 create mode 100644 scripts/check_data_health.py

diff --git a/.github/workflows/data-health-check.yml b/.github/workflows/data-health-check.yml
new file mode 100644
index 000000000..62002c874
--- /dev/null
+++ b/.github/workflows/data-health-check.yml
@@ -0,0 +1,97 @@
+name: Data Health Check
+
+on:
+  schedule:
+    - cron: "30 1 * * *"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  actions: read
+
+jobs:
+  data-health-check:
+    environment: Heroku-DB-Backup
+    runs-on: ubuntu-latest
+    env:
+      HEROKU_APP_NAME: opencreorg
+      PGPASSWORD: postgres
+    services:
+      postgres:
+        image: postgres:16
+        env:
+          POSTGRES_DB: postgres
+          POSTGRES_USER: postgres
+          POSTGRES_PASSWORD: postgres
+        ports:
+          - 5432:5432
+        options: >-
+          --health-cmd "pg_isready -U postgres"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y postgresql-client
+          python -m pip install --upgrade pip psycopg2-binary
+
+      - name: Install Heroku CLI
+        run: curl https://cli-assets.heroku.com/install-ubuntu.sh | sh
+
+      - name: Download known-good backup artifact
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          set -euo pipefail
+          mkdir -p known-good
+          run_id="$(gh run list --repo OWASP/OpenCRE --workflow backup.yml --branch main --status success --limit 1 --json databaseId --jq '.[0].databaseId')"
+          if [ -z "${run_id}" ] || [ "${run_id}" = "null" ]; then
+            echo "Could not find successful backup workflow runs on main."
+            exit 1
+          fi
+
+          gh run download "${run_id}" --repo OWASP/OpenCRE --name opencreorg_db_backup --dir known-good
+          known_good_dump="$(find known-good -maxdepth 1 -name '*.dump' | head -n 1)"
+          if [ -z "${known_good_dump}" ]; then
+            echo "No .dump file found in opencreorg_db_backup artifact."
+            exit 1
+          fi
+          echo "KNOWN_GOOD_DUMP=${known_good_dump}" >> "${GITHUB_ENV}"
+
+      - name: Download current Heroku backup
+        env:
+          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
+        run: |
+          set -euo pipefail
+          heroku pg:backups:capture -a "${HEROKU_APP_NAME}"
+          heroku pg:backups:download -a "${HEROKU_APP_NAME}" --output=current.dump
+          echo "CURRENT_DUMP=${PWD}/current.dump" >> "${GITHUB_ENV}"
+
+      - name: Restore dumps to local postgres
+        run: |
+          set -euo pipefail
+          psql -h localhost -U postgres -d postgres -c "DROP DATABASE IF EXISTS opencre_known_good;"
+          psql -h localhost -U postgres -d postgres -c "DROP DATABASE IF EXISTS opencre_current;"
+          psql -h localhost -U postgres -d postgres -c "CREATE DATABASE opencre_known_good;"
+          psql -h localhost -U postgres -d postgres -c "CREATE DATABASE opencre_current;"
+
+          pg_restore --clean --if-exists --no-owner --no-privileges -h localhost -U postgres -d opencre_known_good "${KNOWN_GOOD_DUMP}"
+          pg_restore --clean --if-exists --no-owner --no-privileges -h localhost -U postgres -d opencre_current "${CURRENT_DUMP}"
+
+      - name: Compare datasets
+        run: |
+          python scripts/check_data_health.py \
+            --db1-url "postgresql://postgres:postgres@localhost:5432/opencre_known_good" \
+            --db2-url "postgresql://postgres:postgres@localhost:5432/opencre_current" \
+            --db1-label "known-good" \
+            --db2-label "heroku-current"
diff --git a/application/tests/data_health_test.py b/application/tests/data_health_test.py
new file mode 100644
index 000000000..208a52d17
--- /dev/null
+++ b/application/tests/data_health_test.py
@@ -0,0 +1,90 @@
+import unittest
+
+from application.utils import data_health
+
+
+class TestDataHealth(unittest.TestCase):
+    def _dataset(self, cre_id: str, child_id: str, node_id: str):
+        return {
+            "cre": [
+                {
+                    "id": cre_id,
+                    "external_id": "100-100",
+                    "name": "Authentication",
+                    "description": "Base auth requirement",
+                    "tags": "auth,session",
+                },
+                {
+                    "id": child_id,
+                    "external_id": "100-101",
+                    "name": "Session timeout",
+                    "description": "Timeout policy",
+                    "tags": "session",
+                },
+            ],
+            "node": [
+                {
+                    "id": node_id,
+                    "name": "ASVS",
+                    "section": "V2",
+                    "subsection": "2.1.1",
+                    "section_id": "ASVS-V2-2.1.1",
+                    "version": "4.0",
+                    "description": "ASVS mapping entry",
+                    "tags": "asvs",
+                    "ntype": "Standard",
+                    "link": "https://example.com",
+                }
+            ],
+            "cre_links": [
+                {
+                    "type": "Contains",
+                    "group": cre_id,
+                    "cre": child_id,
+                }
+            ],
+            "cre_node_links": [
+                {
+                    "type": "Linked To",
+                    "cre": child_id,
+                    "node": node_id,
+                }
+            ],
+        }
+
+    def test_equivalent_when_only_internal_ids_differ(self):
+        left_rows = self._dataset("cre-id-1", "cre-id-2", "node-id-1")
+        right_rows = self._dataset("cre-id-a", "cre-id-b", "node-id-a")
+
+        left = data_health.build_canonical_snapshot(left_rows)
+        right = data_health.build_canonical_snapshot(right_rows)
+
+        self.assertEqual(
+            data_health.snapshot_digest(left), data_health.snapshot_digest(right)
+        )
+        self.assertEqual(data_health.snapshot_diff(left, right), {})
+
+    def test_detects_data_change(self):
+        left_rows = self._dataset("cre-id-1", "cre-id-2", "node-id-1")
+        right_rows = self._dataset("cre-id-a", "cre-id-b", "node-id-a")
+        right_rows["node"][0]["description"] = "Changed description"
+
+        left = data_health.build_canonical_snapshot(left_rows)
+        right = data_health.build_canonical_snapshot(right_rows)
+
+        self.assertNotEqual(
+            data_health.snapshot_digest(left), data_health.snapshot_digest(right)
+        )
+        diff = data_health.snapshot_diff(left, right)
+        self.assertIn("node", diff)
+
+    def test_raises_on_missing_foreign_key_target(self):
+        rows = self._dataset("cre-id-1", "cre-id-2", "node-id-1")
+        rows["cre_links"][0]["cre"] = "unknown-cre-id"
+
+        with self.assertRaises(ValueError):
+            data_health.build_canonical_snapshot(rows)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/application/utils/data_health.py b/application/utils/data_health.py
new file mode 100644
index 000000000..e8b0466c9
--- /dev/null
+++ b/application/utils/data_health.py
@@ -0,0 +1,127 @@
+import hashlib
+import json
+from typing import Any, Dict, List, Mapping, Sequence, Tuple
+
+
+Snapshot = Dict[str, List[Tuple[Any, ...]]]
+
+REQUIRED_TABLES = ("cre", "node", "cre_links", "cre_node_links")
+
+
+def _normalize(value: Any) -> str:
+    if value is None:
+        return ""
+    return str(value)
+
+
+def _cre_key(row: Mapping[str, Any]) -> Tuple[str, ...]:
+    return (
+        _normalize(row.get("external_id")),
+        _normalize(row.get("name")),
+        _normalize(row.get("description")),
+        _normalize(row.get("tags")),
+    )
+
+
+def _node_key(row: Mapping[str, Any]) -> Tuple[str, ...]:
+    return (
+        _normalize(row.get("name")),
+        _normalize(row.get("section")),
+        _normalize(row.get("subsection")),
+        _normalize(row.get("section_id")),
+        _normalize(row.get("version")),
+        _normalize(row.get("description")),
+        _normalize(row.get("tags")),
+        _normalize(row.get("ntype")),
+        _normalize(row.get("link")),
+    )
+
+
+def _validate_table_presence(rows: Mapping[str, Sequence[Mapping[str, Any]]]) -> None:
+    missing = [table for table in REQUIRED_TABLES if table not in rows]
+    if missing:
+        raise ValueError(f"Missing required tables: {missing}")
+
+
+def build_canonical_snapshot(
+    rows: Mapping[str, Sequence[Mapping[str, Any]]],
+) -> Snapshot:
+    _validate_table_presence(rows)
+
+    cre_id_to_key: Dict[str, Tuple[str, ...]] = {}
+    node_id_to_key: Dict[str, Tuple[str, ...]] = {}
+
+    cre_rows: List[Tuple[str, ...]] = []
+    node_rows: List[Tuple[str, ...]] = []
+    cre_links_rows: List[Tuple[Any, ...]] = []
+    cre_node_links_rows: List[Tuple[Any, ...]] = []
+
+    for row in rows["cre"]:
+        key = _cre_key(row)
+        row_id = _normalize(row.get("id"))
+        cre_id_to_key[row_id] = key
+        cre_rows.append(key)
+
+    for row in rows["node"]:
+        key = _node_key(row)
+        row_id = _normalize(row.get("id"))
+        node_id_to_key[row_id] = key
+        node_rows.append(key)
+
+    for row in rows["cre_links"]:
+        group_id = _normalize(row.get("group"))
+        cre_id = _normalize(row.get("cre"))
+        if group_id not in cre_id_to_key or cre_id not in cre_id_to_key:
+            raise ValueError(
+                f"cre_links contains unknown IDs: group={group_id}, cre={cre_id}"
+            )
+        cre_links_rows.append(
+            (
+                _normalize(row.get("type")),
+                cre_id_to_key[group_id],
+                cre_id_to_key[cre_id],
+            )
+        )
+
+    for row in rows["cre_node_links"]:
+        cre_id = _normalize(row.get("cre"))
+        node_id = _normalize(row.get("node"))
+        if cre_id not in cre_id_to_key or node_id not in node_id_to_key:
+            raise ValueError(
+                f"cre_node_links contains unknown IDs: cre={cre_id}, node={node_id}"
+            )
+        cre_node_links_rows.append(
+            (
+                _normalize(row.get("type")),
+                cre_id_to_key[cre_id],
+                node_id_to_key[node_id],
+            )
+        )
+
+    snapshot: Snapshot = {
+        "cre": sorted(cre_rows),
+        "node": sorted(node_rows),
+        "cre_links": sorted(cre_links_rows),
+        "cre_node_links": sorted(cre_node_links_rows),
+    }
+    return snapshot
+
+
+def snapshot_digest(snapshot: Snapshot) -> str:
+    payload = json.dumps(snapshot, sort_keys=True, separators=(",", ":"))
+    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
+
+
+def snapshot_diff(expected: Snapshot, actual: Snapshot) -> Dict[str, Dict[str, Any]]:
+    diff: Dict[str, Dict[str, Any]] = {}
+    for table in REQUIRED_TABLES:
+        missing = sorted(set(expected.get(table, [])) - set(actual.get(table, [])))
+        extra = sorted(set(actual.get(table, [])) - set(expected.get(table, [])))
+        if missing or extra:
+            diff[table] = {
+                "missing_count": len(missing),
+                "extra_count": len(extra),
+                "missing_sample": missing[:3],
+                "extra_sample": extra[:3],
+            }
+    return diff
diff --git a/scripts/check_data_health.py b/scripts/check_data_health.py
new file mode 100644
index 000000000..9e2a15111
--- /dev/null
+++ b/scripts/check_data_health.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any, Dict, List
+
+import psycopg2
+from psycopg2.extras import RealDictCursor
+
+# Make the repository root importable when this script is run directly:
+# "python scripts/check_data_health.py" puts scripts/ (not the repo root) on sys.path.
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from application.utils import data_health
+
+
+def _fetch_rows(conn: Any, query: str) -> List[Dict[str, Any]]:
+    with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+        cursor.execute(query)
+        return [dict(row) for row in cursor.fetchall()]
+
+
+def load_snapshot(db_url: str) -> data_health.Snapshot:
+    conn = psycopg2.connect(db_url)
+    try:
+        rows = {
+            "cre": _fetch_rows(
+                conn, "SELECT id, external_id, name, description, tags FROM cre"
+            ),
+            "node": _fetch_rows(
+                conn,
+                "SELECT id, name, section, subsection, section_id, version, "
+                "description, tags, ntype, link FROM node",
+            ),
+            "cre_links": _fetch_rows(conn, 'SELECT type, "group", cre FROM cre_links'),
+            "cre_node_links": _fetch_rows(
+                conn, "SELECT type, cre, node FROM cre_node_links"
+            ),
+        }
+        return data_health.build_canonical_snapshot(rows)
+    finally:
+        conn.close()
+
+
+def _counts(snapshot: data_health.Snapshot) -> Dict[str, int]:
+    return {table: len(rows) for table, rows in snapshot.items()}
+
+
+def _to_json(data: Any) -> str:
+    return json.dumps(data, indent=2, default=str)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Compare OpenCRE data equivalency between two PostgreSQL databases."
+    )
+    parser.add_argument("--db1-url", required=True, help="Known-good database URL.")
+    parser.add_argument("--db2-url", required=True, help="Current database URL.")
+    parser.add_argument("--db1-label", default="db1", help="Label for database 1.")
+    parser.add_argument("--db2-label", default="db2", help="Label for database 2.")
+    args = parser.parse_args()
+
+    snapshot_1 = load_snapshot(args.db1_url)
+    snapshot_2 = load_snapshot(args.db2_url)
+
+    digest_1 = data_health.snapshot_digest(snapshot_1)
+    digest_2 = data_health.snapshot_digest(snapshot_2)
+
+    print(f"{args.db1_label} counts: {_to_json(_counts(snapshot_1))}")
+    print(f"{args.db2_label} counts: {_to_json(_counts(snapshot_2))}")
+    print(f"{args.db1_label} digest: {digest_1}")
+    print(f"{args.db2_label} digest: {digest_2}")
+
+    if digest_1 == digest_2:
+        print("Data health check passed: datasets are equivalent.")
+        return 0
+
+    diff_1_to_2 = data_health.snapshot_diff(snapshot_1, snapshot_2)
+    diff_2_to_1 = data_health.snapshot_diff(snapshot_2, snapshot_1)
+    print("Data health check failed: dataset mismatch detected.")
+    print(f"{args.db1_label} -> {args.db2_label} diff:")
+    print(_to_json(diff_1_to_2))
+    print(f"{args.db2_label} -> {args.db1_label} diff:")
+    print(_to_json(diff_2_to_1))
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 5589391b9d2d369930f7ea956fca2a8122714939 Mon Sep 17 00:00:00 2001
From: PRAteek-singHWY
Date: Mon, 6 Apr 2026 04:29:39 +0530
Subject: [PATCH 2/2] chore: remove hardcoded postgres password literals from
 workflow

---
 .github/workflows/data-health-check.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/data-health-check.yml b/.github/workflows/data-health-check.yml
index 62002c874..f999c643d 100644
--- a/.github/workflows/data-health-check.yml
+++ b/.github/workflows/data-health-check.yml
@@ -15,18 +15,17 @@ jobs:
     runs-on: ubuntu-latest
     env:
       HEROKU_APP_NAME: opencreorg
-      PGPASSWORD: postgres
     services:
       postgres:
         image: postgres:16
         env:
           POSTGRES_DB: postgres
           POSTGRES_USER: postgres
-          POSTGRES_PASSWORD: postgres
+          POSTGRES_HOST_AUTH_METHOD: trust
         ports:
           - 5432:5432
         options: >-
           --health-cmd "pg_isready -U postgres"
           --health-interval 10s
           --health-timeout 5s
           --health-retries 5
@@ -91,7 +90,7 @@ jobs:
       - name: Compare datasets
         run: |
           python scripts/check_data_health.py \
-            --db1-url "postgresql://postgres:postgres@localhost:5432/opencre_known_good" \
-            --db2-url "postgresql://postgres:postgres@localhost:5432/opencre_current" \
+            --db1-url "postgresql://postgres@localhost:5432/opencre_known_good" \
+            --db2-url "postgresql://postgres@localhost:5432/opencre_current" \
             --db1-label "known-good" \
             --db2-label "heroku-current"