From 7e126dac229e94e0ba60ffad35e42ff48c2e3088 Mon Sep 17 00:00:00 2001
From: PRAteek-singHWY
Date: Mon, 6 Apr 2026 03:45:50 +0530
Subject: [PATCH 1/2] feat: add automated Heroku data health equivalency checks

---
NOTE(review): reconstructed from a whitespace-mangled paste. Additionally,
scripts/check_data_health.py now prepends the repository root to sys.path so
"python scripts/check_data_health.py" (as run by the workflow) can import the
"application" package — sys.path[0] is otherwise scripts/, not the repo root.
Hunk and diffstat counts updated accordingly; blob index hashes are stale.

 .github/workflows/data-health-check.yml |  97 ++++++++++++++++++
 application/tests/data_health_test.py   |  90 +++++++++++++++++
 application/utils/data_health.py        | 127 ++++++++++++++++++++++++
 scripts/check_data_health.py            |  90 ++++++++++++++++
 4 files changed, 404 insertions(+)
 create mode 100644 .github/workflows/data-health-check.yml
 create mode 100644 application/tests/data_health_test.py
 create mode 100644 application/utils/data_health.py
 create mode 100644 scripts/check_data_health.py

diff --git a/.github/workflows/data-health-check.yml b/.github/workflows/data-health-check.yml
new file mode 100644
index 000000000..62002c874
--- /dev/null
+++ b/.github/workflows/data-health-check.yml
@@ -0,0 +1,97 @@
+name: Data Health Check
+
+on:
+  schedule:
+    - cron: "30 1 * * *"
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  actions: read
+
+jobs:
+  data-health-check:
+    environment: Heroku-DB-Backup
+    runs-on: ubuntu-latest
+    env:
+      HEROKU_APP_NAME: opencreorg
+      PGPASSWORD: postgres
+    services:
+      postgres:
+        image: postgres:16
+        env:
+          POSTGRES_DB: postgres
+          POSTGRES_USER: postgres
+          POSTGRES_PASSWORD: postgres
+        ports:
+          - 5432:5432
+        options: >-
+          --health-cmd "pg_isready -U postgres"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y postgresql-client
+          python -m pip install --upgrade pip psycopg2-binary
+
+      - name: Install Heroku CLI
+        run: curl https://cli-assets.heroku.com/install-ubuntu.sh | sh
+
+      - name: Download known-good backup artifact
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          set -euo pipefail
+          mkdir -p known-good
+          run_id="$(gh run list --repo OWASP/OpenCRE --workflow backup.yml --branch main --status success --limit 1 --json databaseId --jq '.[0].databaseId')"
+          if [ -z "${run_id}" ] || [ "${run_id}" = "null" ]; then
+            echo "Could not find successful backup workflow runs on main."
+            exit 1
+          fi
+
+          gh run download "${run_id}" --repo OWASP/OpenCRE --name opencreorg_db_backup --dir known-good
+          known_good_dump="$(find known-good -maxdepth 1 -name '*.dump' | head -n 1)"
+          if [ -z "${known_good_dump}" ]; then
+            echo "No .dump file found in opencreorg_db_backup artifact."
+            exit 1
+          fi
+          echo "KNOWN_GOOD_DUMP=${known_good_dump}" >> "${GITHUB_ENV}"
+
+      - name: Download current Heroku backup
+        env:
+          HEROKU_API_KEY: ${{ secrets.HEROKU_API_KEY }}
+        run: |
+          set -euo pipefail
+          heroku pg:backups:capture -a "${HEROKU_APP_NAME}"
+          heroku pg:backups:download -a "${HEROKU_APP_NAME}" --output=current.dump
+          echo "CURRENT_DUMP=${PWD}/current.dump" >> "${GITHUB_ENV}"
+
+      - name: Restore dumps to local postgres
+        run: |
+          set -euo pipefail
+          psql -h localhost -U postgres -d postgres -c "DROP DATABASE IF EXISTS opencre_known_good;"
+          psql -h localhost -U postgres -d postgres -c "DROP DATABASE IF EXISTS opencre_current;"
+          psql -h localhost -U postgres -d postgres -c "CREATE DATABASE opencre_known_good;"
+          psql -h localhost -U postgres -d postgres -c "CREATE DATABASE opencre_current;"
+
+          pg_restore --clean --if-exists --no-owner --no-privileges -h localhost -U postgres -d opencre_known_good "${KNOWN_GOOD_DUMP}"
+          pg_restore --clean --if-exists --no-owner --no-privileges -h localhost -U postgres -d opencre_current "${CURRENT_DUMP}"
+
+      - name: Compare datasets
+        run: |
+          python scripts/check_data_health.py \
+            --db1-url "postgresql://postgres:postgres@localhost:5432/opencre_known_good" \
+            --db2-url "postgresql://postgres:postgres@localhost:5432/opencre_current" \
+            --db1-label "known-good" \
+            --db2-label "heroku-current"
diff --git a/application/tests/data_health_test.py b/application/tests/data_health_test.py
new file mode 100644
index 000000000..208a52d17
--- /dev/null
+++ b/application/tests/data_health_test.py
@@ -0,0 +1,90 @@
+import unittest
+
+from application.utils import data_health
+
+
+class TestDataHealth(unittest.TestCase):
+    def _dataset(self, cre_id: str, child_id: str, node_id: str):
+        return {
+            "cre": [
+                {
+                    "id": cre_id,
+                    "external_id": "100-100",
+                    "name": "Authentication",
+                    "description": "Base auth requirement",
+                    "tags": "auth,session",
+                },
+                {
+                    "id": child_id,
+                    "external_id": "100-101",
+                    "name": "Session timeout",
+                    "description": "Timeout policy",
+                    "tags": "session",
+                },
+            ],
+            "node": [
+                {
+                    "id": node_id,
+                    "name": "ASVS",
+                    "section": "V2",
+                    "subsection": "2.1.1",
+                    "section_id": "ASVS-V2-2.1.1",
+                    "version": "4.0",
+                    "description": "ASVS mapping entry",
+                    "tags": "asvs",
+                    "ntype": "Standard",
+                    "link": "https://example.com",
+                }
+            ],
+            "cre_links": [
+                {
+                    "type": "Contains",
+                    "group": cre_id,
+                    "cre": child_id,
+                }
+            ],
+            "cre_node_links": [
+                {
+                    "type": "Linked To",
+                    "cre": child_id,
+                    "node": node_id,
+                }
+            ],
+        }
+
+    def test_equivalent_when_only_internal_ids_differ(self):
+        left_rows = self._dataset("cre-id-1", "cre-id-2", "node-id-1")
+        right_rows = self._dataset("cre-id-a", "cre-id-b", "node-id-a")
+
+        left = data_health.build_canonical_snapshot(left_rows)
+        right = data_health.build_canonical_snapshot(right_rows)
+
+        self.assertEqual(
+            data_health.snapshot_digest(left), data_health.snapshot_digest(right)
+        )
+        self.assertEqual(data_health.snapshot_diff(left, right), {})
+
+    def test_detects_data_change(self):
+        left_rows = self._dataset("cre-id-1", "cre-id-2", "node-id-1")
+        right_rows = self._dataset("cre-id-a", "cre-id-b", "node-id-a")
+        right_rows["node"][0]["description"] = "Changed description"
+
+        left = data_health.build_canonical_snapshot(left_rows)
+        right = data_health.build_canonical_snapshot(right_rows)
+
+        self.assertNotEqual(
+            data_health.snapshot_digest(left), data_health.snapshot_digest(right)
+        )
+        diff = data_health.snapshot_diff(left, right)
+        self.assertIn("node", diff)
+
+    def test_raises_on_missing_foreign_key_target(self):
+        rows = self._dataset("cre-id-1", "cre-id-2", "node-id-1")
+        rows["cre_links"][0]["cre"] = "unknown-cre-id"
+
+        with self.assertRaises(ValueError):
+            data_health.build_canonical_snapshot(rows)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/application/utils/data_health.py b/application/utils/data_health.py
new file mode 100644
index 000000000..e8b0466c9
--- /dev/null
+++ b/application/utils/data_health.py
@@ -0,0 +1,127 @@
+import hashlib
+import json
+from typing import Any, Dict, List, Mapping, Sequence, Tuple
+
+
+Snapshot = Dict[str, List[Tuple[Any, ...]]]
+
+REQUIRED_TABLES = ("cre", "node", "cre_links", "cre_node_links")
+
+
+def _normalize(value: Any) -> str:
+    if value is None:
+        return ""
+    return str(value)
+
+
+def _cre_key(row: Mapping[str, Any]) -> Tuple[str, ...]:
+    return (
+        _normalize(row.get("external_id")),
+        _normalize(row.get("name")),
+        _normalize(row.get("description")),
+        _normalize(row.get("tags")),
+    )
+
+
+def _node_key(row: Mapping[str, Any]) -> Tuple[str, ...]:
+    return (
+        _normalize(row.get("name")),
+        _normalize(row.get("section")),
+        _normalize(row.get("subsection")),
+        _normalize(row.get("section_id")),
+        _normalize(row.get("version")),
+        _normalize(row.get("description")),
+        _normalize(row.get("tags")),
+        _normalize(row.get("ntype")),
+        _normalize(row.get("link")),
+    )
+
+
+def _validate_table_presence(rows: Mapping[str, Sequence[Mapping[str, Any]]]) -> None:
+    missing = [table for table in REQUIRED_TABLES if table not in rows]
+    if missing:
+        raise ValueError(f"Missing required tables: {missing}")
+
+
+def build_canonical_snapshot(
+    rows: Mapping[str, Sequence[Mapping[str, Any]]],
+) -> Snapshot:
+    _validate_table_presence(rows)
+
+    cre_id_to_key: Dict[str, Tuple[str, ...]] = {}
+    node_id_to_key: Dict[str, Tuple[str, ...]] = {}
+
+    cre_rows: List[Tuple[str, ...]] = []
+    node_rows: List[Tuple[str, ...]] = []
+    cre_links_rows: List[Tuple[Any, ...]] = []
+    cre_node_links_rows: List[Tuple[Any, ...]] = []
+
+    for row in rows["cre"]:
+        key = _cre_key(row)
+        row_id = _normalize(row.get("id"))
+        cre_id_to_key[row_id] = key
+        cre_rows.append(key)
+
+    for row in rows["node"]:
+        key = _node_key(row)
+        row_id = _normalize(row.get("id"))
+        node_id_to_key[row_id] = key
+        node_rows.append(key)
+
+    for row in rows["cre_links"]:
+        group_id = _normalize(row.get("group"))
+        cre_id = _normalize(row.get("cre"))
+        if group_id not in cre_id_to_key or cre_id not in cre_id_to_key:
+            raise ValueError(
+                f"cre_links contains unknown IDs: group={group_id}, cre={cre_id}"
+            )
+        cre_links_rows.append(
+            (
+                _normalize(row.get("type")),
+                cre_id_to_key[group_id],
+                cre_id_to_key[cre_id],
+            )
+        )
+
+    for row in rows["cre_node_links"]:
+        cre_id = _normalize(row.get("cre"))
+        node_id = _normalize(row.get("node"))
+        if cre_id not in cre_id_to_key or node_id not in node_id_to_key:
+            raise ValueError(
+                f"cre_node_links contains unknown IDs: cre={cre_id}, node={node_id}"
+            )
+        cre_node_links_rows.append(
+            (
+                _normalize(row.get("type")),
+                cre_id_to_key[cre_id],
+                node_id_to_key[node_id],
+            )
+        )
+
+    snapshot: Snapshot = {
+        "cre": sorted(cre_rows),
+        "node": sorted(node_rows),
+        "cre_links": sorted(cre_links_rows),
+        "cre_node_links": sorted(cre_node_links_rows),
+    }
+    return snapshot
+
+
+def snapshot_digest(snapshot: Snapshot) -> str:
+    payload = json.dumps(snapshot, sort_keys=True, separators=(",", ":"))
+    return hashlib.sha256(payload.encode("utf-8")).hexdigest()
+
+
+def snapshot_diff(expected: Snapshot, actual: Snapshot) -> Dict[str, Dict[str, Any]]:
+    diff: Dict[str, Dict[str, Any]] = {}
+    for table in REQUIRED_TABLES:
+        missing = sorted(set(expected.get(table, [])) - set(actual.get(table, [])))
+        extra = sorted(set(actual.get(table, [])) - set(expected.get(table, [])))
+        if missing or extra:
+            diff[table] = {
+                "missing_count": len(missing),
+                "extra_count": len(extra),
+                "missing_sample": missing[:3],
+                "extra_sample": extra[:3],
+            }
+    return diff
diff --git a/scripts/check_data_health.py b/scripts/check_data_health.py
new file mode 100644
index 000000000..9e2a15111
--- /dev/null
+++ b/scripts/check_data_health.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any, Dict, List
+
+import psycopg2
+from psycopg2.extras import RealDictCursor
+
+# Make the repository root importable when this script is run directly:
+# "python scripts/check_data_health.py" puts scripts/ (not the repo root) on sys.path.
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from application.utils import data_health
+
+
+def _fetch_rows(conn: Any, query: str) -> List[Dict[str, Any]]:
+    with conn.cursor(cursor_factory=RealDictCursor) as cursor:
+        cursor.execute(query)
+        return [dict(row) for row in cursor.fetchall()]
+
+
+def load_snapshot(db_url: str) -> data_health.Snapshot:
+    conn = psycopg2.connect(db_url)
+    try:
+        rows = {
+            "cre": _fetch_rows(
+                conn, "SELECT id, external_id, name, description, tags FROM cre"
+            ),
+            "node": _fetch_rows(
+                conn,
+                "SELECT id, name, section, subsection, section_id, version, "
+                "description, tags, ntype, link FROM node",
+            ),
+            "cre_links": _fetch_rows(conn, 'SELECT type, "group", cre FROM cre_links'),
+            "cre_node_links": _fetch_rows(
+                conn, "SELECT type, cre, node FROM cre_node_links"
+            ),
+        }
+        return data_health.build_canonical_snapshot(rows)
+    finally:
+        conn.close()
+
+
+def _counts(snapshot: data_health.Snapshot) -> Dict[str, int]:
+    return {table: len(rows) for table, rows in snapshot.items()}
+
+
+def _to_json(data: Any) -> str:
+    return json.dumps(data, indent=2, default=str)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Compare OpenCRE data equivalency between two PostgreSQL databases."
+    )
+    parser.add_argument("--db1-url", required=True, help="Known-good database URL.")
+    parser.add_argument("--db2-url", required=True, help="Current database URL.")
+    parser.add_argument("--db1-label", default="db1", help="Label for database 1.")
+    parser.add_argument("--db2-label", default="db2", help="Label for database 2.")
+    args = parser.parse_args()
+
+    snapshot_1 = load_snapshot(args.db1_url)
+    snapshot_2 = load_snapshot(args.db2_url)
+
+    digest_1 = data_health.snapshot_digest(snapshot_1)
+    digest_2 = data_health.snapshot_digest(snapshot_2)
+
+    print(f"{args.db1_label} counts: {_to_json(_counts(snapshot_1))}")
+    print(f"{args.db2_label} counts: {_to_json(_counts(snapshot_2))}")
+    print(f"{args.db1_label} digest: {digest_1}")
+    print(f"{args.db2_label} digest: {digest_2}")
+
+    if digest_1 == digest_2:
+        print("Data health check passed: datasets are equivalent.")
+        return 0
+
+    diff_1_to_2 = data_health.snapshot_diff(snapshot_1, snapshot_2)
+    diff_2_to_1 = data_health.snapshot_diff(snapshot_2, snapshot_1)
+    print("Data health check failed: dataset mismatch detected.")
+    print(f"{args.db1_label} -> {args.db2_label} diff:")
+    print(_to_json(diff_1_to_2))
+    print(f"{args.db2_label} -> {args.db1_label} diff:")
+    print(_to_json(diff_2_to_1))
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())

From 5589391b9d2d369930f7ea956fca2a8122714939 Mon Sep 17 00:00:00 2001
From: PRAteek-singHWY
Date: Mon, 6 Apr 2026 04:29:39 +0530
Subject: [PATCH 2/2] chore: remove hardcoded postgres password literals from
 workflow

---
 .github/workflows/data-health-check.yml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/data-health-check.yml b/.github/workflows/data-health-check.yml
index 62002c874..f999c643d 100644
--- a/.github/workflows/data-health-check.yml
+++ b/.github/workflows/data-health-check.yml
@@ -15,18 +15,17 @@ jobs:
     runs-on: ubuntu-latest
     env:
       HEROKU_APP_NAME: opencreorg
-      PGPASSWORD: postgres
     services:
       postgres:
         image: postgres:16
         env:
           POSTGRES_DB: postgres
           POSTGRES_USER: postgres
-          POSTGRES_PASSWORD: postgres
+          POSTGRES_HOST_AUTH_METHOD: trust
         ports:
           - 5432:5432
         options: >-
           --health-cmd "pg_isready -U postgres"
           --health-interval 10s
           --health-timeout 5s
           --health-retries 5
@@ -91,7 +90,7 @@ jobs:
       - name: Compare datasets
         run: |
           python scripts/check_data_health.py \
-            --db1-url "postgresql://postgres:postgres@localhost:5432/opencre_known_good" \
-            --db2-url "postgresql://postgres:postgres@localhost:5432/opencre_current" \
+            --db1-url "postgresql://postgres@localhost:5432/opencre_known_good" \
+            --db2-url "postgresql://postgres@localhost:5432/opencre_current" \
             --db1-label "known-good" \
             --db2-label "heroku-current"