Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions migrations/0021_fingerprint_town_removal.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
-- ============================================================
-- Migration 0021: Fingerprint town-segment removal
--
-- Context: In May 2026 GOV.UK changed the register CSV layout to
-- Sponsor Licence Number, Organisation Name, TierRating,
-- Migrant Classification, Sponsor Status
-- The Town/City column no longer exists, so the runtime fingerprint
-- (normalizedName|normalizedTown|route) is now generated as
-- (normalizedName||route) with an empty middle segment.
--
-- Stored fingerprints still carry the old town segment, so no row in
-- the database can ever match a fingerprint generated from the new
-- feed. This migration rewrites every stored fingerprint from
-- name|town|route → name||route
-- and deduplicates rows that collapse onto the same new fingerprint
-- (same company name + route previously listed in multiple towns).
--
-- The rewrite is idempotent: fingerprints already in name||route form
-- are unchanged by the regexp.
-- ============================================================

BEGIN;

-- ── Helper expression used throughout ────────────────────────────────────────
-- regexp_replace(fp, '^([^|]*)\|[^|]*\|', '\1||')
-- "acme ltd|london|skilled worker" → "acme ltd||skilled worker"
-- "acme ltd||skilled worker" → unchanged (idempotent)

-- ── 1. sponsor_canonical (unique fingerprint) ────────────────────────────────
-- Deduplicate first: rows collapsing to the same new fingerprint keep the
-- "best" row — live status preferred, then earliest first_seen, then lowest id.
-- The DELETE must run before the UPDATE, otherwise the rewrite would produce
-- duplicate values in the unique fingerprint column.
--
-- NULL handling:
--   * Rows with a NULL fingerprint are excluded from ranking. PARTITION BY puts
--     every NULL into one partition, so without the filter all but one of them
--     would be deleted even though they never collide on a rewritten value.
--   * The liveness flag is COALESCEd to FALSE. A NULL status makes
--     `status NOT IN (...)` evaluate to NULL, and DESC ordering places NULLs
--     first by default, which would rank an unknown-status row above a live one.
WITH ranked AS (
  SELECT
    id,
    ROW_NUMBER() OVER (
      PARTITION BY regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
      ORDER BY
        COALESCE(status NOT IN ('REMOVED_REVOKED'), FALSE) DESC,
        first_seen ASC,
        id ASC
    ) AS rn
  FROM sponsor_canonical
  WHERE fingerprint IS NOT NULL
)
DELETE FROM sponsor_canonical sc
USING ranked r
WHERE sc.id = r.id AND r.rn > 1;

-- Rewrite the survivors. The filter only matches fingerprints that still carry
-- a non-empty town segment, so rows already in name||route form are skipped.
UPDATE sponsor_canonical
SET fingerprint = regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
WHERE fingerprint ~ '^[^|]*\|[^|]+\|';

Check failure on line 50 in migrations/0021_fingerprint_town_removal.sql

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Define a constant instead of duplicating this literal 10 times.

See more on https://sonarcloud.io/project/issues?id=Sam-Aitech_Checkbyai.net&issues=AZ601cvnEwbPSLU0Wqxu&open=AZ601cvnEwbPSLU0Wqxu&pullRequest=47

-- ── 2. sponsor_changes (non-unique) ──────────────────────────────────────────
-- No dedup pass: fingerprint is non-unique here, so several rows may share the
-- rewritten value. The regex filter requires a non-empty middle (town) segment,
-- so rows already in name||route form are left untouched.
UPDATE sponsor_changes
SET fingerprint = regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
WHERE fingerprint IS NOT NULL
AND fingerprint ~ '^[^|]*\|[^|]+\|';

-- ── 3. company_watches (non-unique) ──────────────────────────────────────────
-- Straight rewrite of name|town|route → name||route; no dedup because the
-- fingerprint column is non-unique in this table. Rows whose fingerprint has
-- no town segment (or is NULL) do not match the filter and are not updated.
UPDATE company_watches
SET fingerprint = regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
WHERE fingerprint IS NOT NULL
AND fingerprint ~ '^[^|]*\|[^|]+\|';

-- ── 4. sponsor_enrichment (unique fingerprint) ───────────────────────────────
-- Rows that collapse onto the same rewritten fingerprint are reduced to the
-- one with the lowest id before the rewrite, so the unique fingerprint column
-- never sees a duplicate.
-- NULL fingerprints are excluded from ranking: PARTITION BY groups all NULLs
-- together, so without the filter all but one NULL-fingerprint row would be
-- deleted although they do not collide on any rewritten value.
WITH ranked AS (
  SELECT
    id,
    ROW_NUMBER() OVER (
      PARTITION BY regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
      ORDER BY id ASC
    ) AS rn
  FROM sponsor_enrichment
  WHERE fingerprint IS NOT NULL
)
DELETE FROM sponsor_enrichment se
USING ranked r
WHERE se.id = r.id AND r.rn > 1;

-- Only fingerprints that still contain a non-empty town segment are rewritten.
UPDATE sponsor_enrichment
SET fingerprint = regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
WHERE fingerprint ~ '^[^|]*\|[^|]+\|';

-- ── 5. sponsor_licence_timeline (unique fingerprint+recorded_date+source) ────
-- Two timeline rows are duplicates only when they collapse to the same new
-- fingerprint AND share recorded_date and source; the lowest id survives.
-- NULL fingerprints are excluded from ranking: PARTITION BY groups NULLs
-- together, which would otherwise delete rows that do not collide on any
-- rewritten fingerprint.
WITH ranked AS (
  SELECT
    id,
    ROW_NUMBER() OVER (
      PARTITION BY
        regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||'),
        recorded_date,
        source
      ORDER BY id ASC
    ) AS rn
  FROM sponsor_licence_timeline
  WHERE fingerprint IS NOT NULL
)
DELETE FROM sponsor_licence_timeline t
USING ranked r
WHERE t.id = r.id AND r.rn > 1;

-- Only fingerprints that still contain a non-empty town segment are rewritten.
UPDATE sponsor_licence_timeline
SET fingerprint = regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
WHERE fingerprint ~ '^[^|]*\|[^|]+\|';

-- ── 6. enrichment_queue (unique fingerprint+job_type) ────────────────────────
-- Queue entries are duplicates when they collapse to the same new fingerprint
-- for the same job_type; the lowest id survives.
-- NULL fingerprints are excluded from ranking: PARTITION BY groups NULLs
-- together, which would otherwise delete rows that do not collide on any
-- rewritten fingerprint.
WITH ranked AS (
  SELECT
    id,
    ROW_NUMBER() OVER (
      PARTITION BY
        regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||'),
        job_type
      ORDER BY id ASC
    ) AS rn
  FROM enrichment_queue
  WHERE fingerprint IS NOT NULL
)
DELETE FROM enrichment_queue q
USING ranked r
WHERE q.id = r.id AND r.rn > 1;

-- Only fingerprints that still contain a non-empty town segment are rewritten.
UPDATE enrichment_queue
SET fingerprint = regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
WHERE fingerprint ~ '^[^|]*\|[^|]+\|';

-- ── 7. job_listings (non-unique) ─────────────────────────────────────────────
-- Straight rewrite; no dedup because fingerprint is non-unique in this table.
-- The `~` filter is never true for a NULL fingerprint, so such rows are
-- skipped without an explicit IS NOT NULL check.
UPDATE job_listings
SET fingerprint = regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
WHERE fingerprint ~ '^[^|]*\|[^|]+\|';

-- ── 8. job_alert_preferences (unique user_id+fingerprint) ────────────────────
-- A user's preferences are duplicates when they collapse to the same new
-- fingerprint for the same user_id; the lowest id survives.
-- NULL fingerprints are excluded from ranking: PARTITION BY groups NULLs
-- together, which would otherwise delete all but one NULL-fingerprint
-- preference per user even though nothing about them changes in this migration.
WITH ranked AS (
  SELECT
    id,
    ROW_NUMBER() OVER (
      PARTITION BY
        user_id,
        regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
      ORDER BY id ASC
    ) AS rn
  FROM job_alert_preferences
  WHERE fingerprint IS NOT NULL
)
DELETE FROM job_alert_preferences p
USING ranked r
WHERE p.id = r.id AND r.rn > 1;

-- Only fingerprints that still contain a non-empty town segment are rewritten.
UPDATE job_alert_preferences
SET fingerprint = regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
WHERE fingerprint ~ '^[^|]*\|[^|]+\|';

-- ── 9. sponsor_staging (transient ETL table, non-unique) ─────────────────────
-- Straight rewrite; no dedup because fingerprint is non-unique in this table.
-- Rows without a non-empty town segment do not match the filter and are
-- left as they are.
UPDATE sponsor_staging
SET fingerprint = regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
WHERE fingerprint ~ '^[^|]*\|[^|]+\|';

-- ── 10. sponsor_list (legacy; dropped by 0011 in some environments) ──────────
-- to_regclass() resolves the name through the current search_path — the same
-- lookup the UPDATE below performs — and returns NULL when nothing is found.
-- An information_schema.tables probe filtered on table_name alone matches a
-- sponsor_list table in ANY schema, so it can report "exists" for a table the
-- UPDATE cannot resolve, which would abort the whole migration transaction.
DO $$
BEGIN
  IF to_regclass('sponsor_list') IS NOT NULL THEN
    UPDATE sponsor_list
    SET fingerprint = regexp_replace(fingerprint, '^([^|]*)\|[^|]*\|', '\1||')
    WHERE fingerprint IS NOT NULL
      AND fingerprint ~ '^[^|]*\|[^|]+\|';
  END IF;
END $$;

COMMIT;
126 changes: 126 additions & 0 deletions server/utils/__tests__/sponsorCsvNewFormat.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/**
* sponsorCsvNewFormat.test.ts
*
* Regression coverage for the GOV.UK register CSV schema change (May 2026).
*
* Old format: Organisation Name,Town/City,County,Type & Rating,Route
* New format: Sponsor Licence Number,Organisation Name,TierRating,
* Migrant Classification,Sponsor Status
*
* Before the fix, every new-format row was rejected by Zod validation
* (typeRating empty, licenceType underivable), which produced an empty
* fingerprinted CSV. csvdiff then saw the whole register as deleted and
* the state machine mass-removed all 143K sponsors (2026-05-20 incident).
*/
import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
import * as fs from "fs";

Check warning on line 16 in server/utils/__tests__/sponsorCsvNewFormat.test.ts

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Prefer `node:fs` over `fs`.

See more on https://sonarcloud.io/project/issues?id=Sam-Aitech_Checkbyai.net&issues=AZ601craEwbPSLU0Wqxp&open=AZ601craEwbPSLU0Wqxp&pullRequest=47
import * as os from "os";

Check warning on line 17 in server/utils/__tests__/sponsorCsvNewFormat.test.ts

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Prefer `node:os` over `os`.

See more on https://sonarcloud.io/project/issues?id=Sam-Aitech_Checkbyai.net&issues=AZ601craEwbPSLU0Wqxq&open=AZ601craEwbPSLU0Wqxq&pullRequest=47
import * as path from "path";

Check warning on line 18 in server/utils/__tests__/sponsorCsvNewFormat.test.ts

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Prefer `node:path` over `path`.

See more on https://sonarcloud.io/project/issues?id=Sam-Aitech_Checkbyai.net&issues=AZ601craEwbPSLU0Wqxr&open=AZ601craEwbPSLU0Wqxr&pullRequest=47

vi.mock("../adminAlert", () => ({
sendAdminAlert: vi.fn().mockResolvedValue(undefined),
}));
vi.mock("../binaryRunner", () => ({
qsvValidate: vi.fn().mockResolvedValue({ ok: true }),
qsvCount: vi.fn().mockResolvedValue(0),
}));

import { parseCsvFile } from "../csvArchiver";
import { buildFingerprintedCsv, loadFingerprintSet } from "../csvFingerprintBuilder";
import { generateFingerprint } from "../sponsorListFetcher";
import { SponsorRowSchema, deriveSponsorRowEnums } from "../sponsorRowSchema";

/** Header row of the new register layout (May 2026); it has no Town/City or County column. */
const NEW_FORMAT_HEADER =
  "Sponsor Licence Number,Organisation Name,TierRating,Migrant Classification,Sponsor Status";

/** Data lines in the new layout. The first line has an organisation name with CSV-escaped quotes. */
const newFormatRowLines: readonly string[] = [
  `3DJDP93B8,"""K"" Line Energy Shipping (UK) Limited",Worker (A rating),Skilled Worker,Licensed and Fully Active`,
  `ABC123XYZ,Acme Global Ltd,Worker (A rating),Skilled Worker,Licensed and Fully Active`,
  `DEF456UVW,Beta Care Homes Ltd,Temporary Worker (A rating),Seasonal Worker,Licensed and Fully Active`,
];

/** The new-format data lines joined with newlines (no header, no trailing newline). */
const NEW_FORMAT_ROWS = newFormatRowLines.join("\n");

/** A complete legacy-layout CSV: header line plus a single data line. */
const legacyCsvLines: readonly string[] = [
  "Organisation Name,Town/City,County,Type & Rating,Route",
  "Acme Global Ltd,London,Greater London,Worker (A rating),Skilled Worker",
];
const OLD_FORMAT_CSV = legacyCsvLines.join("\n");

/** Scratch directory for the running test; assigned in beforeEach and removed in afterEach. */
let tmpDir: string;

beforeEach(() => {
  const dirPrefix = path.join(os.tmpdir(), "sponsor-csv-test-");
  tmpDir = fs.mkdtempSync(dirPrefix);
});

afterEach(() => {
  // recursive removes the directory contents; force suppresses the error if the path is already gone.
  fs.rmSync(tmpDir, { force: true, recursive: true });
});

/**
 * Writes `content` as UTF-8 into a file called `name` inside the per-test scratch directory.
 *
 * @param name - file name, joined onto the scratch directory path
 * @param content - full text to write
 * @returns the path of the file that was written
 */
function writeTmpCsv(name: string, content: string): string {
  const target = path.join(tmpDir, name);
  fs.writeFileSync(target, content, { encoding: "utf-8" });
  return target;
}

describe("new-format GOV.UK CSV — parseCsvFile", () => {
  it("accepts all rows from the new 5-column register format", async () => {
    // Header and data lines are kept as separate fixtures; stitch them into one CSV document.
    const newFormatCsv = [NEW_FORMAT_HEADER, NEW_FORMAT_ROWS].join("\n");
    const records = await parseCsvFile(writeTmpCsv("new.csv", newFormatCsv));

    expect(records).toHaveLength(3);

    // Index 1 is the second fixture line (Acme Global Ltd).
    const expectedAcmeFields = {
      organisationName: "Acme Global Ltd",
      typeRating: "Worker (A rating)",
      route: "Skilled Worker",
    };
    expect(records[1]).toMatchObject(expectedAcmeFields);
  });

  it("still accepts the legacy 5-column format", async () => {
    const records = await parseCsvFile(writeTmpCsv("old.csv", OLD_FORMAT_CSV));

    expect(records).toHaveLength(1);

    // The legacy layout still carries a town, so townCity is checked here as well.
    const expectedLegacyFields = {
      organisationName: "Acme Global Ltd",
      townCity: "London",
      typeRating: "Worker (A rating)",
      route: "Skilled Worker",
    };
    expect(records[0]).toMatchObject(expectedLegacyFields);
  });
});

describe("new-format GOV.UK CSV — buildFingerprintedCsv", () => {
  it("writes a fingerprint for every new-format row", async () => {
    const inputPath = writeTmpCsv("new.csv", [NEW_FORMAT_HEADER, NEW_FORMAT_ROWS].join("\n"));
    const fingerprintedPath = path.join(tmpDir, "new.fingerprinted.csv");

    await buildFingerprintedCsv(inputPath, fingerprintedPath);
    const fingerprints = await loadFingerprintSet(fingerprintedPath);

    // One fingerprint per fixture row.
    expect(fingerprints.size).toBe(3);

    // The new layout has no town column, so the expected fingerprint uses an empty town argument.
    const acmeFingerprint = generateFingerprint("Acme Global Ltd", "", "Skilled Worker");
    expect(fingerprints.has(acmeFingerprint)).toBe(true);
  });
});

describe("new-format GOV.UK CSV — SponsorRowSchema derivation", () => {
  it("derives enums and validates a row mapped from the new format", () => {
    // The same raw value is passed as both ratingRaw and typeRating; licenceTypeRaw is null.
    const rawInputs = {
      statusRaw: "Licensed and Fully Active",
      ratingRaw: "Worker (A rating)",
      typeRating: "Worker (A rating)",
      licenceTypeRaw: null,
    };
    const derived = deriveSponsorRowEnums(rawInputs);

    const { licenceStatus, rating, licenceType } = derived;
    expect(licenceStatus).toBe("Active");
    expect(rating).toBe("A-RATING");
    expect(licenceType).toBe("WORKER");

    // Town and county are null, as in a row mapped from the new layout.
    const candidateRow = {
      organisationName: "Acme Global Ltd",
      townCity: null,
      county: null,
      typeRating: "Worker (A rating)",
      route: "Skilled Worker",
      ...derived,
    };
    const { success } = SponsorRowSchema.safeParse(candidateRow);
    expect(success).toBe(true);
  });
});
40 changes: 13 additions & 27 deletions server/utils/csvArchiver.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import { qsvValidate, qsvCount } from "./binaryRunner";
import { sendAdminAlert } from "./adminAlert";
import { buildFingerprintedCsv, fingerprintedCsvPath } from "./csvFingerprintBuilder";
import type { SponsorRecord } from "./sponsorListFetcher";
import { resolveSponsorCsvColumns, type SponsorCsvColumnIndexes } from "./sponsorCsvColumns";
import {
SponsorRowSchema,
deriveSponsorRowEnums,
Expand Down Expand Up @@ -118,34 +119,11 @@ function detectHtmlContent(filePath: string): boolean {
}
}

// ── Column resolution (duplicated here to avoid circular import) ──────────────

/**
 * Positions of the known register columns within a CSV header row.
 * Every field is a zero-based index, or -1 when no header cell matched.
 */
interface ColumnIndexes {
  nameIdx: number;
  townIdx: number;
  countyIdx: number;
  typeIdx: number;
  routeIdx: number;
  statusIdx: number;
  licenceTypeIdx: number;
  ratingIdx: number;
  lastUpdatedIdx: number;
}
// ── Column resolution (shared with csvFingerprintBuilder) ─────────────────────

/**
 * Locates each known column by substring matching on the trimmed, lower-cased
 * header cells. For every column the first matching cell wins.
 *
 * @param header - raw header cells in file order
 * @returns the index of each column, with -1 for columns that were not found
 */
function resolveColumnIndexes(header: string[]): ColumnIndexes {
  const cells = header.map((raw) => raw.trim().toLowerCase());

  // Builds a predicate that is true when a cell contains every given fragment.
  const containsAll =
    (...fragments: string[]) =>
    (cell: string): boolean =>
      fragments.every((fragment) => cell.includes(fragment));

  const firstMatch = (matches: (cell: string) => boolean): number => cells.findIndex(matches);

  return {
    nameIdx: firstMatch(containsAll("organisation", "name")),
    townIdx: firstMatch((cell) => cell.includes("town") || cell.includes("city")),
    countyIdx: firstMatch(containsAll("county")),
    typeIdx: firstMatch(containsAll("type", "rating")),
    routeIdx: firstMatch(containsAll("route")),
    statusIdx: firstMatch(containsAll("status")),
    licenceTypeIdx: firstMatch(containsAll("licence", "type")),
    // A stand-alone rating column: the combined "type & rating" cell is excluded.
    ratingIdx: firstMatch((cell) => cell.includes("rating") && !cell.includes("type")),
    lastUpdatedIdx: firstMatch(containsAll("last", "updated")),
  };
}
// Local alias for the shared SponsorCsvColumnIndexes type imported from ./sponsorCsvColumns.
type ColumnIndexes = SponsorCsvColumnIndexes;

// Local alias for the shared resolveSponsorCsvColumns function imported from ./sponsorCsvColumns.
const resolveColumnIndexes = resolveSponsorCsvColumns;

// ── Core public API ───────────────────────────────────────────────────────────

Expand Down Expand Up @@ -599,6 +577,14 @@ export async function parseCsvFile(filePath: string): Promise<SponsorRecord[]> {
"🔴 CheckByAI: Sponsor CSV schema-change event detected",
`${buildSchemaChangeAlertHtml(`CsvArchiver file: ${filePath}`, summary)}`,
);
// Abort instead of returning a near-empty record set. Feeding a gutted
// parse result into the diff engine reads as "the entire register was
// deleted" and triggers mass removals (2026-05-20 incident).
throw new Error(
`[CsvArchiver] Aborting: ${rowsRejected.toLocaleString()} of ${totalRowsProcessed.toLocaleString()} ` +
`CSV rows rejected by validation — probable register schema change. ` +
`Rejection reasons: ${JSON.stringify(rejectionReasons)}`,
);
}

return sponsors;
Expand Down
Loading
Loading