diff --git a/docs/civil-id-s3-recovery-audit.md b/docs/civil-id-s3-recovery-audit.md new file mode 100644 index 00000000..74a9063a --- /dev/null +++ b/docs/civil-id-s3-recovery-audit.md @@ -0,0 +1,68 @@ +# Civil ID S3 Recovery Audit + +Issue #55 Phase 8 asks for a way to identify Civil ID records whose database fields still point at `photos/` objects that may have been deleted by the accidental lifecycle rule. + +This workflow is intentionally offline. It uses sanitized exports from the database and S3 inventories, then classifies each Civil ID file as present, recoverable from the legacy permanent prefix, recoverable from the temp bucket export, or missing. + +## Inputs + +Export candidate Civil ID references from the database: + +```sql +SELECT candidate_id, 'front' AS side, candidate_civil_photo_front AS filename, + CONCAT('photos/', candidate_civil_photo_front) AS expected_s3_key, candidate_updated_at +FROM candidate +WHERE candidate_civil_photo_front IS NOT NULL AND candidate_civil_photo_front <> '' +UNION ALL +SELECT candidate_id, 'back' AS side, candidate_civil_photo_back AS filename, + CONCAT('photos/', candidate_civil_photo_back) AS expected_s3_key, candidate_updated_at +FROM candidate +WHERE candidate_civil_photo_back IS NOT NULL AND candidate_civil_photo_back <> '' +ORDER BY candidate_updated_at DESC; +``` + +Export object-key inventories separately for: + +- `studenthub-uploads` permanent bucket +- `studenthub-public-anyone-can-upload-24hr-expiry` temp bucket, if still available for the incident window + +Each object inventory file can list one key per line or be CSV-style output, in which case every cell that begins with `photos/` or `candidate-civil-id/` is read as a key. Keys must be bucket-relative: a cell holding a full `s3://` URI or a whole `aws s3 ls` line is not recognized. + +Do not place real candidate records or bucket exports in this repository. 
+ +## Usage + +```bash +node tools/audit-civil-id-s3-objects.mjs \ + --candidates /secure/path/civil-id-candidates.csv \ + --permanent-objects /secure/path/studenthub-uploads-keys.txt \ + --temp-objects /secure/path/temp-bucket-keys.txt \ + --emit-copy-commands +``` + +Use CSV output when the result needs to be filtered or attached to an internal incident tracker: + +```bash +node tools/audit-civil-id-s3-objects.mjs \ + --candidates /secure/path/civil-id-candidates.csv \ + --permanent-objects /secure/path/studenthub-uploads-keys.txt \ + --temp-objects /secure/path/temp-bucket-keys.txt \ + --format csv +``` + +## Statuses + +- `present`: the object `photos/{filename}` is listed in the `studenthub-uploads` permanent bucket export, where `{filename}` is the Civil ID filename from the candidate export. +- `recover_from_legacy`: `photos/{filename}` is not listed, but `candidate-civil-id/{filename}` is listed in the permanent bucket export and should be copied to `photos/{filename}`. +- `recover_from_temp`: the object is in neither permanent location, but the temp bucket export still contains it; it should be copied to the permanent `photos/` prefix. +- `missing`: the object is absent from all supplied exports. Ask the candidate to re-upload; do not mass-clear database fields until the audit is complete. + +## Verification + +Run the fixture-based local check: + +```bash +node tools/check-civil-id-s3-audit.mjs +``` + +The check creates synthetic candidate and S3 inventory exports under the OS temp directory and verifies all four statuses plus the generated copy commands. 
diff --git a/tools/audit-civil-id-s3-objects.mjs b/tools/audit-civil-id-s3-objects.mjs
new file mode 100644
--- /dev/null
+++ b/tools/audit-civil-id-s3-objects.mjs
@@ -0,0 +1,525 @@
+#!/usr/bin/env node
+
+import { readFileSync } from 'node:fs';
+import { posix } from 'node:path';
+
+const CURRENT_PREFIX = 'photos/';
+const LEGACY_PREFIX = 'candidate-civil-id/';
+const PERMANENT_BUCKET = 'studenthub-uploads';
+const TEMP_BUCKET = 'studenthub-public-anyone-can-upload-24hr-expiry';
+const VALUE_OPTIONS = new Set(['candidates', 'permanentObjects', 'tempObjects', 'format']);
+// Accepted header aliases in the candidate export, in lookup order.
+const ID_COLUMNS = ['candidate_id', 'candidateid', 'id'];
+const KEY_COLUMNS = ['filename', 'expected_s3_key', 's3_key', 'key'];
+
+/**
+ * Print the command-line help text.
+ *
+ * @param {(text: string) => void} [write] Output sink. Defaults to stdout; the error path passes
+ *   console.error so a redirected report never ends up containing usage text.
+ */
+function printUsage(write = console.log) {
+  write(`Usage:
+  node tools/audit-civil-id-s3-objects.mjs --candidates candidates.csv --permanent-objects permanent.txt [--temp-objects temp.txt] [--format markdown|csv] [--emit-copy-commands]
+
+Inputs:
+  --candidates          CSV export with candidate_id, side, filename or expected_s3_key columns
+  --permanent-objects   Text or CSV-style object inventory for studenthub-uploads
+  --temp-objects        Optional object inventory for studenthub-public-anyone-can-upload-24hr-expiry
+
+The script does not call AWS. It classifies records using offline exports only.`);
+}
+
+/**
+ * Parse command-line arguments.
+ *
+ * @param {string[]} argv Arguments after the node binary and the script path.
+ * @returns {object} Options: format, emitCopyCommands, and, when supplied, help, candidates,
+ *   permanentObjects and tempObjects.
+ * @throws {Error} On an unknown flag, a flag without a value, missing required inputs, or an
+ *   unsupported --format.
+ */
+function parseArgs(argv) {
+  const args = {
+    format: 'markdown',
+    emitCopyCommands: false,
+  };
+
+  for (let i = 0; i < argv.length; i += 1) {
+    const arg = argv[i];
+
+    if (arg === '--help' || arg === '-h') {
+      args.help = true;
+    } else if (arg === '--emit-copy-commands') {
+      args.emitCopyCommands = true;
+    } else if (arg.startsWith('--')) {
+      // --permanent-objects -> permanentObjects
+      const key = arg.slice(2).replace(/-([a-z])/g, (_, letter) => letter.toUpperCase());
+      const value = argv[i + 1];
+
+      if (!VALUE_OPTIONS.has(key)) {
+        throw new Error(`Unknown argument: ${arg}`);
+      }
+
+      if (!value || value.startsWith('--')) {
+        throw new Error(`${arg} requires a value`);
+      }
+
+      args[key] = value;
+      i += 1;
+    } else {
+      throw new Error(`Unexpected argument: ${arg}`);
+    }
+  }
+
+  if (args.help) {
+    return args;
+  }
+
+  if (!args.candidates || !args.permanentObjects) {
+    throw new Error('--candidates and --permanent-objects are required');
+  }
+
+  if (!['markdown', 'csv'].includes(args.format)) {
+    throw new Error('--format must be markdown or csv');
+  }
+
+  return args;
+}
+
+/**
+ * Split one CSV line into trimmed cells.
+ *
+ * Handles double-quoted cells and "" escapes. Quoted cells that span several lines are not
+ * supported because the callers split the input on newlines first.
+ *
+ * @param {string} line A single line of CSV text.
+ * @returns {string[]} Cell values with surrounding whitespace removed.
+ */
+function parseCsvLine(line) {
+  const cells = [];
+  let cell = '';
+  let quoted = false;
+
+  for (let i = 0; i < line.length; i += 1) {
+    const char = line[i];
+    const next = line[i + 1];
+
+    if (quoted && char === '"' && next === '"') {
+      cell += '"';
+      i += 1;
+    } else if (char === '"') {
+      quoted = !quoted;
+    } else if (!quoted && char === ',') {
+      cells.push(cell);
+      cell = '';
+    } else {
+      cell += char;
+    }
+  }
+
+  cells.push(cell);
+  return cells.map((value) => value.trim());
+}
+
+/**
+ * Read a text file as trimmed lines, dropping blank lines and lines that start with "#".
+ *
+ * @param {string} file Path to the file.
+ * @returns {string[]} Remaining lines in file order.
+ */
+function readLines(file) {
+  return readFileSync(file, 'utf8')
+    .split(/\r?\n/)
+    .map((line) => line.trim())
+    .filter((line) => line && !line.startsWith('#'));
+}
+
+/**
+ * Collect object keys from an inventory export (one key per line or CSV-style rows).
+ *
+ * @param {string} [file] Path to the export. When omitted, an empty set is returned.
+ * @param {{includeUnprefixed?: boolean}} [options] With includeUnprefixed, every non-empty cell
+ *   is kept instead of only cells under the photos/ or candidate-civil-id/ prefixes. The temp
+ *   bucket inventory needs this so classify() can match objects listed by bare filename.
+ * @returns {Set<string>} Keys found in the export.
+ */
+function readObjectKeys(file, { includeUnprefixed = false } = {}) {
+  const keys = new Set();
+
+  if (!file) {
+    return keys;
+  }
+
+  for (const line of readLines(file)) {
+    for (const cell of parseCsvLine(line)) {
+      const value = cell.trim().replace(/^"|"$/g, '');
+      const hasKnownPrefix = value.startsWith(CURRENT_PREFIX) || value.startsWith(LEGACY_PREFIX);
+
+      if (hasKnownPrefix || (includeUnprefixed && value)) {
+        keys.add(value);
+      }
+    }
+  }
+
+  return keys;
+}
+
+/**
+ * Load the candidate export and derive the S3 keys to look for.
+ *
+ * @param {string} file Path to a CSV export whose first line is a header row.
+ * @returns {object[]} One entry per usable row: candidateId, side, filename, updatedAt,
+ *   currentKey (photos/ prefix) and legacyKey (candidate-civil-id/ prefix).
+ * @throws {Error} When the header row has no candidate id column or no filename/key column; an
+ *   empty report would otherwise be indistinguishable from "nothing to recover".
+ */
+function readCandidateRows(file) {
+  const lines = readLines(file);
+
+  if (lines.length === 0) {
+    return [];
+  }
+
+  const headers = parseCsvLine(lines[0]).map((header) => header.toLowerCase());
+
+  if (!ID_COLUMNS.some((name) => headers.includes(name))) {
+    throw new Error(`Candidate export needs one of these columns: ${ID_COLUMNS.join(', ')}`);
+  }
+
+  if (!KEY_COLUMNS.some((name) => headers.includes(name))) {
+    throw new Error(`Candidate export needs one of these columns: ${KEY_COLUMNS.join(', ')}`);
+  }
+
+  const rows = [];
+  let skipped = 0;
+
+  for (const line of lines.slice(1)) {
+    const values = parseCsvLine(line);
+    const row = Object.fromEntries(headers.map((header, index) => [header, values[index] ?? '']));
+    const candidateId = row.candidate_id || row.candidateid || row.id;
+    const side = row.side || row.photo_side || row.type || 'unknown';
+    const rawFilename = row.filename || row.expected_s3_key || row.s3_key || row.key;
+    const filename = normalizeCivilIdFilename(rawFilename);
+
+    if (!candidateId || !filename) {
+      skipped += 1;
+      continue;
+    }
+
+    rows.push({
+      candidateId,
+      side,
+      filename,
+      updatedAt: row.candidate_updated_at || row.updated_at || '',
+      currentKey: `${CURRENT_PREFIX}${filename}`,
+      legacyKey: `${LEGACY_PREFIX}${filename}`,
+    });
+  }
+
+  if (skipped > 0) {
+    // stderr only, so the report on stdout stays machine-readable.
+    console.error(`Warning: skipped ${skipped} candidate row(s) without a candidate id or filename`);
+  }
+
+  return rows;
+}
+
+/**
+ * Reduce a filename or S3 key from the export to the bare object name.
+ *
+ * @param {string} [value] Raw cell value, e.g. "abc.jpg" or "photos/abc.jpg".
+ * @returns {string} The part after the photos/ or candidate-civil-id/ prefix when one is present,
+ *   otherwise the last path segment; an empty string for empty input.
+ */
+function normalizeCivilIdFilename(value) {
+  if (!value) {
+    return '';
+  }
+
+  const normalized = value.trim().replace(/^"|"$/g, '').replace(/^\/+/, '');
+
+  if (!normalized) {
+    return '';
+  }
+
+  if (normalized.startsWith(CURRENT_PREFIX)) {
+    return normalized.slice(CURRENT_PREFIX.length);
+  }
+
+  if (normalized.startsWith(LEGACY_PREFIX)) {
+    return normalized.slice(LEGACY_PREFIX.length);
+  }
+
+  // S3 keys always use "/", so do not let the host platform's separator rules apply.
+  return posix.basename(normalized);
+}
+
+/**
+ * Decide where a candidate's Civil ID object can be found.
+ *
+ * @param {object} row Entry produced by readCandidateRows().
+ * @param {Set<string>} permanentKeys Keys from the permanent bucket export.
+ * @param {Set<string>} tempKeys Keys from the temp bucket export (may be empty).
+ * @returns {object} status, action, sourceKey and destinationKey; recoverable results also carry
+ *   sourceBucket and destinationBucket, which copyCommand() requires.
+ */
+function classify(row, permanentKeys, tempKeys) {
+  if (permanentKeys.has(row.currentKey)) {
+    return {
+      status: 'present',
+      action: 'No action needed',
+      sourceKey: row.currentKey,
+      destinationKey: row.currentKey,
+    };
+  }
+
+  if (permanentKeys.has(row.legacyKey)) {
+    return {
+      status: 'recover_from_legacy',
+      action: 'Copy legacy permanent object to photos prefix',
+      sourceBucket: PERMANENT_BUCKET,
+      sourceKey: row.legacyKey,
+      destinationBucket: PERMANENT_BUCKET,
+      destinationKey: row.currentKey,
+    };
+  }
+
+  // Prefer the prefixed key; fall back to an object stored under its bare filename.
+  const tempKey = [row.currentKey, row.filename].find((key) => tempKeys.has(key));
+
+  if (tempKey) {
+    return {
+      status: 'recover_from_temp',
+      action: 'Copy temp object to permanent photos prefix',
+      sourceBucket: TEMP_BUCKET,
+      sourceKey: tempKey,
+      destinationBucket: PERMANENT_BUCKET,
+      destinationKey: row.currentKey,
+    };
+  }
+
+  return {
+    status: 'missing',
+    action: 'Request re-upload; do not clear DB field before audit is complete',
+    sourceKey: '',
+    destinationKey: row.currentKey,
+  };
+}
+
+/**
+ * Count results per status.
+ *
+ * @param {object[]} results Classified rows.
+ * @returns {Record<string, number>} Map from status to number of rows.
+ */
+function summarize(results) {
+  return results.reduce((summary, result) => {
+    summary[result.status] = (summary[result.status] ?? 0) + 1;
+    return summary;
+  }, {});
+}
+
+/**
+ * Quote a value for a POSIX shell.
+ *
+ * @param {unknown} value Value to quote; converted with String().
+ * @returns {string} Single-quoted text with embedded single quotes escaped.
+ */
+function shellQuote(value) {
+  return `'${String(value).replaceAll("'", "'\"'\"'")}'`;
+}
+
+/**
+ * Build the "aws s3 cp" command that restores a recoverable object.
+ *
+ * @param {object} result Classified row.
+ * @returns {string} The command, or an empty string when the result has no source or destination
+ *   bucket (statuses present and missing).
+ */
+function copyCommand(result) {
+  if (!result.sourceBucket || !result.destinationBucket) {
+    return '';
+  }
+
+  return [
+    'aws',
+    's3',
+    'cp',
+    shellQuote(`s3://${result.sourceBucket}/${result.sourceKey}`),
+    shellQuote(`s3://${result.destinationBucket}/${result.destinationKey}`),
+    '--only-show-errors',
+  ].join(' ');
+}
+
+/**
+ * Render the results as CSV.
+ *
+ * @param {object[]} results Classified rows.
+ * @param {boolean} includeCommands Append a copy_command column when true.
+ * @returns {string} CSV text with a header row and no trailing newline.
+ */
+function toCsv(results, includeCommands) {
+  const headers = [
+    'candidate_id',
+    'side',
+    'filename',
+    'status',
+    'action',
+    'source_key',
+    'destination_key',
+    'candidate_updated_at',
+  ];
+
+  if (includeCommands) {
+    headers.push('copy_command');
+  }
+
+  const lines = [headers.join(',')];
+
+  for (const result of results) {
+    const cells = [
+      result.candidateId,
+      result.side,
+      result.filename,
+      result.status,
+      result.action,
+      result.sourceKey,
+      result.destinationKey,
+      result.updatedAt,
+    ];
+
+    if (includeCommands) {
+      cells.push(copyCommand(result));
+    }
+
+    lines.push(cells.map(csvEscape).join(','));
+  }
+
+  return lines.join('\n');
+}
+
+/**
+ * Escape one CSV cell.
+ *
+ * @param {unknown} value Cell value; null and undefined become an empty cell.
+ * @returns {string} The value, quoted when it contains a quote, comma, or line break.
+ */
+function csvEscape(value) {
+  const stringValue = String(value ?? '');
+
+  // A bare carriage return also splits rows in most CSV readers.
+  if (/[",\r\n]/.test(stringValue)) {
+    return `"${stringValue.replaceAll('"', '""')}"`;
+  }
+
+  return stringValue;
+}
+
+/**
+ * Render the results as a Markdown report.
+ *
+ * @param {object[]} results Classified rows.
+ * @param {boolean} includeCommands Add a "Copy commands" section after the findings table.
+ *   Commands are kept out of the table body because any non-row line ends a Markdown table.
+ * @returns {string} Markdown text with no trailing newline.
+ */
+function toMarkdown(results, includeCommands) {
+  const summary = summarize(results);
+  const lines = [
+    '# Civil ID S3 Recovery Audit',
+    '',
+    'This report is generated from offline database and S3 inventory exports. It does not prove live AWS state unless the exports were captured after the latest upload remediation.',
+    '',
+    '## Summary',
+    '',
+    `- Present in permanent photos prefix: ${summary.present ?? 0}`,
+    `- Recoverable from legacy permanent prefix: ${summary.recover_from_legacy ?? 0}`,
+    `- Recoverable from temp bucket export: ${summary.recover_from_temp ?? 0}`,
+    `- Missing from all supplied exports: ${summary.missing ?? 0}`,
+    '',
+    '## Findings',
+    '',
+    '| Candidate | Side | Filename | Status | Action | Source | Destination |',
+    '|-|-|-|-|-|-|-|',
+  ];
+  const commandLines = [];
+
+  for (const result of results) {
+    const cells = [
+      result.candidateId,
+      result.side,
+      result.filename,
+      result.status,
+      result.action,
+      result.sourceKey || '-',
+      result.destinationKey || '-',
+    ].map(markdownCell);
+
+    lines.push(`|${cells.join('|')}|`);
+
+    const command = includeCommands ? copyCommand(result) : '';
+
+    if (command) {
+      commandLines.push(
+        '',
+        `Copy command for candidate ${result.candidateId} ${result.side}:`,
+        '',
+        '```bash',
+        command,
+        '```',
+      );
+    }
+  }
+
+  if (commandLines.length > 0) {
+    lines.push('', '## Copy commands', ...commandLines);
+  }
+
+  return lines.join('\n');
+}
+
+/**
+ * Escape a value for use inside a Markdown table cell.
+ *
+ * @param {unknown} value Cell value; null and undefined become an empty cell.
+ * @returns {string} Text with "|" escaped so it cannot end the cell early.
+ */
+function markdownCell(value) {
+  return String(value ?? '').replaceAll('|', '\\|');
+}
+
+/**
+ * CLI entry point: parse arguments, classify every candidate row, and print the report.
+ * Failures are reported on stderr and set a non-zero exit code.
+ */
+function main() {
+  try {
+    const args = parseArgs(process.argv.slice(2));
+
+    if (args.help) {
+      printUsage();
+      return;
+    }
+
+    const permanentKeys = readObjectKeys(args.permanentObjects);
+    // Temp uploads may be listed by bare filename, so keep unprefixed keys for that export.
+    const tempKeys = readObjectKeys(args.tempObjects, { includeUnprefixed: true });
+    const rows = readCandidateRows(args.candidates);
+    const results = rows.map((row) => ({ ...row, ...classify(row, permanentKeys, tempKeys) }));
+
+    if (args.format === 'csv') {
+      console.log(toCsv(results, args.emitCopyCommands));
+    } else {
+      console.log(toMarkdown(results, args.emitCopyCommands));
+    }
+  } catch (error) {
+    console.error(`Error: ${error.message}`);
+    console.error('');
+    // Usage goes to stderr too; stdout may be redirected into the report file.
+    printUsage(console.error);
+    process.exitCode = 1;
+  }
+}
+
+main();
diff --git a/tools/check-civil-id-s3-audit.mjs b/tools/check-civil-id-s3-audit.mjs
new file mode 100644
--- /dev/null
+++ b/tools/check-civil-id-s3-audit.mjs
@@ -0,0 +1,102 @@
+#!/usr/bin/env node
+
+import { execFileSync } from 'node:child_process';
+import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+import assert from 'node:assert/strict';
+import { fileURLToPath } from 'node:url';
+
+const root = fileURLToPath(new URL('..', import.meta.url));
+const script = join(root, 'tools/audit-civil-id-s3-objects.mjs');
+const dir = mkdtempSync(join(tmpdir(), 'civil-id-s3-audit-'));
+const candidates = join(dir, 'candidates.csv');
+const permanent = join(dir, 'permanent.txt');
+const temp = join(dir, 'temp.txt');
+
+try {
+  writeFileSync(candidates, [
+    'candidate_id,side,filename,candidate_updated_at',
+    '100,front,front-present.jpg,2026-05-01',
+    '101,back,back-legacy.jpg,2026-05-02',
+    '102,front,front-temp.jpg,2026-05-03',
+    '103,back,missing.jpg,2026-05-04',
+    '104,front,photos/already-prefixed.jpg,2026-05-05',
+    '105,back,back-temp-root.jpg,2026-05-06',
+  ].join('\n'));
+
+  writeFileSync(permanent, [
+    'photos/front-present.jpg',
+    'candidate-civil-id/back-legacy.jpg',
+    'photos/already-prefixed.jpg',
+  ].join('\n'));
+
+  // The second key has no prefix: temp uploads listed by bare filename must still be matched.
+  writeFileSync(temp, [
+    'photos/front-temp.jpg',
+    'back-temp-root.jpg',
+  ].join('\n'));
+
+  const markdown = execFileSync(process.execPath, [
+    script,
+    '--candidates',
+    candidates,
+    '--permanent-objects',
+    permanent,
+    '--temp-objects',
+    temp,
+    '--emit-copy-commands',
+  ], { encoding: 'utf8' });
+
+  assert.match(markdown, /Present in permanent photos prefix: 2/);
+  assert.match(markdown, /Recoverable from legacy permanent prefix: 1/);
+  assert.match(markdown, /Recoverable from temp bucket export: 2/);
+  assert.match(markdown, /Missing from all supplied exports: 1/);
+  assert.match(markdown, /aws s3 cp 's3:\/\/studenthub-uploads\/candidate-civil-id\/back-legacy\.jpg' 's3:\/\/studenthub-uploads\/photos\/back-legacy\.jpg'/);
+  assert.match(markdown, /aws s3 cp 's3:\/\/studenthub-public-anyone-can-upload-24hr-expiry\/photos\/front-temp\.jpg' 's3:\/\/studenthub-uploads\/photos\/front-temp\.jpg'/);
+  assert.match(markdown, /aws s3 cp 's3:\/\/studenthub-public-anyone-can-upload-24hr-expiry\/back-temp-root\.jpg' 's3:\/\/studenthub-uploads\/photos\/back-temp-root\.jpg'/);
+
+  // Header, separator and six candidate rows must be contiguous, or the table renders broken.
+  const isTableLine = markdown.split('\n').map((line) => line.startsWith('|'));
+  assert.equal(isTableLine.lastIndexOf(true) - isTableLine.indexOf(true) + 1, 8);
+  assert.equal(isTableLine.filter(Boolean).length, 8);
+
+  const csv = execFileSync(process.execPath, [
+    script,
+    '--candidates',
+    candidates,
+    '--permanent-objects',
+    permanent,
+    '--temp-objects',
+    temp,
+    '--format',
+    'csv',
+  ], { encoding: 'utf8' });
+
+  assert.match(csv, /101,back,back-legacy\.jpg,recover_from_legacy/);
+  assert.match(csv, /102,front,front-temp\.jpg,recover_from_temp/);
+  assert.match(csv, /103,back,missing\.jpg,missing/);
+  assert.match(csv, /105,back,back-temp-root\.jpg,recover_from_temp/);
+
+  assert.throws(() => {
+    execFileSync(process.execPath, [
+      script,
+      '--candidates',
+      candidates,
+      '--permanent-objects',
+      permanent,
+      '--unknown',
+      'value',
+    ], { encoding: 'utf8', stdio: 'pipe' });
+  }, (error) => {
+    assert.match(String(error.stderr), /Unknown argument: --unknown/);
+    // Usage text belongs on stderr so a redirected report is never polluted.
+    assert.equal(String(error.stdout), '');
+    return true;
+  });
+
+  console.log('Civil ID S3 audit helper check passed.');
+} finally {
+  // Fixtures are synthetic, but do not leave them behind in the OS temp directory.
+  rmSync(dir, { recursive: true, force: true });
+}