From 97577ac509d2f7649424d7b83d61ae4a018ff59e Mon Sep 17 00:00:00 2001
From: William Joy
Date: Fri, 15 May 2026 19:11:37 -0700
Subject: [PATCH] Harden Textract Civil ID expiry extraction

---
 common/components/IdExpiryDateExtractor.php | 111 ++++++++++++++++++--
 console/controllers/CronController.php      |   8 +-
 docs/textract-civil-id-extractor.md         |  19 ++++
 tests/check-textract-civil-id-extractor.py  |  36 +++++++
 4 files changed, 165 insertions(+), 9 deletions(-)
 create mode 100644 docs/textract-civil-id-extractor.md
 create mode 100644 tests/check-textract-civil-id-extractor.py

diff --git a/common/components/IdExpiryDateExtractor.php b/common/components/IdExpiryDateExtractor.php
index dfdec9cf..07536e66 100644
--- a/common/components/IdExpiryDateExtractor.php
+++ b/common/components/IdExpiryDateExtractor.php
@@ -6,7 +6,6 @@
 use Aws\Textract\TextractClient;
 use Aws\Exception\AwsException;
 use yii\base\Component;
-use yii\base\InvalidConfigException;
 
 /**
  * $dates = Yii::$app->idExpiryDateExtractor->extractExpiryDate($documentName);
@@ -15,8 +14,14 @@
 class IdExpiryDateExtractor extends Component
 {
     private $textractClient;
+    /** @var string|null Set by init() when configuration is incomplete; blocks extraction. */
+    private $configurationError;
 
     public $version = 'latest';
+    /** @var string|null AWS region; init() falls back to resourceManager->region when empty. */
+    public $region;
+    /** @var string|null S3 bucket; falls back to resourceManager->bucket when empty. */
+    public $bucket;
 
     /**
      * @var string Amazon access key
@@ -35,8 +40,22 @@ public function init()
     {
         parent::init();
 
+        // Incomplete config: record why and skip client creation (extractExpiryDate() then returns an error).
+        $region = $this->region ?: $this->getResourceManagerProperty('region');
+        if (!$region || (!$this->bucket && !$this->getResourceManagerProperty('bucket'))) {
+            $this->configurationError = 'Textract region or document bucket configuration is missing.';
+            Yii::warning($this->configurationError, __METHOD__);
+            return;
+        }
+
+        if (!$this->key || !$this->secret) {
+            $this->configurationError = 'Textract credentials are not configured.';
+            Yii::warning($this->configurationError, __METHOD__);
+            return;
+        }
+
         $this->textractClient = new TextractClient([
-            'region' => Yii::$app->resourceManager->region,
+            'region' => $region,
             'version' => $this->version,
             'credentials' => [
                 'key' => $this->key,
@@ -51,11 +70,26 @@ public function init()
      */
     public function extractExpiryDate($documentName)
     {
+        if ($this->configurationError) {
+            return $this->errorResponse('Textract is not configured.');
+        }
+
+        if (!$this->isSafeDocumentName($documentName)) {
+            Yii::warning("Rejected unsafe Textract document name.", __METHOD__);
+            return $this->errorResponse('Invalid document reference.');
+        }
+
+        $bucket = $this->bucket ?: $this->getResourceManagerProperty('bucket');
+        if (!$bucket) {
+            Yii::warning("Textract document bucket is not configured.", __METHOD__);
+            return $this->errorResponse('Textract is not configured.');
+        }
+
         try {
             $result = $this->textractClient->detectDocumentText([
                 'Document' => [
                     'S3Object' => [
-                        'Bucket' => Yii::$app->resourceManager->bucket,
+                        'Bucket' => $bucket,
                         'Name' => $documentName
                     ]
                 ]
@@ -104,12 +138,75 @@ public function extractExpiryDate($documentName)
             ];
 
         } catch (AwsException $e) {
-            return [
-                "operation" => "error",
-                "matches" => $e->getMessage()
-            ];
+            // Log the AWS error code (or class) only; the raw message is neither logged nor returned.
+            Yii::error(sprintf(
+                'Textract detectDocumentText failed: %s',
+                $e->getAwsErrorCode() ?: get_class($e)
+            ), __METHOD__);
+
+            return $this->errorResponse('Unable to read document text.');
+        } catch (\Throwable $e) {
+            Yii::error(sprintf(
+                'Textract expiry extraction failed: %s',
+                get_class($e)
+            ), __METHOD__);
+
+            return $this->errorResponse('Unable to read document text.');
         }
     }
+
+    /**
+     * Reads a property from the optional `resourceManager` application component.
+     *
+     * @param string $property Property name, e.g. 'region' or 'bucket'.
+     * @return mixed|null The property value, or null when the component or property is unset.
+     */
+    private function getResourceManagerProperty($property)
+    {
+        $resourceManager = Yii::$app->get('resourceManager', false);
+
+        return $resourceManager && isset($resourceManager->$property)
+            ? $resourceManager->$property
+            : null;
+    }
+
+    /**
+     * Checks that a document name is a relative S3 key under the `photos/` prefix.
+     *
+     * Rejects non-strings, empty values, a leading slash, scheme-like prefixes
+     * (`word:`), backslashes, `..` sequences and the bare `photos/` prefix.
+     * Note: str_starts_with() needs PHP 8.0+ or a polyfill.
+     *
+     * @param mixed $documentName Candidate S3 object key.
+     * @return bool True when the key may be sent to Textract.
+     */
+    private function isSafeDocumentName($documentName)
+    {
+        if (!is_string($documentName) || $documentName === '') {
+            return false;
+        }
+
+        if (preg_match('/(^\/|^\w+:|\\\\|\.\.)/', $documentName)) {
+            return false;
+        }
+
+        // A bare prefix names no object; reject it instead of calling Textract.
+        return str_starts_with($documentName, 'photos/') && $documentName !== 'photos/';
+    }
+
+    /**
+     * Builds the error payload returned by extractExpiryDate().
+     *
+     * @param string $message Sanitized, caller-safe description of the failure.
+     * @return array{operation: string, matches: string}
+     */
+    private function errorResponse($message)
+    {
+        return [
+            "operation" => "error",
+            "matches" => $message
+        ];
+    }
 }
 
 
diff --git a/console/controllers/CronController.php b/console/controllers/CronController.php
index 92fc781f..b083fa06 100644
--- a/console/controllers/CronController.php
+++ b/console/controllers/CronController.php
@@ -159,13 +159,17 @@ public function actionValidateCivilId() {
                 $response = Yii::$app->idExpiryDateExtractor
                     ->extractExpiryDate("photos/" . $candidate->candidate_civil_photo_front);
 
-                if ($response['operation'] == "success" ) {
+                if ($response['operation'] == "success" && !empty($response['matches'])) {
                     $date = array_pop($response['matches']);
 
-                    $dateTime = $date? strtotime(str_replace("/", "-", $date)): time();
+                    $dateTime = strtotime(str_replace("/", "-", $date));
 
                     //$date = end($response['matches']);
 
+                    if (!$dateTime) {
+                        continue;
+                    }
+
                     /*if($candidate->candidate_civil_expiry_date
                         && $dateTime <= strtotime($candidate->candidate_civil_expiry_date)) {
                         continue;
diff --git a/docs/textract-civil-id-extractor.md b/docs/textract-civil-id-extractor.md
new file mode 100644
index 00000000..d2dcfb80
--- /dev/null
+++ b/docs/textract-civil-id-extractor.md
@@ -0,0 +1,19 @@
+# Textract Civil ID Expiry Extractor
+
+`common\components\IdExpiryDateExtractor` reads Civil ID images from the permanent S3 bucket and sends them to AWS Textract.
+
+## Required runtime configuration
+
+Set these values in the runtime secret store. Do not commit real key values.
+
+- `AWS_TEXTRACT_ACCESS_KEY_ID`
+- `AWS_TEXTRACT_SECRET_ACCESS_KEY`
+
+The extractor reuses the configured permanent S3 bucket and region from `resourceManager` unless `bucket` or `region` is explicitly set on the component.
+
+## Safety behavior
+
+- Missing Textract credentials, region, or bucket configuration returns `operation: error` without creating an AWS client.
+- Only `photos/` S3 object keys are accepted for Textract reads; leading slashes, scheme prefixes, backslashes, `..` sequences, and the bare `photos/` prefix are rejected.
+- AWS/provider exceptions are logged by exception type or AWS error code and are not returned to API callers.
+- The Civil ID validation cron only updates an expiry date when Textract returns an actual date match.
diff --git a/tests/check-textract-civil-id-extractor.py b/tests/check-textract-civil-id-extractor.py
new file mode 100644
index 00000000..c6569a7b
--- /dev/null
+++ b/tests/check-textract-civil-id-extractor.py
@@ -0,0 +1,36 @@
+from pathlib import Path
+
+
+ROOT = Path(__file__).resolve().parents[1]
+
+
+def read(path):
+    return (ROOT / path).read_text(encoding="utf-8")
+
+
+def require(condition, message):
+    if not condition:
+        raise SystemExit(message)
+
+
+extractor = read("common/components/IdExpiryDateExtractor.php")
+cron = read("console/controllers/CronController.php")
+config = read("common/config/main.php")
+docs = read("docs/textract-civil-id-extractor.md")
+
+require("AWS_TEXTRACT_ACCESS_KEY_ID" in config, "Textract access key must stay env-backed.")
+require("AWS_TEXTRACT_SECRET_ACCESS_KEY" in config, "Textract secret key must stay env-backed.")
+
+require("configurationError" in extractor, "Extractor must fail closed when config is missing.")
+require("isSafeDocumentName" in extractor, "Extractor must validate S3 object names before Textract reads.")
+require("str_starts_with($documentName, 'photos/')" in extractor, "Extractor must restrict reads to Civil ID photo keys.")
+require("$documentName !== 'photos/'" in extractor, "Extractor must reject the bare photos/ prefix.")
+require("$e->getMessage()" not in extractor, "Extractor must not return raw AWS/provider exception messages.")
+require("Unable to read document text." in extractor, "Extractor must return a sanitized OCR failure message.")
+
+require('$response[\'operation\'] == "success" && !empty($response[\'matches\'])' in cron, "Cron must only update expiry dates when OCR returns date matches.")
+require(": time()" not in cron, "Cron must not default missing OCR dates to the current time.")
+
+require("Only `photos/` S3 object keys" in docs, "Textract hardening docs must describe the S3 key boundary.")
+
+print("Textract Civil ID extractor hardening check passed.")