Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 76 additions & 7 deletions common/components/IdExpiryDateExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
use Aws\Textract\TextractClient;
use Aws\Exception\AwsException;
use yii\base\Component;
use yii\base\InvalidConfigException;

/**
* $dates = Yii::$app->idExpiryDateExtractor->extractExpiryDate($documentName);
Expand All @@ -15,8 +14,11 @@
class IdExpiryDateExtractor extends Component
{
private $textractClient;
private $configurationError;

public $version = 'latest';
public $region;
public $bucket;

/**
* @var string Amazon access key
Expand All @@ -35,8 +37,21 @@ public function init()
{
parent::init();

$region = $this->region ?: $this->getResourceManagerProperty('region');
if (!$region || !$this->bucket && !$this->getResourceManagerProperty('bucket')) {
$this->configurationError = 'Textract document bucket configuration is missing.';
Yii::warning($this->configurationError, __METHOD__);
return;
}

if (!$this->key || !$this->secret) {
$this->configurationError = 'Textract credentials are not configured.';
Yii::warning($this->configurationError, __METHOD__);
return;
}

$this->textractClient = new TextractClient([
'region' => Yii::$app->resourceManager->region,
'region' => $region,
'version' => $this->version,
'credentials' => [
'key' => $this->key,
Expand All @@ -51,11 +66,26 @@ public function init()
*/
public function extractExpiryDate($documentName)
{
if ($this->configurationError) {
return $this->errorResponse('Textract is not configured.');
}

if (!$this->isSafeDocumentName($documentName)) {
Yii::warning("Rejected unsafe Textract document name.", __METHOD__);
return $this->errorResponse('Invalid document reference.');
}

$bucket = $this->bucket ?: $this->getResourceManagerProperty('bucket');
if (!$bucket) {
Yii::warning("Textract document bucket is not configured.", __METHOD__);
return $this->errorResponse('Textract is not configured.');
}

try {
$result = $this->textractClient->detectDocumentText([
'Document' => [
'S3Object' => [
'Bucket' => Yii::$app->resourceManager->bucket,
'Bucket' => $bucket,
'Name' => $documentName
]
]
Expand Down Expand Up @@ -104,12 +134,51 @@ public function extractExpiryDate($documentName)
];

} catch (AwsException $e) {
return [
"operation" => "error",
"matches" => $e->getMessage()
];
Yii::error(sprintf(
'Textract detectDocumentText failed: %s',
$e->getAwsErrorCode() ?: get_class($e)
), __METHOD__);

return $this->errorResponse('Unable to read document text.');
} catch (\Throwable $e) {
Yii::error(sprintf(
'Textract expiry extraction failed: %s',
get_class($e)
), __METHOD__);

return $this->errorResponse('Unable to read document text.');
}
}

/**
 * Looks up a property on the application's `resourceManager` component.
 *
 * The component is requested with `false` as the second argument to
 * `Yii::$app->get()` (Yii's "throw exception" flag), so a falsy result is
 * treated here as "component not available" rather than an error.
 *
 * @param string $property Name of the property to read from the component.
 * @return mixed|null The property value, or null when the component is
 *                    unavailable or the property is not set.
 */
private function getResourceManagerProperty($property)
{
    $manager = Yii::$app->get('resourceManager', false);

    // Guard clause: no component, or nothing set under that property name.
    if (!$manager || !isset($manager->$property)) {
        return null;
    }

    return $manager->$property;
}

/**
 * Decides whether a document name may be used as the S3 object key for a
 * Textract read.
 *
 * A name is accepted only when it is a non-empty string that:
 *  - does not start with `/` or a `scheme:` prefix,
 *  - contains no backslash and no `..` sequence,
 *  - does not end with `/` (the bare `photos/` prefix or a "folder" key names
 *    no object; this is what a caller produces when it concatenates the
 *    prefix with an empty file name),
 *  - starts with the `photos/` prefix.
 *
 * @param mixed $documentName Candidate S3 object key.
 * @return bool True when the key is acceptable, false otherwise.
 */
private function isSafeDocumentName($documentName)
{
    if (!is_string($documentName) || $documentName === '') {
        return false;
    }

    // Absolute paths, scheme prefixes, backslashes and parent-directory sequences.
    if (preg_match('/(^\/|^\w+:|\\\\|\.\.)/', $documentName)) {
        return false;
    }

    // A key ending in "/" (including the bare "photos/" prefix) has no file name.
    if (str_ends_with($documentName, '/')) {
        return false;
    }

    return str_starts_with($documentName, 'photos/');
}

/**
 * Builds the error payload handed back to callers of the extractor.
 *
 * @param string $message Caller-facing description of the failure.
 * @return array Array with "operation" set to "error" and the message
 *               stored under "matches".
 */
private function errorResponse($message)
{
    $payload = ["operation" => "error"];
    $payload["matches"] = $message;

    return $payload;
}
}


Expand Down
8 changes: 6 additions & 2 deletions console/controllers/CronController.php
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,17 @@ public function actionValidateCivilId() {
$response = Yii::$app->idExpiryDateExtractor
->extractExpiryDate("photos/" . $candidate->candidate_civil_photo_front);

if ($response['operation'] == "success" ) {
if ($response['operation'] == "success" && !empty($response['matches'])) {

$date = array_pop($response['matches']);

$dateTime = $date? strtotime(str_replace("/", "-", $date)): time();
$dateTime = strtotime(str_replace("/", "-", $date));
//$date = end($response['matches']);

if (!$dateTime) {
continue;
}

/*if($candidate->candidate_civil_expiry_date &&
$dateTime <= strtotime($candidate->candidate_civil_expiry_date)) {
continue;
Expand Down
19 changes: 19 additions & 0 deletions docs/textract-civil-id-extractor.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Textract Civil ID Expiry Extractor

`common\components\IdExpiryDateExtractor` reads Civil ID images from the permanent S3 bucket and sends them to AWS Textract.

## Required runtime configuration

Set these values in the runtime secret store. Do not commit real key values.

- `AWS_TEXTRACT_ACCESS_KEY_ID`
- `AWS_TEXTRACT_SECRET_ACCESS_KEY`

By default the extractor reuses the permanent S3 bucket and region configured on `resourceManager`; setting `bucket` or `region` explicitly on the component overrides the corresponding value.

## Safety behavior

- Missing Textract credentials, region, or bucket configuration returns `operation: error` without creating an AWS client.
- Only `photos/` S3 object keys are accepted for Textract reads.
- AWS/provider exceptions are logged by exception type or AWS error code and are not returned to API callers.
- The Civil ID validation cron only updates an expiry date when Textract returns an actual date match.
35 changes: 35 additions & 0 deletions tests/check-textract-civil-id-extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]


def read(path):
    """Return the UTF-8 decoded text of ``path``, resolved relative to ``ROOT``."""
    target = ROOT / path
    return target.read_text(encoding="utf-8")


def require(condition, message):
    """Exit the script with ``message`` unless ``condition`` is truthy.

    Raises:
        SystemExit: carrying ``message`` when ``condition`` is falsy.
    """
    if condition:
        return
    raise SystemExit(message)


# Load every inspected file first; a missing file aborts before any check is evaluated.
extractor = read("common/components/IdExpiryDateExtractor.php")
cron = read("console/controllers/CronController.php")
config = read("common/config/main.php")
docs = read("docs/textract-civil-id-extractor.md")

# Each entry pairs a substring condition with the message reported when it is false.
# Order matters: the first failing entry determines the exit message.
hardening_checks = [
    # The config must mention both credential variable names.
    ("AWS_TEXTRACT_ACCESS_KEY_ID" in config, "Textract access key must stay env-backed."),
    ("AWS_TEXTRACT_SECRET_ACCESS_KEY" in config, "Textract secret key must stay env-backed."),
    # The extractor source must contain the configuration guard, the key validation,
    # and the sanitized failure message, and must not contain the raw exception message call.
    ("configurationError" in extractor, "Extractor must fail closed when config is missing."),
    ("isSafeDocumentName" in extractor, "Extractor must validate S3 object names before Textract reads."),
    ("str_starts_with($documentName, 'photos/')" in extractor, "Extractor must restrict reads to Civil ID photo keys."),
    ("$e->getMessage()" not in extractor, "Extractor must not return raw AWS/provider exception messages."),
    ("Unable to read document text." in extractor, "Extractor must return a sanitized OCR failure message."),
    # The cron source must contain the non-empty-matches condition and no `: time()` fallback.
    ('$response[\'operation\'] == "success" && !empty($response[\'matches\'])' in cron, "Cron must only update expiry dates when OCR returns date matches."),
    (": time()" not in cron, "Cron must not default missing OCR dates to the current time."),
    # The docs must describe the S3 key restriction.
    ("Only `photos/` S3 object keys" in docs, "Textract hardening docs must describe the S3 key boundary."),
]

for passed, message in hardening_checks:
    require(passed, message)

print("Textract Civil ID extractor hardening check passed.")