diff --git a/.github/workflows/build-and-deploy-job.yml b/.github/workflows/build-and-deploy-job.yml index 739883a..11add6f 100644 --- a/.github/workflows/build-and-deploy-job.yml +++ b/.github/workflows/build-and-deploy-job.yml @@ -87,42 +87,41 @@ jobs: - name: 'Replace Env Vars and Secrets in ARM Yaml template' env: - # Credentials for the app's resources + # Secrets / credentials for the app's resources + AZURE_SERVICE_BUS_CONNECTION_STRING: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_CONNECTION_STRING')] }} AZURE_STORAGE_CONNECTION_STRING: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_CONNECTION_STRING')] }} + DB_CONNECTION_TIMEOUT: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_CONNECTION_TIMEOUT')] }} DB_HOST: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_HOST')] }} - DB_USER: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_USER')] }} - DB_PASS: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_PASS')] }} DB_NAME: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_NAME')] }} + DB_PASS: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_PASS')] }} DB_PORT: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_PORT')] }} DB_SSL_MODE: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_SSL_MODE')] }} - DB_CONNECTION_TIMEOUT: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_CONNECTION_TIMEOUT')] }} + DB_USER: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DB_USER')] }} LOG_WORKSPACE_ID: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'LOG_WORKSPACE_ID')] }} LOG_WORKSPACE_KEY: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'LOG_WORKSPACE_KEY')] }} # Variables which configure the app + AZURE_SERVICE_BUS_REGISTRY_SUB_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_REGISTRY_SUB_NAME')] }} + AZURE_SERVICE_BUS_REGISTRY_TOPIC_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_REGISTRY_TOPIC_NAME')] }} + AZURE_SERVICE_BUS_WAIT_TIME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_WAIT_TIME')] }} + AZURE_STORAGE_BLOB_CONTAINER_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME')] }} + CHECKER_LOOP_WAIT_MINS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'CHECKER_LOOP_WAIT_MINS')] }} + DATASET_GET_TIMEOUT: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATASET_GET_TIMEOUT')] }} + DATASET_HEAD_TIMEOUT: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATASET_HEAD_TIMEOUT')] }} DATA_REGISTRATION: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRATION')] }} DATA_REGISTRY_BASE_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_BASE_URL')] }} - DATA_REGISTRY_PUBLISHER_PLAIN_LIST_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_PLAIN_LIST_URL')] }} - DATA_REGISTRY_PUBLISHER_METADATA_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_METADATA_URL')] }} DATA_REGISTRY_PUBLISHER_METADATA_BATCH_SIZE: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_METADATA_BATCH_SIZE')] }} - WEB_BASE_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'WEB_BASE_URL')] }} - NUMBER_DOWNLOADER_THREADS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'NUMBER_DOWNLOADER_THREADS')] }} + DATA_REGISTRY_PUBLISHER_METADATA_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_METADATA_URL')] }} + DATA_REGISTRY_PUBLISHER_PLAIN_LIST_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATA_REGISTRY_PUBLISHER_PLAIN_LIST_URL')] }} FORCE_REDOWNLOAD_AFTER_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'FORCE_REDOWNLOAD_AFTER_HOURS')] }} + NUMBER_DOWNLOADER_THREADS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'NUMBER_DOWNLOADER_THREADS')] }} REDOWNLOAD_FROM_NON_HEAD_SERVERS_AFTER_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'REDOWNLOAD_FROM_NON_HEAD_SERVERS_AFTER_HOURS')] }} - DATASET_HEAD_TIMEOUT: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATASET_HEAD_TIMEOUT')] }} - DATASET_GET_TIMEOUT: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'DATASET_GET_TIMEOUT')] }} - SEND_DATASET_CHECK_RESULT_MESSAGES: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'SEND_DATASET_CHECK_RESULT_MESSAGES')] }} REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'REMOVE_LAST_GOOD_DOWNLOAD_AFTER_FAILING_HOURS')] }} + SEND_DATASET_CHECK_RESULT_MESSAGES: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'SEND_DATASET_CHECK_RESULT_MESSAGES')] }} + WEB_BASE_URL: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'WEB_BASE_URL')] }} ZIP_WORKING_DIR: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'ZIP_WORKING_DIR')] }} - AZURE_STORAGE_BLOB_CONTAINER_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_STORAGE_BLOB_CONTAINER_NAME')] }} - CHECKER_LOOP_WAIT_MINS: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'CHECKER_LOOP_WAIT_MINS')] }} - - AZURE_SERVICE_BUS_CONNECTION_STRING: ${{ secrets[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_CONNECTION_STRING')] }} - AZURE_SERVICE_BUS_REGISTRY_TOPIC_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_REGISTRY_TOPIC_NAME')] }} - AZURE_SERVICE_BUS_REGISTRY_SUB_NAME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_REGISTRY_SUB_NAME')] }} - AZURE_SERVICE_BUS_WAIT_TIME: ${{ vars[format('{0}_{1}', env.TARGET_ENVIRONMENT_UPPER, 'AZURE_SERVICE_BUS_WAIT_TIME')] }} run: | ./azure-deployment/generate-manifest-from-template.sh diff --git a/.vscode/launch.json b/.vscode/launch.json index 54e0a60..2463444 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -5,7 +5,7 @@ "version": "0.2.0", "configurations": [ { - "name": "Python Debugger: Bulk Data Service - Checker - Single Run", + "name": "Bulk Data Service - Checker - Single Run", "type": "debugpy", "request": "launch", "program": "src/iati_bulk_data_service.py", @@ -20,7 +20,7 @@ "envFile": "${workspaceFolder}/.env" }, { - "name": "Python Debugger: Bulk Data Service - Zipper - Single Run", + "name": "Bulk Data Service - Zipper - Single Run", "type": "debugpy", "request": "launch", "program": "src/iati_bulk_data_service.py", @@ -33,7 +33,21 @@ "envFile": "${workspaceFolder}/.env" }, { - "name": "Python Debugger: Bulk Data Service - Checker & Zipper Loop", + "name": "Bulk Data Service - Checker & Zipper Loop (15 datasets)", + "type": "debugpy", + "request": "launch", + "program": "src/iati_bulk_data_service.py", + "args": [ + "--operation", + "checker", + "--run-for-n-datasets", + "15" + ], + "console": "integratedTerminal", + "envFile": "${workspaceFolder}/.env" + }, + { + "name": "Bulk Data Service - Checker & Zipper Loop (75 datasets)", "type": "debugpy", "request": "launch", "program": "src/iati_bulk_data_service.py", @@ -47,7 +61,19 @@ "envFile": "${workspaceFolder}/.env" }, { - "name": "Python Debugger: Bulk Data Service - Registry Changes Processor", + "name": "Bulk Data Service - Checker & Zipper Loop", + "type": "debugpy", + "request": "launch", + "program": "src/iati_bulk_data_service.py", + "args": [ + "--operation", + "checker" + ], + "console": "integratedTerminal", + "envFile": "${workspaceFolder}/.env" + }, + { + "name": "Bulk Data Service - Registry Changes Processor", "type": "debugpy", "request": "launch", "program": "src/iati_bulk_data_service.py", diff --git a/pyproject.toml b/pyproject.toml index 5c2f8f6..7bc6ac1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "bulk-data-service" -version = "1.3.2" +version = "1.3.3" requires-python = ">= 3.12.6" readme = "README.md" dependencies = [ diff --git a/src/bulk_data_service/dataset_updater.py b/src/bulk_data_service/dataset_updater.py index ee2fb06..d774eb9 100644 --- a/src/bulk_data_service/dataset_updater.py +++ b/src/bulk_data_service/dataset_updater.py @@ -16,7 +16,7 @@ update_dataset_http_attempt_fields_as_success, ) from config.bds_context import BDSContext -from utilities.azure import azure_upload_to_blob_and_verify, send_dataset_check_result_message +from utilities.azure import azure_get_blob_etag, azure_upload_to_blob_and_verify, send_dataset_check_result_message from utilities.db import get_db_connection, insert_or_update_dataset from utilities.http import ( determine_response_encoding, @@ -354,20 +354,29 @@ def download_and_save_dataset( return hash = get_hash_of_bytes(download_response.content) + hash_excluding_generated = get_hash_excluding_generated_timestamp(download_response.text, encoding) # type: ignore - if hash == bds_dataset["last_known_good_dataset_hash"]: + xml_blob_name = "{}/{}.xml".format(bds_dataset["reporting_org_short_name"], bds_dataset["short_name"]) + + zip_blob_name = "{}/{}.zip".format(bds_dataset["reporting_org_short_name"], bds_dataset["short_name"]) + + xml_blob_etag = azure_get_blob_etag(context, az_blob_service, xml_blob_name) + + zip_blob_etag = azure_get_blob_etag(context, az_blob_service, zip_blob_name) + + if ( + hash == bds_dataset["last_known_good_dataset_hash"] + and xml_blob_etag == bds_dataset["last_known_good_dataset_cached_dataset_xml_etag"] + and zip_blob_etag == bds_dataset["last_known_good_dataset_cached_dataset_zip_etag"] + ): context.logger.info( - "dataset id: {} - Hash of download is identical to " - "previous value, so not re-zipping and re-uploading to Azure".format(bds_dataset["id"]) + "dataset id: {} - Hash and Azure blob etags are the same" + ", so not re-zipping and re-uploading to Azure".format(bds_dataset["id"]) ) else: iati_xml_zipped = zip_data_as_single_file(bds_dataset["short_name"] + ".xml", download_response.content) - xml_blob_name = "{}/{}.xml".format(bds_dataset["reporting_org_short_name"], bds_dataset["short_name"]) - - zip_blob_name = "{}/{}.zip".format(bds_dataset["reporting_org_short_name"], bds_dataset["short_name"]) - cached_xml_url, cached_xml_etag = azure_upload_to_blob_and_verify( context, bds_dataset, diff --git a/src/utilities/azure.py b/src/utilities/azure.py index 650a9ae..4c17f4e 100644 --- a/src/utilities/azure.py +++ b/src/utilities/azure.py @@ -12,8 +12,10 @@ def azure_blob_exists(az_blob_service: BlobServiceClient, container_name: str, blob_name: str) -> bool: - blob_client = az_blob_service.get_blob_client(container_name, blob_name) - return blob_client.exists() + exists = False + with az_blob_service.get_blob_client(container_name, blob_name) as blob_client: + exists = blob_client.exists() + return exists def azure_download_blob(az_blob_service: BlobServiceClient, container_name: str, blob_name: str, filename: str): @@ -27,6 +29,20 @@ def azure_download_blob(az_blob_service: BlobServiceClient, container_name: str, blob_client.close() +def azure_get_blob_etag( + context: BDSContext, + az_blob_service: BlobServiceClient, + blob_name: str, +) -> str: + etag = "" + + with az_blob_service.get_blob_client(context["AZURE_STORAGE_BLOB_CONTAINER_NAME"], blob_name) as blob_client: + if blob_client.exists(): + etag = blob_client.get_blob_properties().etag + + return etag + + def azure_upload_to_blob_and_verify( context: BDSContext, bds_dataset: dict,