diff --git a/CHANGELOG.md b/CHANGELOG.md index 64f40b8..6a464c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ### Removed +## [1.4.4] - 2026-04-22 + +### Added + +- Command line flag to enable processing of a single reporting org. + +### Fixed + +- Stop datasets from unapproved reporting orgs from being processed. + ## [1.4.3] - 2026-03-10 ### Changed diff --git a/pyproject.toml b/pyproject.toml index 4e8e0cf..b7ab0e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "bulk-data-service" -version = "1.4.3" +version = "1.4.4" requires-python = ">= 3.12.6" readme = "README.md" dependencies = [ diff --git a/src/config/bds_context.py b/src/config/bds_context.py index 8763e9c..34a03c5 100644 --- a/src/config/bds_context.py +++ b/src/config/bds_context.py @@ -20,6 +20,7 @@ def __init__(self, environment: dict, logger: logging.Logger, service_factory: I self._RUN_FOR_N_DATASETS = ( int(self["run_for_n_datasets"]) if self.get("run_for_n_datasets") is not None else None ) + self._RUN_FOR_SINGLE_REPORTING_ORG = self.get("run_for_single_reporting_org", None) self._SEND_DATASET_CHECK_MESSAGES = self["SEND_DATASET_CHECK_RESULT_MESSAGES"] == "yes" self._SKIP_SAFETY = self.get("skip_safety", False) @@ -51,6 +52,10 @@ def REDOWNLOAD_FROM_NON_HEAD_SERVERS_AFTER_HOURS(self) -> int: def RUN_FOR_N_DATASETS(self) -> int | None: return self._RUN_FOR_N_DATASETS + @property + def RUN_FOR_SINGLE_REPORTING_ORG(self) -> str | None: + return self._RUN_FOR_SINGLE_REPORTING_ORG + @property def SEND_DATASET_CHECK_MESSAGES(self) -> bool: return self._SEND_DATASET_CHECK_MESSAGES diff --git a/src/dataset_registration/iati_registry_suitecrm.py b/src/dataset_registration/iati_registry_suitecrm.py index 9bfcad2..1b9f13c 100644 --- a/src/dataset_registration/iati_registry_suitecrm.py +++ b/src/dataset_registration/iati_registry_suitecrm.py @@ -21,6 +21,8 @@ def fetch_datasets_metadata( crm.fetch_access_token() + context.logger.info("Fetching all dataset metadata using the libsuitecrm library...") + filters = Filter().equal("iati_visibility", "public") suitecrm_dataset_records = [r for r in crm.get_all_records("IATI_Datasets", filters=filters)] @@ -51,6 +53,8 @@ def fetch_datasets_metadata( continue owning_org = reporting_orgs.get(uuid.UUID(record["attributes"]["iati_dataset_owner_org_id"]), None) + if context.RUN_FOR_SINGLE_REPORTING_ORG is not None and owning_org is None: + continue if owning_org is None: context.logger.error( f"SuiteCRM dataset id: {record['id']} has reporting org id: " @@ -63,6 +67,8 @@ def fetch_datasets_metadata( record, owning_org, refresh_timestamp ) + context.logger.info("Fetched metadata for {} datasets".format(len(results))) + return results @@ -72,11 +78,22 @@ def fetch_reporting_orgs_metadata(context: BDSContext, refresh_timestamp: dateti crm.fetch_access_token() - context.logger.info("Fetching all reporting orgs using the libsuitecrm library...") + context.logger.info("Fetching all reporting org metadata using the libsuitecrm library...") - filters = Filter().equal("iati_registry_discoverable", "1") + filters = Filter().equal("iati_registry_discoverable", "1").equal("iati_registry_approved", 1) suitecrm_reporting_org_records = [r for r in crm.get_all_records("Accounts", filters=filters)] + if context.RUN_FOR_SINGLE_REPORTING_ORG is not None: + suitecrm_reporting_org_records = [ + o + for o in suitecrm_reporting_org_records + if o.get("attributes", {}).get("iati_short_name", "") == context.RUN_FOR_SINGLE_REPORTING_ORG + ] + context.logger.info( + "--run-for-single-reporting-org is set so only " + f"processing reporting org '{context.RUN_FOR_SINGLE_REPORTING_ORG}'." + ) + crm.logout() results = {} diff --git a/src/iati_bulk_data_service.py b/src/iati_bulk_data_service.py index 41f4144..a0e7388 100644 --- a/src/iati_bulk_data_service.py +++ b/src/iati_bulk_data_service.py @@ -20,6 +20,7 @@ def main(args: argparse.Namespace): config = config | { "single_run": args.single_run, "run_for_n_datasets": args.run_for_n_datasets, + "run_for_single_reporting_org": args.run_for_single_reporting_org, "skip_safety": args.skip_safety, } @@ -63,6 +64,11 @@ def main(args: argparse.Namespace): type=int, help="Run on the first N datasets from registration service (useful for testing)", ) + parser.add_argument( + "--run-for-single-reporting-org", + type=str, + help="Run only for the datasets belonging to the specified reporting org (useful for testing)", + ) parser.add_argument( "--skip-safety", action="store_true", diff --git a/tests/artifacts/libsuitecrm-responses/reporting-orgs-01-four-orgs.json b/tests/artifacts/libsuitecrm-responses/reporting-orgs-01-four-orgs.json index e3fb524..07a8652 100644 --- a/tests/artifacts/libsuitecrm-responses/reporting-orgs-01-four-orgs.json +++ b/tests/artifacts/libsuitecrm-responses/reporting-orgs-01-four-orgs.json @@ -82,7 +82,7 @@ "iati_org_type": "10", "iati_hq_country": "GB", "iati_region": "89", - "iati_registry_approved": "0", + "iati_registry_approved": "1", "iati_first_publish_date": "", "iati_data_portal_url": "https://www.example.org/data-portal", "iati_exclusions_policy_url": "https://www.example.org/exclusions-policy", @@ -488,7 +488,7 @@ "iati_org_type": "15", "iati_hq_country": "GB", "iati_region": "489", - "iati_registry_approved": "0", + "iati_registry_approved": "1", "iati_first_publish_date": "", "iati_data_portal_url": "https://www.example.org/data-portal", "iati_exclusions_policy_url": "https://www.example.org/exclusions-policy", diff --git a/tests/helpers/azure_service_bus_helpers.py b/tests/helpers/azure_service_bus_helpers.py index d528c16..cff5177 100644 --- a/tests/helpers/azure_service_bus_helpers.py +++ b/tests/helpers/azure_service_bus_helpers.py @@ -1,5 +1,4 @@ import json -from datetime import datetime from uuid import UUID import pytest diff --git a/tests/helpers/data_helpers.py b/tests/helpers/data_helpers.py index a3f6d77..4ed6be4 100644 --- a/tests/helpers/data_helpers.py +++ b/tests/helpers/data_helpers.py @@ -3,8 +3,7 @@ import uuid from config.bds_context import BDSContext -from utilities.azure import get_azure_blob_public_url -from utilities.misc import dataset_has_iati_xml_download, get_object_from_json_str, get_timestamp +from utilities.misc import get_object_from_json_str, get_timestamp def check_most_recent_get_attempt_http_error(dataset: dict): @@ -220,4 +219,6 @@ def expected_values_for_dataset_registration_fields(source_url: str) -> list: def check_registration_service_refreshed_datetime(data_record: dict): assert data_record["registration_service_metadata_refreshed_datetime"] is not None - assert data_record["registration_service_metadata_refreshed_datetime"] > (get_timestamp() - datetime.timedelta(minutes=1)) + assert data_record["registration_service_metadata_refreshed_datetime"] > ( + get_timestamp() - datetime.timedelta(minutes=1) + ) diff --git a/tests/integration/test_dataset_add.py b/tests/integration/test_dataset_add.py index b21a95f..c17226f 100644 --- a/tests/integration/test_dataset_add.py +++ b/tests/integration/test_dataset_add.py @@ -52,7 +52,10 @@ def test_add_new_undownloadable_dataset(get_and_clear_up_context, source_url, ex @pytest.mark.parametrize( - "dataset_url,last_known_good_dataset_hash,last_known_good_dataset_hash_excluding_generated_timestamp,last_known_good_dataset_content_length", + ( + "dataset_url,last_known_good_dataset_hash,last_known_good_dataset_hash_excluding_generated_timestamp," + "last_known_good_dataset_content_length" + ), [ ( "http://localhost:3000/data/test_foundation_a-dataset-001.xml", diff --git a/tests/integration/test_dataset_expiry.py b/tests/integration/test_dataset_expiry.py index b6c9aa5..d90bb8c 100644 --- a/tests/integration/test_dataset_expiry.py +++ b/tests/integration/test_dataset_expiry.py @@ -25,8 +25,9 @@ def test_dataset_expiry_after_72_hours_failed_downloads(get_and_clear_up_context assert get_number_xml_files_in_working_dir(context) == 1 dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")] - dataset["last_known_good_dataset_downloaded"] = (dataset["last_known_good_dataset_downloaded"] - - timedelta(hours=max_hours + 2)) + dataset["last_known_good_dataset_downloaded"] = dataset["last_known_good_dataset_downloaded"] - timedelta( + hours=max_hours + 2 + ) context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/ckan-registration/datasets-03-1-dataset-404" checker_run(context, datasets_in_bds) diff --git a/tests/integration/test_dataset_indexing.py b/tests/integration/test_dataset_indexing.py index 3820e3f..2e39b3d 100644 --- a/tests/integration/test_dataset_indexing.py +++ b/tests/integration/test_dataset_indexing.py @@ -15,7 +15,7 @@ from helpers.helpers import download_index_from_azure, get_and_clear_up_context # noqa: F401 from utilities.azure import get_azure_container_name from utilities.db import get_reporting_orgs_in_bds -from utilities.misc import find_object_by_key, format_timestamp_as_utc_str +from utilities.misc import find_object_by_key def test_indices_uploaded_to_blob_storage(get_and_clear_up_context): # noqa: F811 diff --git a/tests/integration/test_dataset_registration.py b/tests/integration/test_dataset_registration.py index 0038c3c..8cf719d 100644 --- a/tests/integration/test_dataset_registration.py +++ b/tests/integration/test_dataset_registration.py @@ -9,7 +9,7 @@ from dataset_registration.iati_registry_ckan import get_publisher_metadata_as_str from dataset_registration.registration_proxy import fetch_datasets_metadata, fetch_reporting_orgs_metadata from helpers.helpers import get_and_clear_up_context # noqa: F401 -from utilities.misc import find_object_by_key, get_timestamp +from utilities.misc import get_timestamp @pytest.mark.parametrize("http_status_code", ["400", "404", "500"]) @@ -139,7 +139,7 @@ def test_suitecrm_registry_conversion_of_registry_reporting_orgs(get_and_clear_u assert ro_1["default_licence_id"] == "gpl-3.0" assert ro_1["description"] == "Eaque eaque nostrum quia illum ipsum." assert ro_1["exclusions_policy_url"] == "https://www.example.org/exclusions-policy" - assert ro_1["first_publication_date"] == None + assert ro_1["first_publication_date"] is None assert ro_1["hq_country"] == "GB" assert ro_1["human_readable_name"] == "Gov Agency 1234" assert ro_1["organisation_identifier"] == "GOV-AGENCY-AID-1234" diff --git a/tests/integration/test_dataset_update.py b/tests/integration/test_dataset_update.py index a96e903..feb93ae 100644 --- a/tests/integration/test_dataset_update.py +++ b/tests/integration/test_dataset_update.py @@ -362,4 +362,4 @@ def test_dataset_successful_twice_after_url_change(get_and_clear_up_context): # # run again checker_run(context, datasets_in_bds) - check_last_known_good_dataset_values_are_set(datasets_in_bds[dataset_id]) \ No newline at end of file + check_last_known_good_dataset_values_are_set(datasets_in_bds[dataset_id]) diff --git a/tests/integration/test_db.py b/tests/integration/test_db.py index 5b601ea..487cf34 100644 --- a/tests/integration/test_db.py +++ b/tests/integration/test_db.py @@ -12,7 +12,7 @@ ) -def test_save_reporting_org_db_record(get_and_clear_up_context): +def test_save_reporting_org_db_record(get_and_clear_up_context): # noqa: F811 context = get_and_clear_up_context @@ -51,7 +51,7 @@ def test_save_reporting_org_db_record(get_and_clear_up_context): assert reporting_org_from_db == reporting_org -def test_save_dataset_db_record(get_and_clear_up_context): +def test_save_dataset_db_record(get_and_clear_up_context): # noqa: F811 context = get_and_clear_up_context diff --git a/tests/integration/test_mq_registry_dataset_changes.py b/tests/integration/test_mq_registry_dataset_changes.py index 73bb1bf..b6ed6f6 100644 --- a/tests/integration/test_mq_registry_dataset_changes.py +++ b/tests/integration/test_mq_registry_dataset_changes.py @@ -48,7 +48,6 @@ async def test_dataset_created_message_01_success(get_and_clear_up_context, serv check_registration_service_refreshed_datetime(datasets_in_bds[dataset_id]) - @pytest.mark.asyncio async def test_dataset_created_message_02_error_dataset_already_exists( get_and_clear_up_context, service_bus_context # noqa: F811 diff --git a/tests/integration/test_zip_creation.py b/tests/integration/test_zip_creation.py index a84c48d..02217b7 100644 --- a/tests/integration/test_zip_creation.py +++ b/tests/integration/test_zip_creation.py @@ -2,7 +2,6 @@ import os import zipfile -import pytest import requests from bulk_data_service.checker import checker_run @@ -18,9 +17,14 @@ def test_dataset_saved_for_download_success(get_and_clear_up_context): # noqa: run_checker_then_zipper_once(context) assert get_number_xml_files_in_working_dir(context) == 1 - assert os.path.exists("{}{}".format( - context["ZIP_WORKING_DIR"], - "/iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml")) is True + assert ( + os.path.exists( + "{}{}".format( + context["ZIP_WORKING_DIR"], "/iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" + ) + ) + is True + ) def test_dataset_not_saved_for_download_fail_and_no_cache(get_and_clear_up_context): # noqa: F811 @@ -39,9 +43,14 @@ def test_dataset_saved_for_download_fail_but_cached(get_and_clear_up_context): run_checker_then_zipper_download_fail_but_cached(context) assert get_number_xml_files_in_working_dir(context) == 1 - assert os.path.exists("{}{}".format( - context["ZIP_WORKING_DIR"], - "/iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml")) is True + assert ( + os.path.exists( + "{}{}".format( + context["ZIP_WORKING_DIR"], "/iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" + ) + ) + is True + ) def test_publisher_metadata_saved_for_failed_metadata_dl(get_and_clear_up_context): # noqa: F811 @@ -106,14 +115,21 @@ def test_dataset_metadata_content_for_successful_metadata_dl(get_and_clear_up_co download_and_unpack_zip_to_tmp_unpack_folder(context, "code-for-iati-data-download.zip") - with open(context["TEST_TMP_ZIP_UNPACK"] + "/iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001-newname.json", "r") as f: + with open( + context["TEST_TMP_ZIP_UNPACK"] + + "/iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001-newname.json", + "r", + ) as f: assert f.read() == json.dumps( { "id": "c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159", "license_id": "uk-ogl", "license_title": "UK Open Government Licence (OGL)", "name": "test_foundation_a-dataset-001-newname", - "organization": {"id": "ea055d99-f7e9-456f-9f99-963e95493c1b", "name": "test_foundation_a", }, + "organization": { + "id": "ea055d99-f7e9-456f-9f99-963e95493c1b", + "name": "test_foundation_a", + }, "resources": [{"url": "http://localhost:3000/not_found"}], "extras": [], "tags": [], @@ -134,7 +150,9 @@ def test_bds_zip_content_for_download_success(get_and_clear_up_context): # noqa assert file_found_in_extracted_zip(context, "iati-data/datasets-minimal.json") assert file_found_in_extracted_zip(context, "iati-data/datasets-full.json") assert file_found_in_extracted_zip(context, "iati-data/reporting-orgs.json") - assert file_found_in_extracted_zip(context, "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml") + assert file_found_in_extracted_zip( + context, "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" + ) def test_bds_zip_content_for_download_success_dataset_updated_meta(get_and_clear_up_context, tmp_path): # noqa: F811 @@ -161,10 +179,14 @@ def test_bds_zip_content_for_download_success_dataset_updated_meta(get_and_clear assert file_found_in_extracted_zip(context, "iati-data/datasets-minimal.json") assert file_found_in_extracted_zip(context, "iati-data/datasets-full.json") assert file_found_in_extracted_zip(context, "iati-data/reporting-orgs.json") - assert file_found_in_extracted_zip(context, "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001-newname.xml") + assert file_found_in_extracted_zip( + context, "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001-newname.xml" + ) # The dataset as it was originally named should not be found - assert not file_found_in_extracted_zip(context, "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml") + assert not file_found_in_extracted_zip( + context, "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" + ) def test_bds_zip_content_for_download_success_dataset_updated_content(get_and_clear_up_context): # noqa: F811 @@ -193,7 +215,12 @@ def test_bds_zip_content_for_download_success_dataset_updated_content(get_and_cl assert file_found_in_extracted_zip(context, "iati-data/datasets-minimal.json") - with open(os.path.join(context["TEST_TMP_ZIP_UNPACK"], "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml"), "rb") as f: + with open( + os.path.join( + context["TEST_TMP_ZIP_UNPACK"], "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" + ), + "rb", + ) as f: contents_from_zip = f.read() with open("tests/artifacts/iati-xml-files/test_foundation_a-dataset-001-updated.xml", "rb") as f: @@ -213,7 +240,9 @@ def test_bds_zip_content_for_download_fail_but_cached(get_and_clear_up_context): assert file_found_in_extracted_zip(context, "iati-data/datasets-minimal.json") assert file_found_in_extracted_zip(context, "iati-data/datasets-full.json") assert file_found_in_extracted_zip(context, "iati-data/reporting-orgs.json") - assert file_found_in_extracted_zip(context, "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml") + assert file_found_in_extracted_zip( + context, "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" + ) def test_bds_zip_content_for_download_fail_no_cached(get_and_clear_up_context): # noqa: F811 @@ -227,7 +256,9 @@ def test_bds_zip_content_for_download_fail_no_cached(get_and_clear_up_context): assert file_found_in_extracted_zip(context, "iati-data/datasets-minimal.json") assert file_found_in_extracted_zip(context, "iati-data/datasets-full.json") assert file_found_in_extracted_zip(context, "iati-data/reporting-orgs.json") - assert not file_found_in_extracted_zip(context, "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml") + assert not file_found_in_extracted_zip( + context, "iati-data/datasets/test_foundation_a/test_foundation_a-dataset-001.xml" + ) def test_codeforiati_zip_content_for_download_success(get_and_clear_up_context): # noqa: F811 @@ -242,9 +273,13 @@ def test_codeforiati_zip_content_for_download_success(get_and_clear_up_context): assert not file_found_in_extracted_zip(context, "iati-data/datasets-full.json") assert not file_found_in_extracted_zip(context, "iati-data/reporting-orgs.json") assert file_found_in_extracted_zip(context, "iati-data-main/metadata.json") - assert file_found_in_extracted_zip(context, "iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml") + assert file_found_in_extracted_zip( + context, "iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml" + ) assert file_found_in_extracted_zip(context, "iati-data-main/metadata/test_foundation_a.json") - assert file_found_in_extracted_zip(context, "iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json") + assert file_found_in_extracted_zip( + context, "iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json" + ) def test_codeforiati_zip_content_for_download_fail_but_cached(get_and_clear_up_context): # noqa: F811 @@ -259,9 +294,13 @@ def test_codeforiati_zip_content_for_download_fail_but_cached(get_and_clear_up_c assert not file_found_in_extracted_zip(context, "iati-data/datasets-full.json") assert not file_found_in_extracted_zip(context, "iati-data/reporting-orgs.json") assert file_found_in_extracted_zip(context, "iati-data-main/metadata.json") - assert file_found_in_extracted_zip(context, "iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml") + assert file_found_in_extracted_zip( + context, "iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml" + ) assert file_found_in_extracted_zip(context, "iati-data-main/metadata/test_foundation_a.json") - assert file_found_in_extracted_zip(context, "iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json") + assert file_found_in_extracted_zip( + context, "iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json" + ) def test_codeforiati_zip_content_for_download_fail_no_cached(get_and_clear_up_context): # noqa: F811 @@ -276,11 +315,23 @@ def test_codeforiati_zip_content_for_download_fail_no_cached(get_and_clear_up_co assert not file_found_in_extracted_zip(context, "iati-data/datasets-full.json") assert not file_found_in_extracted_zip(context, "iati-data/reporting-orgs.json") assert file_found_in_extracted_zip(context, "iati-data-main/metadata.json") - assert file_found_in_extracted_zip(context, "iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml") + assert file_found_in_extracted_zip( + context, "iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml" + ) assert file_found_in_extracted_zip(context, "iati-data-main/metadata/test_foundation_a.json") - assert file_found_in_extracted_zip(context, "iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json") + assert file_found_in_extracted_zip( + context, "iati-data-main/metadata/test_foundation_a/test_foundation_a-dataset-001.json" + ) - assert os.path.getsize(os.path.join(context["TEST_TMP_ZIP_UNPACK"], "iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml")) == 0 + assert ( + os.path.getsize( + os.path.join( + context["TEST_TMP_ZIP_UNPACK"], + "iati-data-main/data/test_foundation_a/test_foundation_a-dataset-001.xml", + ) + ) + == 0 + ) def run_checker_then_zipper(context, registry_url: str, datasets_in_bds: dict, datasets_in_zip: dict):