Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

### Removed

## [1.4.4] - 2026-04-22

### Added

- Command-line flag `--run-for-single-reporting-org` to restrict processing to a single reporting org.

### Fixed

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a changelog entry for the new command-line argument?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

- Datasets belonging to unapproved reporting orgs are no longer processed.

## [1.4.3] - 2026-03-10

### Changed
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "bulk-data-service"
version = "1.4.3"
version = "1.4.4"
requires-python = ">= 3.12.6"
readme = "README.md"
dependencies = [
Expand Down
5 changes: 5 additions & 0 deletions src/config/bds_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def __init__(self, environment: dict, logger: logging.Logger, service_factory: I
self._RUN_FOR_N_DATASETS = (
int(self["run_for_n_datasets"]) if self.get("run_for_n_datasets") is not None else None
)
self._RUN_FOR_SINGLE_REPORTING_ORG = self.get("run_for_single_reporting_org", None)
self._SEND_DATASET_CHECK_MESSAGES = self["SEND_DATASET_CHECK_RESULT_MESSAGES"] == "yes"
self._SKIP_SAFETY = self.get("skip_safety", False)

Expand Down Expand Up @@ -51,6 +52,10 @@ def REDOWNLOAD_FROM_NON_HEAD_SERVERS_AFTER_HOURS(self) -> int:
def RUN_FOR_N_DATASETS(self) -> int | None:
return self._RUN_FOR_N_DATASETS

@property
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not make it a bool? Similar remark on line 23?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it should be a string - it contains the short name of the reporting org.

def RUN_FOR_SINGLE_REPORTING_ORG(self) -> str | None:
return self._RUN_FOR_SINGLE_REPORTING_ORG

@property
def SEND_DATASET_CHECK_MESSAGES(self) -> bool:
return self._SEND_DATASET_CHECK_MESSAGES
Expand Down
21 changes: 19 additions & 2 deletions src/dataset_registration/iati_registry_suitecrm.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ def fetch_datasets_metadata(

crm.fetch_access_token()

context.logger.info("Fetching all dataset metadata using the libsuitecrm library...")

filters = Filter().equal("iati_visibility", "public")

suitecrm_dataset_records = [r for r in crm.get_all_records("IATI_Datasets", filters=filters)]
Expand Down Expand Up @@ -51,6 +53,8 @@ def fetch_datasets_metadata(
continue

owning_org = reporting_orgs.get(uuid.UUID(record["attributes"]["iati_dataset_owner_org_id"]), None)
if context.RUN_FOR_SINGLE_REPORTING_ORG is not None and owning_org is None:
continue
if owning_org is None:
context.logger.error(
f"SuiteCRM dataset id: {record['id']} has reporting org id: "
Expand All @@ -63,6 +67,8 @@ def fetch_datasets_metadata(
record, owning_org, refresh_timestamp
)

context.logger.info("Fetched metadata for {} datasets".format(len(results)))

return results


Expand All @@ -72,11 +78,22 @@ def fetch_reporting_orgs_metadata(context: BDSContext, refresh_timestamp: dateti

crm.fetch_access_token()

context.logger.info("Fetching all reporting orgs using the libsuitecrm library...")
context.logger.info("Fetching all reporting org metadata using the libsuitecrm library...")

filters = Filter().equal("iati_registry_discoverable", "1")
filters = Filter().equal("iati_registry_discoverable", "1").equal("iati_registry_approved", 1)
suitecrm_reporting_org_records = [r for r in crm.get_all_records("Accounts", filters=filters)]

if context.RUN_FOR_SINGLE_REPORTING_ORG is not None:
suitecrm_reporting_org_records = [
o
for o in suitecrm_reporting_org_records
if o.get("attributes", {}).get("iati_short_name", "") == context.RUN_FOR_SINGLE_REPORTING_ORG
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The type hint is for this to be an integer - should it be a string? It's a string in the command line arguments.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, thanks!

]
context.logger.info(
"--run-for-single-reporting-org is set so only "
f"processing reporting org '{context.RUN_FOR_SINGLE_REPORTING_ORG}'."
)

crm.logout()

results = {}
Expand Down
6 changes: 6 additions & 0 deletions src/iati_bulk_data_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def main(args: argparse.Namespace):
config = config | {
"single_run": args.single_run,
"run_for_n_datasets": args.run_for_n_datasets,
"run_for_single_reporting_org": args.run_for_single_reporting_org,
"skip_safety": args.skip_safety,
}

Expand Down Expand Up @@ -63,6 +64,11 @@ def main(args: argparse.Namespace):
type=int,
help="Run on the first N datasets from registration service (useful for testing)",
)
parser.add_argument(
"--run-for-single-reporting-org",
type=str,
help="Run only for the datasets belonging to the specified reporting org (useful for testing)",
)
parser.add_argument(
"--skip-safety",
action="store_true",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
"iati_org_type": "10",
"iati_hq_country": "GB",
"iati_region": "89",
"iati_registry_approved": "0",
"iati_registry_approved": "1",
"iati_first_publish_date": "",
"iati_data_portal_url": "https://www.example.org/data-portal",
"iati_exclusions_policy_url": "https://www.example.org/exclusions-policy",
Expand Down Expand Up @@ -488,7 +488,7 @@
"iati_org_type": "15",
"iati_hq_country": "GB",
"iati_region": "489",
"iati_registry_approved": "0",
"iati_registry_approved": "1",
"iati_first_publish_date": "",
"iati_data_portal_url": "https://www.example.org/data-portal",
"iati_exclusions_policy_url": "https://www.example.org/exclusions-policy",
Expand Down
1 change: 0 additions & 1 deletion tests/helpers/azure_service_bus_helpers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import json
from datetime import datetime
from uuid import UUID

import pytest
Expand Down
7 changes: 4 additions & 3 deletions tests/helpers/data_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
import uuid

from config.bds_context import BDSContext
from utilities.azure import get_azure_blob_public_url
from utilities.misc import dataset_has_iati_xml_download, get_object_from_json_str, get_timestamp
from utilities.misc import get_object_from_json_str, get_timestamp


def check_most_recent_get_attempt_http_error(dataset: dict):
Expand Down Expand Up @@ -220,4 +219,6 @@ def expected_values_for_dataset_registration_fields(source_url: str) -> list:

def check_registration_service_refreshed_datetime(data_record: dict):
assert data_record["registration_service_metadata_refreshed_datetime"] is not None
assert data_record["registration_service_metadata_refreshed_datetime"] > (get_timestamp() - datetime.timedelta(minutes=1))
assert data_record["registration_service_metadata_refreshed_datetime"] > (
get_timestamp() - datetime.timedelta(minutes=1)
)
5 changes: 4 additions & 1 deletion tests/integration/test_dataset_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,10 @@ def test_add_new_undownloadable_dataset(get_and_clear_up_context, source_url, ex


@pytest.mark.parametrize(
"dataset_url,last_known_good_dataset_hash,last_known_good_dataset_hash_excluding_generated_timestamp,last_known_good_dataset_content_length",
(
"dataset_url,last_known_good_dataset_hash,last_known_good_dataset_hash_excluding_generated_timestamp,"
"last_known_good_dataset_content_length"
),
[
(
"http://localhost:3000/data/test_foundation_a-dataset-001.xml",
Expand Down
5 changes: 3 additions & 2 deletions tests/integration/test_dataset_expiry.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ def test_dataset_expiry_after_72_hours_failed_downloads(get_and_clear_up_context
assert get_number_xml_files_in_working_dir(context) == 1

dataset = datasets_in_bds[uuid.UUID("c8a40aa5-9f31-4bcf-a36f-51c1fc2cc159")]
dataset["last_known_good_dataset_downloaded"] = (dataset["last_known_good_dataset_downloaded"]
- timedelta(hours=max_hours + 2))
dataset["last_known_good_dataset_downloaded"] = dataset["last_known_good_dataset_downloaded"] - timedelta(
hours=max_hours + 2
)

context["DATA_REGISTRY_BASE_URL"] = "http://localhost:3000/ckan-registration/datasets-03-1-dataset-404"
checker_run(context, datasets_in_bds)
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/test_dataset_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from helpers.helpers import download_index_from_azure, get_and_clear_up_context # noqa: F401
from utilities.azure import get_azure_container_name
from utilities.db import get_reporting_orgs_in_bds
from utilities.misc import find_object_by_key, format_timestamp_as_utc_str
from utilities.misc import find_object_by_key


def test_indices_uploaded_to_blob_storage(get_and_clear_up_context): # noqa: F811
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/test_dataset_registration.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from dataset_registration.iati_registry_ckan import get_publisher_metadata_as_str
from dataset_registration.registration_proxy import fetch_datasets_metadata, fetch_reporting_orgs_metadata
from helpers.helpers import get_and_clear_up_context # noqa: F401
from utilities.misc import find_object_by_key, get_timestamp
from utilities.misc import get_timestamp


@pytest.mark.parametrize("http_status_code", ["400", "404", "500"])
Expand Down Expand Up @@ -139,7 +139,7 @@ def test_suitecrm_registry_conversion_of_registry_reporting_orgs(get_and_clear_u
assert ro_1["default_licence_id"] == "gpl-3.0"
assert ro_1["description"] == "Eaque eaque nostrum quia illum ipsum."
assert ro_1["exclusions_policy_url"] == "https://www.example.org/exclusions-policy"
assert ro_1["first_publication_date"] == None
assert ro_1["first_publication_date"] is None
assert ro_1["hq_country"] == "GB"
assert ro_1["human_readable_name"] == "Gov Agency 1234"
assert ro_1["organisation_identifier"] == "GOV-AGENCY-AID-1234"
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/test_dataset_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,4 +362,4 @@ def test_dataset_successful_twice_after_url_change(get_and_clear_up_context): #
# run again
checker_run(context, datasets_in_bds)

check_last_known_good_dataset_values_are_set(datasets_in_bds[dataset_id])
check_last_known_good_dataset_values_are_set(datasets_in_bds[dataset_id])
4 changes: 2 additions & 2 deletions tests/integration/test_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
)


def test_save_reporting_org_db_record(get_and_clear_up_context):
def test_save_reporting_org_db_record(get_and_clear_up_context): # noqa: F811

context = get_and_clear_up_context

Expand Down Expand Up @@ -51,7 +51,7 @@ def test_save_reporting_org_db_record(get_and_clear_up_context):
assert reporting_org_from_db == reporting_org


def test_save_dataset_db_record(get_and_clear_up_context):
def test_save_dataset_db_record(get_and_clear_up_context): # noqa: F811

context = get_and_clear_up_context

Expand Down
1 change: 0 additions & 1 deletion tests/integration/test_mq_registry_dataset_changes.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ async def test_dataset_created_message_01_success(get_and_clear_up_context, serv
check_registration_service_refreshed_datetime(datasets_in_bds[dataset_id])



@pytest.mark.asyncio
async def test_dataset_created_message_02_error_dataset_already_exists(
get_and_clear_up_context, service_bus_context # noqa: F811
Expand Down
Loading
Loading