Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion dataload/import_to_elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,10 @@ def maybe_create_index(index_name=ES_INDEX):
},
"GNRecipientOrgCountryName": {
"type": "keyword",
}
},
"GNGeoCategory": {
"type": "keyword",
},
}
},
# Additional funding/recipient organisation mappings
Expand Down Expand Up @@ -724,6 +727,8 @@ def process_grant(grant, grants_file_path):
update_doc_with_other_locations(grant)
# update_doc_with_undetermined needs to go last
update_doc_with_undetermined(grant)
# Categorize geographic status
update_doc_with_geo_category(grant)


def process_grant_file_process(process_queue,
Expand Down Expand Up @@ -1022,6 +1027,41 @@ def update_doc_with_undetermined(grant):
grant["additional_data"][key] = "Undetermined"


def update_doc_with_geo_category(grant):
    """Categorize a grant by geographic determination status.

    Reads ``GNBestCountryName``, ``GNBeneficiaryCountryName`` and
    ``GNRecipientOrgCountryName`` from ``grant["additional_data"]`` and
    stores the outcome in ``grant["additional_data"]["GNGeoCategory"]``:

    - "International": at least one field names a country other than
      ``UNITED_KINGDOM_ISO_NM`` (this wins even when the remaining fields
      are UK or undetermined).
    - "UK": no field is international and at least one names the UK.
    - "Undetermined": every field is missing, empty or "Undetermined",
      i.e. there is truly no geographic location data.

    The grant is modified in place; nothing is returned.
    """
    additional_data = grant["additional_data"]

    # Missing or empty values carry no location information, so they are
    # discarded together with the explicit "Undetermined" marker. Without
    # this, a grant lacking one of the keys would fall through to "UK".
    known_countries = [
        country
        for country in (
            additional_data.get("GNBestCountryName"),
            additional_data.get("GNBeneficiaryCountryName"),
            additional_data.get("GNRecipientOrgCountryName"),
        )
        if country and country != "Undetermined"
    ]

    if not known_countries:
        category = "Undetermined"
    elif any(country != UNITED_KINGDOM_ISO_NM for country in known_countries):
        category = "International"
    else:
        # Every known country is the UK.
        category = "UK"

    additional_data["GNGeoCategory"] = category


def update_doc_with_first_recipient_org_info(grant):
grant["additional_data"]["GNRecipientOrgInfo0"] = {}

Expand Down
14 changes: 14 additions & 0 deletions grantnav/frontend/search_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,12 @@ def get_clear_all(request, context, json_query, basic_filter, create_parameters_
context["results"]["clear_all_facet_url"] = request.path + "?" + create_parameters_from_json_query(json_query)


def ensure_filter_list_length(json_query, bool_index):
    """Pad the query's filter list so that ``bool_index`` is a valid index.

    The list at ``json_query["query"]["bool"]["filter"]`` is extended in
    place with empty ``{"bool": {"should": []}}`` clauses until it holds at
    least ``bool_index + 1`` entries. A list that is already long enough is
    left untouched. Nothing is returned.
    """
    filter_clauses = json_query["query"]["bool"]["filter"]
    shortfall = bool_index + 1 - len(filter_clauses)
    # range() of a non-positive number is empty, so no padding happens when
    # the list already reaches bool_index. Each placeholder is a fresh dict
    # so later mutation of one slot cannot leak into another.
    filter_clauses.extend({"bool": {"should": []}} for _ in range(shortfall))


def get_terms_facets(
request,
context,
Expand All @@ -233,6 +239,9 @@ def get_terms_facets(
path = request.path

json_query = copy.deepcopy(json_query)

ensure_filter_list_length(json_query, bool_index)

try:
if "must_not" in json_query["query"]["bool"]["filter"][bool_index]["bool"]:
bool_condition = "must_not"
Expand Down Expand Up @@ -318,6 +327,8 @@ def term_facet_from_parameters(request, json_query, field_name, param_name, bool
for value in request.GET.getlist(param_name):
new_filter.append({"term": {field_name: value}})

ensure_filter_list_length(json_query, bool_index)

if request.GET.get("exclude_" + param_name):
json_query["query"]["bool"]["filter"][bool_index]["bool"].pop("should", None)
json_query["query"]["bool"]["filter"][bool_index]["bool"]["must_not"] = new_filter
Expand All @@ -327,6 +338,9 @@ def term_facet_from_parameters(request, json_query, field_name, param_name, bool

def term_parameters_from_json_query(parameters, json_query, field_name, param_name, bool_index, field, is_json=False):
values = []

ensure_filter_list_length(json_query, bool_index)

if "must_not" in json_query["query"]["bool"]["filter"][bool_index]["bool"]:
filters = json_query["query"]["bool"]["filter"][bool_index]["bool"]["must_not"]
must_not = True
Expand Down
2 changes: 2 additions & 0 deletions grantnav/frontend/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
{"bool": {"should": []}}, # 22. additional_data.GNBestCountryName
{"bool": {"should": []}}, # 23. additional_data.GNRecipientOrgCountryName
{"bool": {"should": []}}, # 24. additional_data.GNBeneficiaryCountryName
{"bool": {"should": []}}, # 25. additional_data.GNGeoCategory
]

TermFacet = collections.namedtuple('TermFacet', 'field_name param_name filter_index display_name is_json facet_size')
Expand All @@ -87,6 +88,7 @@
TermFacet("additional_data.GNBestCountryName", "bestCountryName", 22, "Best Available Country", False, 5000),
TermFacet("additional_data.GNRecipientOrgCountryName", "recipientOrgCountryName", 23, "Recipient Country", False, 5000),
TermFacet("additional_data.GNBeneficiaryCountryName", "beneficiaryCountryName", 24, "Grant Location Country", False, 5000),
TermFacet("additional_data.GNGeoCategory", "geoCategory", 25, "Geographic Category", False, 5000),
]

SIZE = 20
Expand Down
3,873 changes: 3,872 additions & 1 deletion tests/data/aggregates_expected.json

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions tests/tests_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,21 @@ def test_aggregates(self):
open(os.path.join(prefix, "aggregates_expected.json"), "r")
)

if res != expected_data:
import difflib
res_str = json.dumps(res, indent=4, sort_keys=True)
exp_str = json.dumps(expected_data, indent=4, sort_keys=True)

diff = difflib.unified_diff(
exp_str.splitlines(),
res_str.splitlines(),
fromfile='expected.json',
tofile='actual.json',
lineterm=''
)

print("\n".join(diff))

self.assertEqual(res, expected_data, "Aggregates do not match. See the diff printed above.")

self.assertEqual(res, expected_data)
37 changes: 37 additions & 0 deletions tests/tests_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from dataload.import_to_elasticsearch import (
update_doc_with_other_locations,
update_doc_with_undetermined,
update_doc_with_geo_category,
)

from django.test import TestCase
Expand Down Expand Up @@ -84,3 +85,39 @@ def test_gn_additional_location_fields_uk_grant(self):

for field, val in GN_location_fields:
self.assertEqual(grant_data["additional_data"][field], val)

def test_geo_category_uk_grant(self):
    """Check that UK grants are categorized correctly."""
    fixture_path = os.path.join(prefix, "uk_grant_GN_location_fields.json")
    # Use a context manager so the fixture file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(fixture_path) as fixture_file:
        grant_data = json.load(fixture_file)

    # Same order as the data loader: the undetermined step runs before
    # the geo categorisation.
    update_doc_with_other_locations(grant_data)
    update_doc_with_undetermined(grant_data)
    update_doc_with_geo_category(grant_data)

    self.assertEqual(grant_data["additional_data"]["GNGeoCategory"], "UK")

def test_geo_category_international_grant(self):
    """Check that international grants are categorized correctly."""
    fixture_path = os.path.join(
        prefix, "international_grant_GN_location_fields.json"
    )
    # Use a context manager so the fixture file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(fixture_path) as fixture_file:
        grant_data = json.load(fixture_file)

    # Same order as the data loader: the undetermined step runs before
    # the geo categorisation.
    update_doc_with_other_locations(grant_data)
    update_doc_with_undetermined(grant_data)
    update_doc_with_geo_category(grant_data)

    self.assertEqual(
        grant_data["additional_data"]["GNGeoCategory"], "International"
    )

def test_geo_category_undetermined_grant(self):
    """A grant without any location data is categorized as "Undetermined"."""
    # Bare grant: just an id and an empty additional_data section.
    bare_grant = {"id": "test-grant-123", "additional_data": {}}

    # Run the undetermined step first, then the categorisation under test.
    update_doc_with_undetermined(bare_grant)
    update_doc_with_geo_category(bare_grant)

    geo_category = bare_grant["additional_data"]["GNGeoCategory"]
    self.assertEqual(geo_category, "Undetermined")
34 changes: 29 additions & 5 deletions tests/tests_links.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
import requests
import time
import urllib3

from tests.browser_test_case import BrowserTestCase

Expand All @@ -11,6 +13,9 @@
prefix = os.path.join(os.path.dirname(__file__), "data")


urllib3.disable_warnings()


@tag("link-runner")
@override_settings(
PROVENANCE_JSON=os.path.join(prefix, "data.json"), DISABLE_COOKIE_POPUP=True
Expand Down Expand Up @@ -55,19 +60,35 @@ def check_page_for_broken_links(page):
if link not in skip:
links.append(link)

# Some sites reject connection without a user agent.
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
}

broken = False
for link in links:

if link not in links_checked.keys():
try:
# Some sites reject connection without a user agent
r = requests.head(
link,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36"
},
headers=HEADERS,
verify=False,
)

# If the call gets blocked by a 403 (e.g., by cloudflare) then wait a little bit in
# case we are being throttled then do a fresh request with GET (use stream=True to reduce
# the load). This is introduced because read-the-docs was blocking the standard website.
if r.status_code == 403:
time.sleep(2)
r = requests.Session().get(
link,
headers=HEADERS,
timeout=15,
verify=False,
stream=True,
)

status_code = r.status_code
except Exception as e:
# Set status code to 0 (not a HTTP response code) so it gets displayed along with the other errors at the end.
Expand Down Expand Up @@ -105,7 +126,10 @@ def check_page_for_broken_links(page):
for page in pages_to_find_links:
r = requests.head(f"{self.live_server_url}{page}")
status_code = r.status_code
self.assertFalse((status_code < 200 or status_code > 399), f"{self.live_server_url}{page} error {status_code}")
self.assertFalse(
(status_code < 200 or status_code > 399),
f"{self.live_server_url}{page} error {status_code}",
)

# Test the links on the pages
for page in pages_to_find_links:
Expand Down
Loading