Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions .github/workflows/codelist-drift.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
name: OECD codelist drift

# Detects drift between the committed DAC area codelists and the live OECD source.
# Runs OFF the PR path (the live endpoint is an external dependency and must never
# block contributors). On a schedule it refreshes from the live source and, if the
# committed mappings are out of date, opens a pull request with the proposed diff
# for a maintainer to review and merge.

on:
schedule:
- cron: "0 6 * * 1" # Mondays 06:00 UTC
workflow_dispatch: {} # allow manual runs

permissions:
contents: write
pull-requests: write

jobs:
refresh:
name: Refresh and open PR on drift
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6

- name: Install uv
uses: astral-sh/setup-uv@v8.1.0

- name: Set up Python 3.11
run: uv python install '3.11'

- name: Install dependencies
run: uv sync --frozen --all-groups

- name: Refresh DAC codelists from the live OECD source
run: uv run python -m scripts.data_maintenance.refresh_dac_codelists --write

- name: Open a pull request if the codelists drifted
uses: peter-evans/create-pull-request@v7
with:
branch: chore/refresh-oecd-codelists
base: main
commit-message: "chore: refresh OECD DAC area codelists"
title: "chore: refresh OECD DAC area codelists"
body: |
Automated refresh of the DAC area codelists from the live OECD source
(`development-finance-codelists.oecd.org`).

This PR exists because the committed `dac{1,2}_codes_area.json` are out of
date with the live codelists. Review the diff before merging:

- **Added** codes are new active DAC areas — confirm they look legitimate.
- **Changed** values are codes whose `.stat` (dotstatcode) was updated
upstream — confirm the change is intended (these affect translation output).
- Historical codes are preserved (never deleted), so old data still translates.

Merging runs the normal CI suite. If anything here looks wrong, close the PR
and investigate the source rather than merging.
labels: |
maintenance
automated
delete-branch: true
7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,9 @@ convention = "google"

[tool.ruff.lint.per-file-ignores]
# Tests don't need annotations, pylint-magic-value gates, or line-length limits.
"tests/**/*.py" = ["ANN", "PLR2004", "E501"]
# PLC0415: in-test imports are a deliberate pattern for monkeypatching module
# objects (monkeypatch.setattr needs the module, not just the symbol).
"tests/**/*.py" = ["ANN", "PLR2004", "E501", "PLC0415"]
"docs/examples/**/*.py" = ["E501"]
# Notebooks: exploration style — don't require annotations; allow REPL patterns.
"**/*.ipynb" = ["ANN", "F401", "F811", "E402"]
Expand Down Expand Up @@ -162,6 +164,7 @@ testpaths = ["tests"]
python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
pythonpath = ["."]
markers = [
"unit: Fast unit tests (no external dependencies)",
"integration: Tests that call real OECD API",
Expand All @@ -179,7 +182,7 @@ addopts = [
python-version = "3.11"

[tool.ty.src]
include = ["src/oda_reader"]
include = ["src/oda_reader", "scripts"]

[tool.ty.analysis]
# Replace pandas / numpy type information with `Any`. Eliminates the cascade of
Expand Down
Empty file added scripts/__init__.py
Empty file.
Empty file.
50 changes: 50 additions & 0 deletions scripts/data_maintenance/_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""Shared helpers for data_maintenance scripts."""

from __future__ import annotations

import difflib
import json
import sys


def dumps_canonical(mapping: dict[str, str]) -> str:
    """Render a ``{code: value}`` mapping as canonical JSON text.

    The emitted object is ordered as follows:

    1. Keys consisting only of decimal digits, ascending by integer value.
    2. Every other key (e.g. price codes ``"V"``, ``"Q"``), in lexical order.
    3. The regex passthrough rule ``"(.*)"``, always last. It is emitted even
       when *mapping* does not contain it, in which case its value defaults
       to the backreference ``\\1``; an explicit value supplied in *mapping*
       is kept as given.

    Args:
        mapping: Codes mapped to their translated values.

    Returns:
        JSON text rendered with ``indent=2`` and ``ensure_ascii=True``,
        terminated by a single newline.
    """
    passthrough_key = "(.*)"
    numeric_items: list[tuple[str, str]] = []
    other_items: list[tuple[str, str]] = []
    for key, value in mapping.items():
        if key == passthrough_key:
            continue
        # str.isdecimal() is used instead of str.isdigit(): isdigit() also
        # accepts characters such as superscript digits that int() rejects,
        # which would make the numeric sort below raise ValueError.
        bucket = numeric_items if key.isdecimal() else other_items
        bucket.append((key, value))

    # Both sorts are stable, so keys with equal sort rank keep mapping order.
    numeric_items.sort(key=lambda item: int(item[0]))
    other_items.sort()

    ordered: dict[str, str] = dict(numeric_items)
    ordered.update(other_items)
    # Inserted last so the catch-all rule follows every specific code.
    ordered[passthrough_key] = mapping.get(passthrough_key, "\\1")
    return json.dumps(ordered, indent=2, ensure_ascii=True) + "\n"


def emit_json_diff(
    original: str,
    proposed: str,
    *,
    fromfile: str,
    tofile: str,
) -> None:
    """Print a unified diff between two texts on standard output.

    Args:
        original: Text on the "before" side of the diff.
        proposed: Text on the "after" side of the diff.
        fromfile: Label shown in the ``---`` header line.
        tofile: Label shown in the ``+++`` header line.
    """
    # Keep line endings so the diff lines can be written out verbatim.
    before_lines = original.splitlines(keepends=True)
    after_lines = proposed.splitlines(keepends=True)
    delta = difflib.unified_diff(
        before_lines,
        after_lines,
        fromfile=fromfile,
        tofile=tofile,
    )
    sys.stdout.writelines(delta)
16 changes: 16 additions & 0 deletions scripts/data_maintenance/_static_overlays.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
"""Hand-maintained, source-less SDMX rules; never scraped.

These constants are version-controlled overlays applied on top of the
area codelists fetched from the OECD codelist app. They must match the
committed mapping files in src/oda_reader/schemas/mappings/ exactly.
"""

# Overlay for dac1_codes_prices.json.
DAC1_PRICES: dict[str, str] = {
    "V": "A",
    "Q": "D",
}

# Overlay for dac1_codes_flow_types.json; the final entry is the SDMX regex
# passthrough rule.
DAC1_FLOW_TYPES: dict[str, str] = {
    "115": "C",
    "112": "D",
    "(.*)": "\\1",
}

# Last entry of every rendered area map: lets codes without an explicit
# mapping pass through the SDMX mapping engine unchanged.
AREA_PASSTHROUGH: dict[str, str] = {
    "(.*)": "\\1",
}
Loading