From 2d7b7a4c7d7f47738cae28631d29b31e8e536bc5 Mon Sep 17 00:00:00 2001
From: Ola Rubaj <52197250+olayway@users.noreply.github.com>
Date: Tue, 19 May 2026 14:35:37 +0200
Subject: [PATCH 1/3] Improve metadata: add resource/field descriptions and fix
 year field type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add description to resources "gdp" and "top-economies"
- Add description to all fields missing one (country, year, Country Name, Country Code, Year)
- Fix top-economies.year type from integer → year (values are four-digit calendar years)
- Document in README that top-economies.csv is manually maintained and not regenerated by the automated script

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md        |  5 +++++
 datapackage.json | 17 ++++++++++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index ab9ddd9..c3ce3d2 100644
--- a/README.md
+++ b/README.md
@@ -23,6 +23,11 @@ World Bank including:
 [lcu]: http://data.worldbank.org/indicator/NY.GDP.MKTP.KN
 
 
+## Data notes
+
+- `data/gdp.csv` is regenerated automatically each month by `scripts/process.py`.
+- `data/top-economies.csv` is a manually maintained derived file covering the 10 largest economies for 2000–2022. It is **not** regenerated by the automated script, so its upper year bound may lag behind `gdp.csv`.
+
 ## Preparation
 
 Process is recorded and automated in python script:
diff --git a/datapackage.json b/datapackage.json
index 344c29d..096f935 100644
--- a/datapackage.json
+++ b/datapackage.json
@@ -59,15 +59,18 @@
       "path": "data/top-economies.csv",
       "format": "csv",
       "mediatype": "text/csv",
+      "description": "GDP in current USD trillions for the world's 10 largest economies, covering 2000–2022. Derived from the main gdp resource.",
       "schema": {
         "fields": [
           {
             "name": "country",
-            "type": "string"
+            "type": "string",
+            "description": "Country name"
           },
           {
             "name": "year",
-            "type": "integer"
+            "type": "year",
+            "description": "Year of the GDP observation"
           },
           {
             "name": "gdp_trillion",
@@ -80,19 +83,23 @@
     {
       "name": "gdp",
       "path": "data/gdp.csv",
+      "description": "Country, regional and world GDP in current USD. Each row is one country/region for one year. Sourced from the World Bank indicator NY.GDP.MKTP.CD.",
       "schema": {
         "fields": [
           {
             "name": "Country Name",
-            "type": "string"
+            "type": "string",
+            "description": "Country or region name as given by the World Bank"
           },
           {
             "name": "Country Code",
-            "type": "string"
+            "type": "string",
+            "description": "ISO 3166-1 alpha-3 country code or World Bank region code"
           },
           {
             "name": "Year",
-            "type": "year"
+            "type": "year",
+            "description": "Year of the GDP observation"
           },
           {
             "description": "GDP in current USD",

From 69779a9fc64f3fd490baf142ec682d82cb28573a Mon Sep 17 00:00:00 2001
From: Ola Rubaj <52197250+olayway@users.noreply.github.com>
Date: Tue, 19 May 2026 14:47:05 +0200
Subject: [PATCH 2/3] Switch license to CC-BY-4.0 and automate top-economies.csv

License:
- Change declared license from ODC-PDDL-1.0 to CC-BY-4.0 to match the
  World Bank upstream source, which publishes under CC BY 4.0
- Add attribution statement to README license section

top-economies.csv automation:
- Add generate_top_economies() to process.py: reads gdp.csv, filters
  regional aggregates via WB Metadata_Country file (falls back to a
  hardcoded exclusion set when running locally without fresh cache),
  selects top-10 countries by latest-year GDP, writes rows from 2000
  onward in USD trillions
- Fix output paths to use script_dir-relative paths so the script works
  correctly when run from scripts/ (as CI does)
- update_datapackage() now keeps view title and resource description year
  range in sync with the generated data
- Regenerate data/top-economies.csv: extends coverage from 2022 to 2023
  and reorders by 2023 GDP rank (Germany now #3, ahead of Japan)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md              |   4 +-
 data/top-economies.csv | 102 +++++++++++++++++---------------
 datapackage.json       |  12 ++--
 scripts/process.py     | 131 +++++++++++++++++++++++++++++++++++------
 4 files changed, 178 insertions(+), 71 deletions(-)

diff --git a/README.md b/README.md
index c3ce3d2..36b1eb9 100644
--- a/README.md
+++ b/README.md
@@ -43,4 +43,6 @@ https://datahub.io/core/gdp
 
 ## License
 
-This Data Package is made available under the Public Domain Dedication and License v1.0 whose full text can be found at: http://www.opendatacommons.org/licenses/pddl/1.0/
+This dataset is made available under the [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0).
+
+The underlying data originates from the [World Bank](http://data.worldbank.org/indicator/NY.GDP.MKTP.CD), which publishes its open data under CC BY 4.0. Attribution: World Bank – World Development Indicators.
diff --git a/data/top-economies.csv b/data/top-economies.csv
index fad332c..713b03d 100644
--- a/data/top-economies.csv
+++ b/data/top-economies.csv
@@ -22,6 +22,7 @@ United States,2019,21.5214
 United States,2020,21.3229
 United States,2021,23.594
 United States,2022,25.7441
+United States,2023,27.3609
 China,2000,1.2113
 China,2001,1.3394
 China,2002,1.4706
@@ -45,29 +46,7 @@ China,2019,14.28
 China,2020,14.6877
 China,2021,17.8205
 China,2022,17.8818
-Japan,2000,4.9684
-Japan,2001,4.3747
-Japan,2002,4.1828
-Japan,2003,4.5196
-Japan,2004,4.8931
-Japan,2005,4.8315
-Japan,2006,4.6017
-Japan,2007,4.5798
-Japan,2008,5.1067
-Japan,2009,5.2895
-Japan,2010,5.7591
-Japan,2011,6.2331
-Japan,2012,6.2724
-Japan,2013,5.2123
-Japan,2014,4.897
-Japan,2015,4.4449
-Japan,2016,5.0037
-Japan,2017,4.9308
-Japan,2018,5.0409
-Japan,2019,5.118
-Japan,2020,5.0556
-Japan,2021,5.0346
-Japan,2022,4.2564
+China,2023,17.7948
 Germany,2000,1.948
 Germany,2001,1.9458
 Germany,2002,2.0785
@@ -91,6 +70,31 @@ Germany,2019,3.8892
 Germany,2020,3.8877
 Germany,2021,4.2785
 Germany,2022,4.0825
+Germany,2023,4.4561
+Japan,2000,4.9684
+Japan,2001,4.3747
+Japan,2002,4.1828
+Japan,2003,4.5196
+Japan,2004,4.8931
+Japan,2005,4.8315
+Japan,2006,4.6017
+Japan,2007,4.5798
+Japan,2008,5.1067
+Japan,2009,5.2895
+Japan,2010,5.7591
+Japan,2011,6.2331
+Japan,2012,6.2724
+Japan,2013,5.2123
+Japan,2014,4.897
+Japan,2015,4.4449
+Japan,2016,5.0037
+Japan,2017,4.9308
+Japan,2018,5.0409
+Japan,2019,5.118
+Japan,2020,5.0556
+Japan,2021,5.0346
+Japan,2022,4.2564
+Japan,2023,4.2129
 India,2000,0.4684
 India,2001,0.4854
 India,2002,0.5149
@@ -114,6 +118,7 @@ India,2019,2.8356
 India,2020,2.6749
 India,2021,3.1673
 India,2022,3.3535
+India,2023,3.5499
 United Kingdom,2000,1.6655
 United Kingdom,2001,1.6498
 United Kingdom,2002,1.7857
@@ -137,6 +142,7 @@ United Kingdom,2019,2.8514
 United Kingdom,2020,2.6978
 United Kingdom,2021,3.1415
 United Kingdom,2022,3.0888
+United Kingdom,2023,3.34
 France,2000,1.3656
 France,2001,1.3777
 France,2002,1.5014
@@ -160,29 +166,7 @@ France,2019,2.7289
 France,2020,2.6474
 France,2021,2.9594
 France,2022,2.7791
-Canada,2000,0.7448
-Canada,2001,0.739
-Canada,2002,0.7606
-Canada,2003,0.8955
-Canada,2004,1.0267
-Canada,2005,1.1731
-Canada,2006,1.3193
-Canada,2007,1.4688
-Canada,2008,1.553
-Canada,2009,1.3746
-Canada,2010,1.6173
-Canada,2011,1.7933
-Canada,2012,1.8284
-Canada,2013,1.8466
-Canada,2014,1.8057
-Canada,2015,1.5565
-Canada,2016,1.528
-Canada,2017,1.6493
-Canada,2018,1.7253
-Canada,2019,1.7437
-Canada,2020,1.6557
-Canada,2021,2.0075
-Canada,2022,2.1615
+France,2023,3.0309
 Italy,2000,1.1467
 Italy,2001,1.168
 Italy,2002,1.2768
@@ -206,6 +190,7 @@ Italy,2019,2.0113
 Italy,2020,1.8975
 Italy,2021,2.1549
 Italy,2022,2.067
+Italy,2023,2.2549
 Brazil,2000,0.6554
 Brazil,2001,0.56
 Brazil,2002,0.5098
@@ -229,3 +214,28 @@ Brazil,2019,1.8733
 Brazil,2020,1.4761
 Brazil,2021,1.6706
 Brazil,2022,1.9519
+Brazil,2023,2.1737
+Canada,2000,0.7448
+Canada,2001,0.739
+Canada,2002,0.7606
+Canada,2003,0.8955
+Canada,2004,1.0267
+Canada,2005,1.1731
+Canada,2006,1.3193
+Canada,2007,1.4688
+Canada,2008,1.553
+Canada,2009,1.3746
+Canada,2010,1.6173
+Canada,2011,1.7933
+Canada,2012,1.8284
+Canada,2013,1.8466
+Canada,2014,1.8057
+Canada,2015,1.5565
+Canada,2016,1.528
+Canada,2017,1.6493
+Canada,2018,1.7253
+Canada,2019,1.7437
+Canada,2020,1.6557
+Canada,2021,2.0075
+Canada,2022,2.1615
+Canada,2023,2.1401
diff --git a/datapackage.json b/datapackage.json
index 096f935..84a095f 100644
--- a/datapackage.json
+++ b/datapackage.json
@@ -10,17 +10,17 @@
   "last_updated": "2026-04-08",
   "licenses": [
     {
-      "name": "ODC-PDDL-1.0",
-      "path": "http://opendatacommons.org/licenses/pddl/",
-      "title": "Open Data Commons Public Domain Dedication and License v1.0"
+      "name": "CC-BY-4.0",
+      "path": "https://creativecommons.org/licenses/by/4.0/",
+      "title": "Creative Commons Attribution 4.0"
     }
   ],
   "name": "gdp",
   "views": [
     {
       "name": "top-economies",
-      "title": "GDP of the World's 10 Largest Economies (2000\u20132022)",
-      "description": "GDP in current USD for the top 10 economies since 2000. The US held a commanding lead throughout, while China's rapid ascent \u2014 from $1.2T in 2000 to $17.9T in 2022 \u2014 is the defining economic shift of the century. India overtook the UK and France in the early 2020s.",
+      "title": "GDP of the World's 10 Largest Economies (2000\u20132023)",
+      "description": "GDP in current USD for the top 10 economies since 2000. The US has held a commanding lead throughout, while China's rapid ascent is the defining economic shift of the century. India passed France by 2019 and the UK in 2021.",
       "resources": [
         "top-economies"
       ],
@@ -59,7 +59,7 @@
       "path": "data/top-economies.csv",
       "format": "csv",
       "mediatype": "text/csv",
-      "description": "GDP in current USD trillions for the world's 10 largest economies, covering 2000–2022. Derived from the main gdp resource.",
+      "description": "GDP in current USD trillions for the world's 10 largest economies, covering 2000–2023. Derived from the main gdp resource.",
       "schema": {
         "fields": [
           {
diff --git a/scripts/process.py b/scripts/process.py
index 42b87ed..1bdb333 100644
--- a/scripts/process.py
+++ b/scripts/process.py
@@ -5,18 +5,30 @@
 import zipfile
 import requests
 
-from datetime import datetime, timedelta
+from datetime import datetime
 
 cache = 'cache'
-data = 'data/gdp.csv'
 script_dir = os.path.dirname(os.path.abspath(__file__))
+gdp_path = os.path.join(script_dir, '..', 'data', 'gdp.csv')
+top_economies_path = os.path.join(script_dir, '..', 'data', 'top-economies.csv')
+datapackage_path = os.path.join(script_dir, '..', 'datapackage.json')
 url = 'https://api.worldbank.org/v2/en/indicator/NY.GDP.MKTP.CD?downloadformat=csv'
 outheadings = ['Country Name', 'Country Code', 'Year', 'Value']
 current_year = datetime.now().year
-datapackage = '../datapackage.json'
+
+# World Bank codes that are regional/income-group aggregates, not individual countries.
+# Used as a fallback filter when the Metadata_Country file is unavailable.
+WB_AGGREGATE_CODES = {
+    'AFE', 'AFW', 'ARB', 'CEB', 'CSS', 'EAP', 'EAR', 'EAS', 'ECA', 'ECS',
+    'EMU', 'EUU', 'FCS', 'HIC', 'HPC', 'IBD', 'IBT', 'IDA', 'IDB', 'IDX',
+    'LAC', 'LCN', 'LDC', 'LIC', 'LMC', 'LMY', 'LTE', 'MEA', 'MIC', 'MNA',
+    'NAC', 'OEC', 'OED', 'OSS', 'PRE', 'PSS', 'PST', 'SAS', 'SSA', 'SSF', 'SST',
+    'TEA', 'TEC', 'TLA', 'TMN', 'TSA', 'TSS', 'UMC', 'WLD',
+}
+
 
 def search_files_in_cache():
-    cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cache')
+    cache_dir = os.path.join(script_dir, cache)
 
     if not os.path.exists(cache_dir):
         print("Cache folder does not exist!")
@@ -31,6 +43,76 @@ def search_files_in_cache():
     return filtered_files
 
 
+def get_country_codes_from_metadata():
+    """Return the set of individual-country codes from the WB Metadata_Country file.
+
+    Countries have a non-blank Region field; aggregates do not.
+    Falls back to reading gdp.csv and excluding known aggregate codes if the
+    metadata file is absent (e.g. when running locally without a fresh cache).
+    Returns None only if neither source is available.
+    """
+    cache_dir = os.path.join(script_dir, cache)
+    if os.path.exists(cache_dir):
+        metadata_files = [f for f in os.listdir(cache_dir) if 'Metadata_Country' in f]
+        if metadata_files:
+            meta_path = os.path.join(cache_dir, metadata_files[0])
+            country_codes = set()
+            with open(meta_path, 'r') as f:
+                for row in csv.DictReader(f):
+                    if row.get('Region', '').strip():
+                        country_codes.add(row['Country Code'])
+            return country_codes
+
+    # Fallback: derive from gdp.csv by excluding known aggregate codes
+    if os.path.exists(gdp_path):
+        all_codes = set()
+        with open(gdp_path, 'r') as f:
+            for row in csv.DictReader(f):
+                all_codes.add(row['Country Code'])
+        return all_codes - WB_AGGREGATE_CODES
+
+    return None
+
+
+def generate_top_economies(gdp_csv, top_economies_csv, start_year=2000, n=10):
+    """Derive top-economies.csv from gdp.csv.
+
+    Selects the n countries with the highest GDP in the latest available year,
+    then writes all their rows from start_year onward with GDP in USD trillions.
+    Returns (start_year, latest_year).
+    """
+    country_codes = get_country_codes_from_metadata()
+
+    country_data = {}
+    with open(gdp_csv, 'r') as f:
+        for row in csv.DictReader(f):
+            if country_codes is not None and row['Country Code'] not in country_codes:
+                continue
+            country = row['Country Name']
+            year = int(row['Year'])
+            country_data.setdefault(country, {})[year] = float(row['Value'])
+
+    latest_year = max(y for d in country_data.values() for y in d)
+    top_countries = sorted(
+        (c for c in country_data if latest_year in country_data[c]),
+        key=lambda c: country_data[c][latest_year],
+        reverse=True
+    )[:n]
+
+    rows = [
+        [country, year, round(country_data[country][year] / 1e12, 4)]
+        for country in top_countries
+        for year in sorted(y for y in country_data[country] if y >= start_year)
+    ]
+
+    with open(top_economies_csv, 'w', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow(['country', 'year', 'gdp_trillion'])
+        writer.writerows(rows)
+
+    return start_year, latest_year
+
+
 def transform_csv(dest):
     with open(dest, 'r') as f:
         reader = csv.reader(f)
@@ -40,12 +122,12 @@ def transform_csv(dest):
         last_updated_date = updated_date_row[1].split("-")
         last_updated_date = f"{last_updated_date[0]}-{last_updated_date[1]}-{last_updated_date[2]}"
 
-        next(reader) 
+        next(reader)
         header = next(reader)
         transformed_data = []
         for row in reader:
-            country_name = row[0]  
-            country_code = row[1]  
+            country_name = row[0]
+            country_code = row[1]
 
             for i in range(4, len(row)):
                 year = header[i]
@@ -53,19 +135,32 @@ def transform_csv(dest):
 
                 if value:
                     transformed_data.append([country_name, country_code, year, value])
-    
+
     return last_updated_date, transformed_data
 
-def update_datapackage(last_updated):
-    with open(datapackage, 'r') as f:
+
+def update_datapackage(last_updated, top_start_year, top_end_year):
+    with open(datapackage_path, 'r') as f:
         dp = json.load(f)
 
     dp['last_updated'] = str(last_updated)
     dp['version'] = str(current_year)
 
-    with open(datapackage, 'w') as f:
+    year_range = f"{top_start_year}–{top_end_year}"
+    for resource in dp['resources']:
+        if resource['name'] == 'top-economies':
+            resource['description'] = (
+                f"GDP in current USD trillions for the world's 10 largest economies, "
+                f"covering {year_range}. Derived from the main gdp resource."
+            )
+    for view in dp.get('views', []):
+        if view.get('name') == 'top-economies':
+            view['title'] = f"GDP of the World's 10 Largest Economies ({year_range})"
+
+    with open(datapackage_path, 'w') as f:
         json.dump(dp, f, indent=2)
 
+
 def extract_zip():
     cache_dir = os.path.join(script_dir, cache)
 
@@ -83,21 +178,21 @@ def extract_zip():
 
 
 def process():
-    extract_zip()  # Ensure ZIP extraction happens first
+    extract_zip()
     file_name = search_files_in_cache()[0]
     dest = os.path.join(script_dir, cache, file_name)
     last_updated, transformed_data = transform_csv(dest)
 
-    output_dir = os.path.dirname('data/')
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
+    os.makedirs(os.path.dirname(gdp_path), exist_ok=True)
 
-    with open(os.path.abspath(data), 'w', newline='') as outfile:
+    with open(gdp_path, 'w', newline='') as outfile:
         writer = csv.writer(outfile)
         writer.writerow(outheadings)
         writer.writerows(transformed_data)
 
-    update_datapackage(last_updated)
-    
+    start_year, end_year = generate_top_economies(gdp_path, top_economies_path)
+    update_datapackage(last_updated, start_year, end_year)
+
+
 if __name__ == '__main__':
     process()

From ecf7fa3c1d53c04dcaa97d2b60008f873de32032 Mon Sep 17 00:00:00 2001
From: Ola Rubaj <52197250+olayway@users.noreply.github.com>
Date: Tue, 19 May 2026 14:57:00 +0200
Subject: [PATCH 3/3] Fix stale Data notes: top-economies.csv is now automated

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 36b1eb9..19ed538 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ World Bank including:
 ## Data notes
 
 - `data/gdp.csv` is regenerated automatically each month by `scripts/process.py`.
-- `data/top-economies.csv` is a manually maintained derived file covering the 10 largest economies for 2000–2022. It is **not** regenerated by the automated script, so its upper year bound may lag behind `gdp.csv`.
+- `data/top-economies.csv` is derived from `gdp.csv` by the same script: it covers the 10 largest economies (by latest-year GDP) from 2000 onward, with values in USD trillions.
 
 ## Preparation