From 2d7b7a4c7d7f47738cae28631d29b31e8e536bc5 Mon Sep 17 00:00:00 2001 From: Ola Rubaj <52197250+olayway@users.noreply.github.com> Date: Tue, 19 May 2026 14:35:37 +0200 Subject: [PATCH 1/3] Improve metadata: add resource/field descriptions and fix year field type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add description to resources "gdp" and "top-economies" - Add description to all fields missing one (country, year, Country Name, Country Code, Year) - Fix top-economies.year type from integer → year (values are four-digit calendar years) - Document in README that top-economies.csv is manually maintained and not regenerated by the automated script Co-Authored-By: Claude Sonnet 4.6 --- README.md | 5 +++++ datapackage.json | 17 ++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ab9ddd9..c3ce3d2 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,11 @@ World Bank including: [lcu]: http://data.worldbank.org/indicator/NY.GDP.MKTP.KN +## Data notes + +- `data/gdp.csv` is regenerated automatically each month by `scripts/process.py`. +- `data/top-economies.csv` is a manually maintained derived file covering the 10 largest economies for 2000–2022. It is **not** regenerated by the automated script, so its upper year bound may lag behind `gdp.csv`. + ## Preparation Process is recorded and automated in python script: diff --git a/datapackage.json b/datapackage.json index 344c29d..096f935 100644 --- a/datapackage.json +++ b/datapackage.json @@ -59,15 +59,18 @@ "path": "data/top-economies.csv", "format": "csv", "mediatype": "text/csv", + "description": "GDP in current USD trillions for the world's 10 largest economies, covering 2000–2022. Derived from the main gdp resource.", "schema": { "fields": [ { "name": "country", - "type": "string" + "type": "string", + "description": "Country name" }, { "name": "year", - "type": "integer" + "type": "year", + "description": "Year of the GDP observation" }, { "name": "gdp_trillion", @@ -80,19 +83,23 @@ { "name": "gdp", "path": "data/gdp.csv", + "description": "Country, regional and world GDP in current USD. Each row is one country/region for one year. Sourced from the World Bank indicator NY.GDP.MKTP.CD.", "schema": { "fields": [ { "name": "Country Name", - "type": "string" + "type": "string", + "description": "Country or region name as given by the World Bank" }, { "name": "Country Code", - "type": "string" + "type": "string", + "description": "ISO 3166-1 alpha-3 country code or World Bank region code" }, { "name": "Year", - "type": "year" + "type": "year", + "description": "Year of the GDP observation" }, { "description": "GDP in current USD", From 69779a9fc64f3fd490baf142ec682d82cb28573a Mon Sep 17 00:00:00 2001 From: Ola Rubaj <52197250+olayway@users.noreply.github.com> Date: Tue, 19 May 2026 14:47:05 +0200 Subject: [PATCH 2/3] Address license and top-economies automation License: - Change declared license from ODC-PDDL-1.0 to CC-BY-4.0 to match the World Bank upstream source, which publishes under CC BY 4.0 - Add attribution statement to README license section top-economies.csv automation: - Add generate_top_economies() to process.py: reads gdp.csv, filters regional aggregates via WB Metadata_Country file (falls back to a hardcoded exclusion set when running locally without fresh cache), selects top-10 countries by latest-year GDP, writes rows from 2000 onward in USD trillions - Fix output paths to use script_dir-relative paths so the script works correctly when run from scripts/ (as CI does) - update_datapackage() now keeps view title and resource description year range in sync with the generated data - Regenerate data/top-economies.csv: extends coverage from 2022 to 2023 and reorders by 2023 GDP rank (Germany now #3, ahead of Japan) Co-Authored-By: Claude Sonnet 4.6 --- README.md | 4 +- data/top-economies.csv | 102 +++++++++++++++++--------------- datapackage.json | 12 ++-- scripts/process.py | 131 +++++++++++++++++++++++++++++++++++------ 4 files changed, 178 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index c3ce3d2..36b1eb9 100644 --- a/README.md +++ b/README.md @@ -43,4 +43,6 @@ https://datahub.io/core/gdp ## License -This Data Package is made available under the Public Domain Dedication and License v1.0 whose full text can be found at: http://www.opendatacommons.org/licenses/pddl/1.0/ +This dataset is made available under the [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/) (CC BY 4.0). + +The underlying data originates from the [World Bank](http://data.worldbank.org/indicator/NY.GDP.MKTP.CD), which publishes its open data under CC BY 4.0. Attribution: World Bank – World Development Indicators. diff --git a/data/top-economies.csv b/data/top-economies.csv index fad332c..713b03d 100644 --- a/data/top-economies.csv +++ b/data/top-economies.csv @@ -22,6 +22,7 @@ United States,2019,21.5214 United States,2020,21.3229 United States,2021,23.594 United States,2022,25.7441 +United States,2023,27.3609 China,2000,1.2113 China,2001,1.3394 China,2002,1.4706 @@ -45,29 +46,7 @@ China,2019,14.28 China,2020,14.6877 China,2021,17.8205 China,2022,17.8818 -Japan,2000,4.9684 -Japan,2001,4.3747 -Japan,2002,4.1828 -Japan,2003,4.5196 -Japan,2004,4.8931 -Japan,2005,4.8315 -Japan,2006,4.6017 -Japan,2007,4.5798 -Japan,2008,5.1067 -Japan,2009,5.2895 -Japan,2010,5.7591 -Japan,2011,6.2331 -Japan,2012,6.2724 -Japan,2013,5.2123 -Japan,2014,4.897 -Japan,2015,4.4449 -Japan,2016,5.0037 -Japan,2017,4.9308 -Japan,2018,5.0409 -Japan,2019,5.118 -Japan,2020,5.0556 -Japan,2021,5.0346 -Japan,2022,4.2564 +China,2023,17.7948 Germany,2000,1.948 Germany,2001,1.9458 Germany,2002,2.0785 @@ -91,6 +70,31 @@ Germany,2019,3.8892 Germany,2020,3.8877 Germany,2021,4.2785 Germany,2022,4.0825 +Germany,2023,4.4561 +Japan,2000,4.9684 +Japan,2001,4.3747 +Japan,2002,4.1828 +Japan,2003,4.5196 +Japan,2004,4.8931 +Japan,2005,4.8315 +Japan,2006,4.6017 +Japan,2007,4.5798 +Japan,2008,5.1067 +Japan,2009,5.2895 +Japan,2010,5.7591 +Japan,2011,6.2331 +Japan,2012,6.2724 +Japan,2013,5.2123 +Japan,2014,4.897 +Japan,2015,4.4449 +Japan,2016,5.0037 +Japan,2017,4.9308 +Japan,2018,5.0409 +Japan,2019,5.118 +Japan,2020,5.0556 +Japan,2021,5.0346 +Japan,2022,4.2564 +Japan,2023,4.2129 India,2000,0.4684 India,2001,0.4854 India,2002,0.5149 @@ -114,6 +118,7 @@ India,2019,2.8356 India,2020,2.6749 India,2021,3.1673 India,2022,3.3535 +India,2023,3.5499 United Kingdom,2000,1.6655 United Kingdom,2001,1.6498 United Kingdom,2002,1.7857 @@ -137,6 +142,7 @@ United Kingdom,2019,2.8514 United Kingdom,2020,2.6978 United Kingdom,2021,3.1415 United Kingdom,2022,3.0888 +United Kingdom,2023,3.34 France,2000,1.3656 France,2001,1.3777 France,2002,1.5014 @@ -160,29 +166,7 @@ France,2019,2.7289 France,2020,2.6474 France,2021,2.9594 France,2022,2.7791 -Canada,2000,0.7448 -Canada,2001,0.739 -Canada,2002,0.7606 -Canada,2003,0.8955 -Canada,2004,1.0267 -Canada,2005,1.1731 -Canada,2006,1.3193 -Canada,2007,1.4688 -Canada,2008,1.553 -Canada,2009,1.3746 -Canada,2010,1.6173 -Canada,2011,1.7933 -Canada,2012,1.8284 -Canada,2013,1.8466 -Canada,2014,1.8057 -Canada,2015,1.5565 -Canada,2016,1.528 -Canada,2017,1.6493 -Canada,2018,1.7253 -Canada,2019,1.7437 -Canada,2020,1.6557 -Canada,2021,2.0075 -Canada,2022,2.1615 +France,2023,3.0309 Italy,2000,1.1467 Italy,2001,1.168 Italy,2002,1.2768 @@ -206,6 +190,7 @@ Italy,2019,2.0113 Italy,2020,1.8975 Italy,2021,2.1549 Italy,2022,2.067 +Italy,2023,2.2549 Brazil,2000,0.6554 Brazil,2001,0.56 Brazil,2002,0.5098 @@ -229,3 +214,28 @@ Brazil,2019,1.8733 Brazil,2020,1.4761 Brazil,2021,1.6706 Brazil,2022,1.9519 +Brazil,2023,2.1737 +Canada,2000,0.7448 +Canada,2001,0.739 +Canada,2002,0.7606 +Canada,2003,0.8955 +Canada,2004,1.0267 +Canada,2005,1.1731 +Canada,2006,1.3193 +Canada,2007,1.4688 +Canada,2008,1.553 +Canada,2009,1.3746 +Canada,2010,1.6173 +Canada,2011,1.7933 +Canada,2012,1.8284 +Canada,2013,1.8466 +Canada,2014,1.8057 +Canada,2015,1.5565 +Canada,2016,1.528 +Canada,2017,1.6493 +Canada,2018,1.7253 +Canada,2019,1.7437 +Canada,2020,1.6557 +Canada,2021,2.0075 +Canada,2022,2.1615 +Canada,2023,2.1401 diff --git a/datapackage.json b/datapackage.json index 096f935..84a095f 100644 --- a/datapackage.json +++ b/datapackage.json @@ -10,17 +10,17 @@ "last_updated": "2026-04-08", "licenses": [ { - "name": "ODC-PDDL-1.0", - "path": "http://opendatacommons.org/licenses/pddl/", - "title": "Open Data Commons Public Domain Dedication and License v1.0" + "name": "CC-BY-4.0", + "path": "https://creativecommons.org/licenses/by/4.0/", + "title": "Creative Commons Attribution 4.0" } ], "name": "gdp", "views": [ { "name": "top-economies", - "title": "GDP of the World's 10 Largest Economies (2000\u20132022)", - "description": "GDP in current USD for the top 10 economies since 2000. The US held a commanding lead throughout, while China's rapid ascent \u2014 from $1.2T in 2000 to $17.9T in 2022 \u2014 is the defining economic shift of the century. India overtook the UK and France in the early 2020s.", + "title": "GDP of the World's 10 Largest Economies (2000\u20132023)", + "description": "GDP in current USD for the top 10 economies since 2000. The US has held a commanding lead throughout, while China's rapid ascent is the defining economic shift of the century. India overtook the UK and France in the early 2020s.", "resources": [ "top-economies" ], @@ -59,7 +59,7 @@ "path": "data/top-economies.csv", "format": "csv", "mediatype": "text/csv", - "description": "GDP in current USD trillions for the world's 10 largest economies, covering 2000–2022. Derived from the main gdp resource.", + "description": "GDP in current USD trillions for the world's 10 largest economies, covering 2000–2023. Derived from the main gdp resource.", "schema": { "fields": [ { diff --git a/scripts/process.py b/scripts/process.py index 42b87ed..1bdb333 100644 --- a/scripts/process.py +++ b/scripts/process.py @@ -5,18 +5,30 @@ import zipfile import requests -from datetime import datetime, timedelta +from datetime import datetime cache = 'cache' -data = 'data/gdp.csv' script_dir = os.path.dirname(os.path.abspath(__file__)) +gdp_path = os.path.join(script_dir, '..', 'data', 'gdp.csv') +top_economies_path = os.path.join(script_dir, '..', 'data', 'top-economies.csv') +datapackage_path = os.path.join(script_dir, '..', 'datapackage.json') url = 'https://api.worldbank.org/v2/en/indicator/NY.GDP.MKTP.CD?downloadformat=csv' outheadings = ['Country Name', 'Country Code', 'Year', 'Value'] current_year = datetime.now().year -datapackage = '../datapackage.json' + +# World Bank codes that are regional/income-group aggregates, not individual countries. +# Used as a fallback filter when the Metadata_Country file is unavailable. +WB_AGGREGATE_CODES = { + 'AFE', 'AFW', 'ARB', 'CEB', 'CSS', 'EAP', 'EAR', 'EAS', 'ECA', 'ECS', + 'EMU', 'EUU', 'FCS', 'HIC', 'HPC', 'IBD', 'IBT', 'IDA', 'IDB', 'IDX', + 'LAC', 'LCN', 'LDC', 'LIC', 'LMC', 'LMY', 'LTE', 'MEA', 'MIC', 'MNA', + 'NAC', 'OEC', 'OED', 'OSS', 'PRE', 'PSS', 'PST', 'SAS', 'SSA', 'SSF', 'SST', + 'TEA', 'TEC', 'TLA', 'TMN', 'TSA', 'TSS', 'UMC', 'WLD', +} + def search_files_in_cache(): - cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cache') + cache_dir = os.path.join(script_dir, cache) if not os.path.exists(cache_dir): print("Cache folder does not exist!") @@ -31,6 +43,76 @@ def search_files_in_cache(): return filtered_files +def get_country_codes_from_metadata(): + """Return the set of individual-country codes from the WB Metadata_Country file. + + Countries have a non-blank Region field; aggregates do not. + Falls back to reading gdp.csv and excluding known aggregate codes if the + metadata file is absent (e.g. when running locally without a fresh cache). + Returns None only if neither source is available. + """ + cache_dir = os.path.join(script_dir, cache) + if os.path.exists(cache_dir): + metadata_files = [f for f in os.listdir(cache_dir) if 'Metadata_Country' in f] + if metadata_files: + meta_path = os.path.join(cache_dir, metadata_files[0]) + country_codes = set() + with open(meta_path, 'r') as f: + for row in csv.DictReader(f): + if row.get('Region', '').strip(): + country_codes.add(row['Country Code']) + return country_codes + + # Fallback: derive from gdp.csv by excluding known aggregate codes + if os.path.exists(gdp_path): + all_codes = set() + with open(gdp_path, 'r') as f: + for row in csv.DictReader(f): + all_codes.add(row['Country Code']) + return all_codes - WB_AGGREGATE_CODES + + return None + + +def generate_top_economies(gdp_csv, top_economies_csv, start_year=2000, n=10): + """Derive top-economies.csv from gdp.csv. + + Selects the n countries with the highest GDP in the latest available year, + then writes all their rows from start_year onward with GDP in USD trillions. + Returns (start_year, latest_year). + """ + country_codes = get_country_codes_from_metadata() + + country_data = {} + with open(gdp_csv, 'r') as f: + for row in csv.DictReader(f): + if country_codes is not None and row['Country Code'] not in country_codes: + continue + country = row['Country Name'] + year = int(row['Year']) + country_data.setdefault(country, {})[year] = float(row['Value']) + + latest_year = max(y for d in country_data.values() for y in d) + top_countries = sorted( + (c for c in country_data if latest_year in country_data[c]), + key=lambda c: country_data[c][latest_year], + reverse=True + )[:n] + + rows = [ + [country, year, round(country_data[country][year] / 1e12, 4)] + for country in top_countries + for year in sorted(y for y in country_data[country] if y >= start_year) + ] + + with open(top_economies_csv, 'w', newline='') as f: + writer = csv.writer(f) + writer.writerow(['country', 'year', 'gdp_trillion']) + writer.writerows(rows) + + return start_year, latest_year + + def transform_csv(dest): with open(dest, 'r') as f: reader = csv.reader(f) @@ -40,12 +122,12 @@ def transform_csv(dest): last_updated_date = updated_date_row[1].split("-") last_updated_date = f"{last_updated_date[0]}-{last_updated_date[1]}-{last_updated_date[2]}" - next(reader) + next(reader) header = next(reader) transformed_data = [] for row in reader: - country_name = row[0] - country_code = row[1] + country_name = row[0] + country_code = row[1] for i in range(4, len(row)): year = header[i] @@ -53,19 +135,32 @@ def transform_csv(dest): if value: transformed_data.append([country_name, country_code, year, value]) - + return last_updated_date, transformed_data -def update_datapackage(last_updated): - with open(datapackage, 'r') as f: + +def update_datapackage(last_updated, top_start_year, top_end_year): + with open(datapackage_path, 'r') as f: dp = json.load(f) dp['last_updated'] = str(last_updated) dp['version'] = str(current_year) - with open(datapackage, 'w') as f: + year_range = f"{top_start_year}–{top_end_year}" + for resource in dp['resources']: + if resource['name'] == 'top-economies': + resource['description'] = ( + f"GDP in current USD trillions for the world's 10 largest economies, " + f"covering {year_range}. Derived from the main gdp resource." + ) + for view in dp.get('views', []): + if view.get('name') == 'top-economies': + view['title'] = f"GDP of the World's 10 Largest Economies ({year_range})" + + with open(datapackage_path, 'w') as f: json.dump(dp, f, indent=2) + def extract_zip(): cache_dir = os.path.join(script_dir, cache) @@ -83,21 +178,21 @@ def extract_zip(): def process(): - extract_zip() # Ensure ZIP extraction happens first + extract_zip() file_name = search_files_in_cache()[0] dest = os.path.join(script_dir, cache, file_name) last_updated, transformed_data = transform_csv(dest) - output_dir = os.path.dirname('data/') - if not os.path.exists(output_dir): - os.makedirs(output_dir) + os.makedirs(os.path.dirname(gdp_path), exist_ok=True) - with open(os.path.abspath(data), 'w', newline='') as outfile: + with open(gdp_path, 'w', newline='') as outfile: writer = csv.writer(outfile) writer.writerow(outheadings) writer.writerows(transformed_data) - update_datapackage(last_updated) - + start_year, end_year = generate_top_economies(gdp_path, top_economies_path) + update_datapackage(last_updated, start_year, end_year) + + if __name__ == '__main__': process() From ecf7fa3c1d53c04dcaa97d2b60008f873de32032 Mon Sep 17 00:00:00 2001 From: Ola Rubaj <52197250+olayway@users.noreply.github.com> Date: Tue, 19 May 2026 14:57:00 +0200 Subject: [PATCH 3/3] Fix stale Data notes: top-economies.csv is now automated Co-Authored-By: Claude Sonnet 4.6 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 36b1eb9..19ed538 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ World Bank including: ## Data notes - `data/gdp.csv` is regenerated automatically each month by `scripts/process.py`. -- `data/top-economies.csv` is a manually maintained derived file covering the 10 largest economies for 2000–2022. It is **not** regenerated by the automated script, so its upper year bound may lag behind `gdp.csv`. +- `data/top-economies.csv` is derived from `gdp.csv` by the same script: it covers the 10 largest economies (by latest-year GDP) from 2000 onward, with values in USD trillions. ## Preparation