-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconfig.yaml
More file actions
277 lines (271 loc) · 10.9 KB
/
config.yaml
File metadata and controls
277 lines (271 loc) · 10.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# Versioned directories (used with config.get_dir_path())
# Each key names a version string. The first five keys share their names with
# the `directories` entries that set `versioned: true`; how the version is
# combined with that entry's `path` is decided by config.get_dir_path() and is
# not shown in this file (presumably appended as a sub-folder — confirm).
# All values are quoted so the digit-only dates load as strings, not integers.
versions:
osm_data: "20260416"
model_output: "20260422_by_shared_label"
snapshot_osm: "20260417"
snapshot_overture: "20260423"
conflation: "20260423"
# `source_coop` has no matching entry under `directories`; it names the upload
# folder described in the inline comment.
source_coop: "2026-04-23-v0" # Source Cooperative upload folder (YYYY-MM-DD-v<IDX>); bump v<IDX> only for same-day re-uploads
# Settings for downloading data
download:
general:
# Download timeout. Units are not stated in this file — presumably seconds;
# confirm against the code that reads it.
# Written as plain digits: an underscore-grouped value such as 1_000 is an
# integer only under YAML 1.1 rules, while YAML 1.2 core-schema parsers load
# it as a string.
timeout: 1000
# Census 1:20M cartographic state boundary file; includes 50 states + DC
# + PR. Used by all three snapshot downloads to restrict POIs to the US
# plus Puerto Rico. The coastline buffer expands the dissolved polygon
# outward by N metres so near-shore POIs are retained; internal state
# borders disappear on dissolve so the buffer only affects the coast.
boundary:
source_url: "https://www2.census.gov/geo/tiger/GENZ2023/shp/cb_2023_us_state_20m.zip"
# Buffer distance in metres (the `_m` suffix and the comment above).
coastline_buffer_m: 100
# OSM download settings: date window, Geofabrik PBF sources for the US and
# Puerto Rico, overwrite switches, and the tag keys to filter on and extract.
osm:
# NOTE(review): these dates are unquoted, so YAML 1.1 loaders (e.g. PyYAML)
# return date objects rather than strings. Left as-is because the consuming
# code may rely on that type — confirm before quoting.
start_date: 2016-01-01
end_date: 2025-12-31
# Current-state extracts (US, then Puerto Rico).
pbf_url: "https://download.geofabrik.de/north-america/us-latest.osm.pbf"
pr_pbf_url: "https://download.geofabrik.de/north-america/us/puerto-rico-latest.osm.pbf"
# Full-history PBFs live on Geofabrik's OAuth-protected internal server.
# Any OSM account grants access; generate a Netscape-format cookie jar by
# logging in at https://osm-internal.download.geofabrik.de/ and exporting
# cookies, or by running Geofabrik's oauth_cookie_client.py.
history_pbf_url: "https://osm-internal.download.geofabrik.de/north-america/us-internal.osh.pbf"
pr_history_pbf_url: "https://osm-internal.download.geofabrik.de/north-america/us/puerto-rico-internal.osh.pbf"
history_cookie_file: "~/data/openpois/.creds/geofabrik_cookies.txt"
# Overwrite switches — presumably one per pipeline stage (download, filter,
# parse); confirm against the scripts that read them.
overwrite_download: true
overwrite_filter: true
overwrite_parse: true
source_label: "osm"
keep_all_keys: false
# Written as plain digits: underscore-grouped values such as 100_000 are
# integers only under YAML 1.1 rules, while YAML 1.2 core-schema parsers load
# them as strings.
chunk_size: 100000
max_area_nodes: 1000
verbose: true
# Top-level OSM tag keys used for filtering. Every key listed here also appears
# in `extract_keys` below.
filter_keys: ['shop', 'healthcare', 'leisure', 'amenity', 'tourism', 'office', 'craft', 'historic']
# Tag keys and columns to extract. Presumably ignored when `keep_all_keys` is
# true — confirm.
extract_keys: [
'addr:city','addr:country','addr:housename','addr:housenumber','addr:postcode',
'addr:state','addr:street','addr:unit','amenity','atm','bar','bicycle',
'bicycle_parking','brand','brand:wikidata','building','check_date',
'check_date:opening_hours','craft','cuisine','description','education','email',
'emergency','fountain','geometry','healthcare','healthcare:speciality','historic',
'image','landuse','leisure','name','nursery','office','official_name','old_name',
'opening_date','opening_hours','osm_id','osm_type','phone','playground','preschool',
'recycling_type','religion','self_service','service','shelter','shop','short_name',
'social_facility','social_facility:for','source','sport','tourism','type','url',
'website','wikidata','wikipedia'
]
# Overture Maps download settings: release selection, S3 location, DuckDB
# resource caps, and the category allowlist.
overture:
release_date: null # null = auto-detect latest
# S3 bucket and region — presumably where the Overture release files are read
# from (the DuckDB comment below mentions per-part S3 scans); confirm.
s3_bucket: "overturemaps-us-west-2"
s3_region: "us-west-2"
# DuckDB resource caps for the per-part S3 scans and the final polygon
# filter. Peak host RAM ~= workers * memory_limit, peak CPU ~= workers *
# threads. Scale per-worker values down if raising workers above 1.
# With the values below, that formula gives roughly 2 * 4GB = 8GB of RAM and
# 2 * 2 = 4 threads at peak.
duckdb:
memory_limit: "4GB"
threads: 2
workers: 2
# (L0, L1) allowlist. L1 = null means "all of this L0".
# Entries intentionally exclude office/B2B-style L1s (corporate offices,
# media services, etc.), transit/parking/airports (covered elsewhere), and
# private lodging (Airbnb-style — duplicates residential addresses).
# NOTE(review): `personal_or_beauty_service` and `beauty_service` are both
# listed under lifestyle_services — confirm both are distinct L1 values in the
# Overture taxonomy and that neither is a leftover.
taxonomy_allowlist:
- [food_and_drink, null]
- [shopping, null]
- [arts_and_entertainment, null]
- [sports_and_recreation, null]
- [health_care, null]
- [lodging, null]
- [cultural_and_historic, null]
- [education, null]
- [lifestyle_services, personal_or_beauty_service]
- [lifestyle_services, wellness_service]
- [lifestyle_services, animal_or_pet_service]
- [lifestyle_services, beauty_service]
- [lifestyle_services, food_service]
- [services_and_business, financial_service]
- [services_and_business, legal_service]
- [services_and_business, professional_service]
- [services_and_business, real_estate_service]
- [services_and_business, home_service]
- [services_and_business, family_service]
- [community_and_government, social_or_community_service]
- [community_and_government, government_office]
- [community_and_government, civic_organization]
- [community_and_government, public_facility]
- [community_and_government, public_safety_service]
- [travel_and_transportation, fueling_station]
- [travel_and_transportation, vehicle_service]
# Settings for OSM exploratory data analysis
osm_data:
# OSM tag key the analysis focuses on, and how many of the most frequent types
# to report — presumably; confirm against the EDA script.
tag_key: name
top_n_types: 10
# Column names to treat as timestamps.
timestamp_cols:
- obs_timestamp
- last_obs_timestamp
- last_tag_timestamp
# `model_stub` equals the date prefix of versions.model_output
# ("20260422_by_shared_label") — presumably the two must be bumped together;
# confirm. Quoted so the digits load as a string.
apply_model:
model_stub: '20260422'
# Settings for scripts/models/osm_turnover.py (JAX turnover model)
osm_turnover_model:
# Overridable at the CLI via --model-type {constant,random_by_type}.
default_model_type: constant
# NOTE(review): the meaning of the two numbers is not stated here — presumably
# the same (location, scale) layout as logit_delta_var_prior below; confirm
# against the model code.
var_prior: [-1.0, 5.0]
# Tight hyperprior on log_tau (random-effect scale for per-group logit_delta
# in RandomByTypeModel). Tau median ≈ exp(-2) ≈ 0.135 on the logit scale —
# shrinks per-group δ toward the global intercept.
logit_delta_var_prior: [-2.0, 0.5]
# Column in osm_observations.parquet for grouping random effects.
# "shared_label" = shared taxonomy category
group_key: shared_label
# null presumably means "no restriction on group values", and min_value_count
# presumably drops groups with fewer observations than this — confirm both
# against the model script.
group_values: null
min_value_count: 5
# NUTS warmup (window adaptation) and retained-sample counts. Warmup should
# generally be >= n_samples for hierarchical models.
n_warmup: 500
n_samples: 500
# Number of independent chains (vmapped in parallel). n_chains > 1 enables
# R-hat and bulk ESS diagnostics at roughly linear wall-time cost on CPU.
n_chains: 4
save_full_model: true
# Directory definitions (used with config.get_dir_path())
# Each entry declares `versioned`, a base `path`, and (except `boundary`) a
# `files` map from a short name to a file name. Every entry with
# `versioned: true` has a same-named key under `versions`; `boundary` and
# `testing` are unversioned.
directories:
osm_data:
versioned: true
path: ~/data/openpois/osm_data
files:
# US+PR full-history pipeline (PBF-based)
osm_changes: osm_changes.parquet
osm_versions: osm_versions.parquet
raw_history_pbf: us-internal.osh.pbf
filtered_history_pbf: us-pois.osh.pbf
time_filtered_history_pbf: us-pois-timefilt.osh.pbf
raw_pr_history_pbf: puerto-rico-internal.osh.pbf
filtered_pr_history_pbf: puerto-rico-pois.osh.pbf
time_filtered_pr_history_pbf: puerto-rico-pois-timefilt.osh.pbf
us_versions: us_osm_versions.parquet
us_changes: us_osm_changes.parquet
pr_versions: pr_osm_versions.parquet
pr_changes: pr_osm_changes.parquet
# Modelling-ready observations (one row per POI version × shared_label)
osm_observations: osm_observations.parquet
# Outputs of the OSM turnover model.
model_output:
versioned: true
path: ~/data/openpois/osm_turnover_model
files:
fitted_params: fitted_params.csv
param_draws: param_draws.csv
predictions: predictions.csv
diagnostics: diagnostics.csv
inference_data: inference_data.nc
# Current-state OSM snapshot. The raw file names match the final path segment
# of download.osm.pbf_url and pr_pbf_url.
snapshot_osm:
versioned: true
path: ~/data/openpois/snapshots/osm
files:
raw_pbf: us-latest.osm.pbf
filtered_pbf: us-pois.osm.pbf
raw_pr_pbf: puerto-rico-latest.osm.pbf
filtered_pr_pbf: puerto-rico-pois.osm.pbf
snapshot: osm_snapshot.parquet
rated_snapshot: osm_snapshot_rated.parquet
# No file extension — presumably a directory of partitioned output; confirm.
partitioned: osm_snapshot_partitioned
pmtiles: osm_snapshot.pmtiles
# Unversioned, and the only entry without a `files` map.
boundary:
versioned: false
path: ~/data/openpois/boundary
snapshot_overture:
versioned: true
path: ~/data/openpois/snapshots/overture
files:
snapshot: overture_snapshot.parquet
conflation:
versioned: true
path: ~/data/openpois/conflation
files:
conflated: conflated.parquet
match_diagnostics: match_diagnostics.parquet
# No file extension — presumably a directory of partitioned output; confirm.
partitioned: conflated_partitioned
pmtiles: conflated.pmtiles
summary_by_label: summary_by_label.csv
# Unversioned location for small test snippets.
testing:
versioned: false
path: ~/data/openpois/testing
files:
osm_snippet: osm_snippet.csv
overture_snippet: overture_snippet.csv
# Settings for POI conflation
conflation:
overture_confidence_weight: 0.7
# Score threshold and search radii (metres, per the `_m` suffix) for
# cross-source matching.
min_match_score: 0.50
max_radius_m: 200
default_radius_m: 100
# Score-component weights. As written they sum to 1.0
# (0.0 + 0.50 + 0.30 + 0.20); whether the matcher requires that is not shown
# here — confirm before changing one in isolation.
distance_weight: 0.0
name_weight: 0.50
type_weight: 0.30
identifier_weight: 0.20
# Written as plain digits: underscore-grouped values such as 500_000 are
# integers only under YAML 1.1 rules, while YAML 1.2 core-schema parsers load
# them as strings.
chunk_size: 500000
chunk_target_pois: 200000
# Overture-internal deduplication runs before OSM × Overture
# matching. Self-matches Overture POIs, groups them into clusters,
# and drops non-winners so they never reach the cross-source stage.
# See ``openpois.conflation.dedup_overture.mark_no_conflate``.
overture_internal_dedup:
enabled: true
# Stricter than the cross-source values above: higher score threshold (0.75 vs
# 0.50) and smaller radius (100 vs 200).
min_match_score: 0.75
max_radius_m: 100
chunk_target_pois: 200000
duckdb_memory_limit: "4GB"
# Bounding box for test runs — presumably lon/lat degrees (roughly the
# Seattle, WA area if so); confirm the coordinate order with the reader.
test_bbox:
xmin: -122.45
ymin: 47.50
xmax: -122.25
ymax: 47.70
# Settings for publishing snapshots to Source Cooperative
# (https://source.coop/henryspatialanalysis/openpois). Source Coop is
# S3-compatible: uploads go to the literal bucket name below with keys
# prefixed by ``{repo_prefix}/``; public reads are served at
# ``{public_base_url}/``.
publish:
bucket: "us-west-2.opendata.source.coop"
repo_prefix: "henryspatialanalysis/openpois"
public_base_url: "https://data.source.coop/henryspatialanalysis/openpois"
# Path to a credentials file inside the repository checkout.
# NOTE(review): confirm this file is git-ignored so secrets are never committed.
credentials_file: "~/repos/openpois/.env.json"
geohash_precision_sort: 6 # ~0.6 km x 1.2 km; within-partition sort key for spatial row-group pruning
# Values surfaced in the per-version README. Set each round until the
# pipeline captures them automatically — see .claude/TODO.md.
# `osm_snapshot_date` currently matches versions.snapshot_osm ("20260417");
# presumably the two should be updated together — confirm.
version_metadata:
osm_snapshot_date: "2026-04-17" # YYYY-MM-DD Geofabrik download date
overture_release: "2026-04-15.0" # Overture Maps release ID, https://docs.overturemaps.org/release-calendar/
model_commit: null # null → use current git HEAD; set a short SHA to pin
# PMTiles generation — single-zoom archive at z14 for both OSM and conflated.
# Site's View.minZoom is 14; OpenLayers over-zooms past z14 natively so
# z15-20 render as lossless geometric scale-ups of the z14 tile.
pmtiles:
# min_zoom == max_zoom is what makes the archive single-zoom.
min_zoom: 14
max_zoom: 14
# Presumably forwarded to the tile builder as its feature-dropping option
# (the value matches tippecanoe's --drop-densest-as-needed flag name); confirm.
drop_strategy: "drop-densest-as-needed"
# Layer names written into the OSM and conflated archives respectively.
osm_layer_name: "osm_pois"
conflated_layer_name: "conflated_pois"
# Feature properties for the OSM layer — presumably the only attributes kept
# in the tiles; confirm against the generation script.
osm_properties:
- osm_id
- osm_type # node|way|relation — drives popup OSM link path
- source
- name
- conf_mean # drives confidence-based point coloring
- amenity
- shop
- leisure
- healthcare
- craft
- historic
- landuse
- office
- tourism
# Feature properties for the conflated layer — presumably the only attributes
# kept in the tiles; confirm against the generation script.
conflated_properties:
- unified_id
- source
- osm_id
- osm_type # node|way|relation — drives popup OSM link path
- shared_label
- conf_mean
- name
- brand
- match_score
- match_distance_m