Skip to content

Commit 3149d17

Browse files
authored
Merge pull request #30 from VACLab/duckdb-read-only
added optional read-only flag with default True to API
2 parents: cde35ee + d49ded8 · commit 3149d17

10 files changed

Lines changed: 156 additions & 140 deletions

File tree

biasanalyzer/api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def set_config(self, config_file_path: str):
4040
except ValidationError as ex:
4141
notify_users(f"configuration yaml file is not valid with validation error: {ex}", level="error")
4242

43-
def set_root_omop(self):
43+
def set_root_omop(self, read_only=True):
4444
if not self.config:
4545
notify_users(
4646
"no valid configuration to set root OMOP CDM data. "
@@ -62,7 +62,7 @@ def set_root_omop(self):
6262
self.bias_db = BiasDatabase(":memory:", omop_db_url=db_url)
6363
elif db_type == "duckdb":
6464
db_path = self.config["root_omop_cdm_database"].get("database", ":memory:")
65-
self.omop_cdm_db = OMOPCDMDatabase(db_path)
65+
self.omop_cdm_db = OMOPCDMDatabase(db_path, read_only=read_only)
6666
self.bias_db = BiasDatabase(":memory:", omop_db_url=db_path)
6767
else:
6868
notify_users(f"Unsupported database type: {db_type}")

biasanalyzer/cohort.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from biasanalyzer.concept import ConceptHierarchy
1111
from biasanalyzer.config import load_cohort_creation_config
1212
from biasanalyzer.database import BiasDatabase, OMOPCDMDatabase
13-
from biasanalyzer.models import CohortDefinition, DOMAIN_MAPPING
13+
from biasanalyzer.models import DOMAIN_MAPPING, CohortDefinition
1414
from biasanalyzer.utils import clean_string, hellinger_distance, notify_users
1515

1616

@@ -60,7 +60,7 @@ def get_concept_stats(
6060
Get cohort concept statistics such as concept prevalence
6161
"""
6262
if concept_type not in DOMAIN_MAPPING:
63-
raise ValueError(f'input concept_type {concept_type} is not a valid concept type to get concept stats')
63+
raise ValueError(f"input concept_type {concept_type} is not a valid concept type to get concept stats")
6464

6565
cohort_stats = self.bias_db.get_cohort_concept_stats(
6666
self.cohort_id,

biasanalyzer/database.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# ruff: noqa: S608
22
import gc
3-
import platform
43
from datetime import datetime
54
from typing import Optional
65

@@ -322,13 +321,13 @@ class OMOPCDMDatabase:
322321
_instance = None # indicating a singleton with only one instance of the class ever created
323322
_database_type = None
324323

325-
def __new__(cls, *args, **kwargs):
324+
def __new__(cls, db_url, read_only=True):
326325
if cls._instance is None:
327326
cls._instance = super().__new__(cls)
328-
cls._instance._initialize(*args, **kwargs) # Initialize only once
327+
cls._instance._initialize(db_url, read_only=read_only) # Initialize only once
329328
return cls._instance
330329

331-
def _initialize(self, db_url):
330+
def _initialize(self, db_url, read_only=True):
332331
if db_url.endswith(".duckdb"):
333332
# close any potential global connections if any
334333
for obj in gc.get_objects(): # pragma: no cover
@@ -340,11 +339,8 @@ def _initialize(self, db_url):
340339

341340
# Handle DuckDB connection
342341
try:
343-
if platform.system().lower() == "windows": # pragma: no cover
344-
# it is critical to set duckdb connection to be read-only on windows platform
345-
self.engine = duckdb.connect(db_url, read_only=True)
346-
else:
347-
self.engine = duckdb.connect(db_url)
342+
# it is critical to set duckdb connection to be read-only on windows and Mac platforms
343+
self.engine = duckdb.connect(db_url, read_only=read_only)
348344
notify_users(f"Connected to the DuckDB database: {db_url}.")
349345
except duckdb.Error as e: # pragma: no cover
350346
notify_users(f"Failed to connect to DuckDB: {e}", level="error")
@@ -573,4 +569,4 @@ def close(self):
573569
else:
574570
self.engine.dispose() # pragma: no cover
575571
OMOPCDMDatabase._instance = None
576-
notify_users("Connection to the OMOP CDM database closed.")
572+
notify_users("Connection to the OMOP CDM database closed.")

poetry.lock

Lines changed: 91 additions & 61 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ include = [
1010
{path = "biasanalyzer/sql_templates/*.sql", format=["sdist", "wheel"]}
1111
]
1212
[tool.poetry.dependencies]
13-
python = ">=3.8.10,<3.13"
13+
python = ">=3.9,<3.13"
1414
duckdb = "^1.1.1"
15-
pandas = "2.0.3"
15+
pandas = "^2.1.4"
1616

1717
scipy = [
1818
{version = ">=1.10.1,<1.11", markers = "python_version<'3.12'"},

scripts/ingest_csvs_to_omop_duckdb.py

Lines changed: 34 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -11,45 +11,30 @@
1111
"""
1212

1313
import argparse
14-
import sys
1514
import csv
15+
import sys
1616
import time
1717
from pathlib import Path
1818

1919
import duckdb
2020

21-
2221
FILENAME_STEM_TO_TABLE_NAME_MAPPING = {
2322
# 'demographics': 'person'
2423
# 'conditions': 'condition_occurrence'
2524
# 'drugs': 'drug_exposure'
2625
# 'procedures': 'procedure_occurrence'
2726
# 'visits': 'visit_occurrence'
28-
'observations': 'observation'
27+
"observations": "observation"
2928
}
3029

3130
COLUMN_MAPPINGS = {
32-
"person": {
33-
"deid_pat_id": "person_id"
34-
},
35-
"condition_occurrence": {
36-
"deid_pat_id": "person_id"
37-
},
38-
"drug_exposure": {
39-
"deid_pat_id": "person_id"
40-
},
41-
"procedure_occurrence": {
42-
"deid_pat_id": "person_id"
43-
},
44-
"visit_occurrence": {
45-
"deid_pat_id": "person_id"
46-
},
47-
"observation": {
48-
"deid_pat_id": "person_id"
49-
},
50-
"measurement": {
51-
"deid_pat_id": "person_id"
52-
},
31+
"person": {"deid_pat_id": "person_id"},
32+
"condition_occurrence": {"deid_pat_id": "person_id"},
33+
"drug_exposure": {"deid_pat_id": "person_id"},
34+
"procedure_occurrence": {"deid_pat_id": "person_id"},
35+
"visit_occurrence": {"deid_pat_id": "person_id"},
36+
"observation": {"deid_pat_id": "person_id"},
37+
"measurement": {"deid_pat_id": "person_id"},
5338
}
5439

5540
OMOP_TABLE_SCHEMAS = {
@@ -71,7 +56,7 @@
7156
"race_source_value",
7257
"race_source_concept_id",
7358
"ethnicity_source_value",
74-
"ethnicity_source_concept_id"
59+
"ethnicity_source_concept_id",
7560
],
7661
"condition_occurrence": [
7762
"condition_occurrence_id",
@@ -89,9 +74,9 @@
8974
"visit_detail_id",
9075
"condition_source_value",
9176
"condition_source_concept_id",
92-
"condition_status_source_value"
77+
"condition_status_source_value",
9378
],
94-
'drug_exposure': [
79+
"drug_exposure": [
9580
"drug_exposure_id",
9681
"person_id",
9782
"drug_concept_id",
@@ -114,9 +99,9 @@
11499
"drug_source_value",
115100
"drug_source_concept_id",
116101
"route_source_value",
117-
"dose_unit_source_value"
102+
"dose_unit_source_value",
118103
],
119-
'procedure_occurrence': [
104+
"procedure_occurrence": [
120105
"procedure_occurrence_id",
121106
"person_id",
122107
"procedure_concept_id",
@@ -132,9 +117,9 @@
132117
"visit_detail_id",
133118
"procedure_source_value",
134119
"procedure_source_concept_id",
135-
"modifier_source_value"
120+
"modifier_source_value",
136121
],
137-
'visit_occurrence': [
122+
"visit_occurrence": [
138123
"visit_occurrence_id",
139124
"person_id",
140125
"visit_concept_id",
@@ -151,9 +136,9 @@
151136
"admitted_from_source_value",
152137
"discharged_to_concept_id",
153138
"discharged_to_source_value",
154-
"preceding_visit_occurrence_id"
139+
"preceding_visit_occurrence_id",
155140
],
156-
'observation': [
141+
"observation": [
157142
"observation_id",
158143
"person_id",
159144
"observation_concept_id",
@@ -174,28 +159,29 @@
174159
"qualifier_source_value",
175160
"value_source_value",
176161
"observation_event_id",
177-
"obs_event_field_concept_id"
178-
]
162+
"obs_event_field_concept_id",
163+
],
179164
}
180165

166+
181167
def load_csv_to_duckdb(con, csv_path: Path, table_name: str):
182168
"""Load a single CSV file into DuckDB."""
183169
t0 = time.time()
184170
print(f"loading {table_name} from {csv_path}")
185171

186172
# read and normalize header
187-
with open(csv_path, "r", newline="") as f:
173+
with open(csv_path, newline="") as f:
188174
reader = csv.reader(f)
189175
raw_header = next(reader)
190176

191177
# normalize: lower case + strip quotes/spaces
192-
raw_header = [h.strip().replace('"', '') for h in raw_header]
178+
raw_header = [h.strip().replace('"', "") for h in raw_header]
193179
header = [h.lower() for h in raw_header]
194-
print(f'normalized header: {header}')
180+
print(f"normalized header: {header}")
195181

196182
mapping = COLUMN_MAPPINGS.get(table_name, {})
197183
final_cols = [mapping.get(col, col) for col in header]
198-
print(f'mapped header: {final_cols}')
184+
print(f"mapped header: {final_cols}")
199185

200186
expected = OMOP_TABLE_SCHEMAS.get(table_name, [])
201187
final_set = set(final_cols)
@@ -209,15 +195,15 @@ def load_csv_to_duckdb(con, csv_path: Path, table_name: str):
209195
extra = final_set - set(expected)
210196
if extra:
211197
print(f"WARNING: Extra columns in CSV for {table_name}: {sorted(extra)}")
212-
print(f"Extra columns will NOT be ingested.")
198+
print("Extra columns will NOT be ingested.")
213199

214200
select_clauses = []
215201
for orig, new in zip(raw_header, final_cols):
216202
if new not in expected:
217203
# skip extra columns entirely
218204
continue
219205
if orig != new:
220-
select_clauses.append(f'{orig} AS {new}')
206+
select_clauses.append(f"{orig} AS {new}")
221207
else:
222208
select_clauses.append(orig)
223209

@@ -268,9 +254,13 @@ def main():
268254
required=False,
269255
help="Directory containing OMOP vocabulary CSVs (concept, concept_relationship, etc.)",
270256
)
271-
parser.add_argument("--output", type=Path,
272-
default=Path("Y:/OMOP_duckdb/omop.duckdb"),
273-
required=False, help="Output DuckDB file path")
257+
parser.add_argument(
258+
"--output",
259+
type=Path,
260+
default=Path("Y:/OMOP_duckdb/omop.duckdb"),
261+
required=False,
262+
help="Output DuckDB file path",
263+
)
274264

275265
args = parser.parse_args()
276266

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ def test_db():
273273

274274
# mock configuration file
275275
bias = BIAS(config_file_path=config_file)
276-
bias.set_root_omop()
276+
bias.set_root_omop(read_only=False)
277277

278278
yield bias # Provide the connection to the test
279279

tests/query_based/test_cohort_creation.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,10 @@ def test_cohort_creation_baseline(caplog, test_db):
8686

8787
patient_ids = set([item["subject_id"] for item in cohort.data])
8888
assert_equal(len(patient_ids), 5)
89-
assert_equal(patient_ids, {'106', '108', '110', '111', '112'})
89+
assert_equal(patient_ids, {"106", "108", "110", "111", "112"})
9090
# select two patients to check for cohort_start_date and cohort_end_date automatically computed
91-
patient_106 = next(item for item in cohort.data if item["subject_id"] == '106')
92-
patient_108 = next(item for item in cohort.data if item["subject_id"] == '108')
91+
patient_106 = next(item for item in cohort.data if item["subject_id"] == "106")
92+
patient_108 = next(item for item in cohort.data if item["subject_id"] == "108")
9393

9494
# Replace dates with actual values from your test data
9595
assert_equal(
@@ -127,7 +127,7 @@ def test_cohort_creation_study(test_db):
127127
assert cohort.data is not None, "Cohort creation wrongly returned None data"
128128
patient_ids = set([item["subject_id"] for item in cohort.data])
129129
assert_equal(len(patient_ids), 4)
130-
assert_equal(patient_ids, {'108', '110', '111', '112'})
130+
assert_equal(patient_ids, {"108", "110", "111", "112"})
131131

132132

133133
def test_cohort_creation_study2(caplog, test_db):
@@ -155,7 +155,7 @@ def test_cohort_creation_study2(caplog, test_db):
155155
assert cohort.data is not None, "Cohort creation wrongly returned None data"
156156
patient_ids = set([item["subject_id"] for item in cohort.data])
157157
assert_equal(len(patient_ids), 1)
158-
assert_equal(patient_ids, {'106'})
158+
assert_equal(patient_ids, {"106"})
159159

160160

161161
def test_cohort_creation_all(caplog, test_db):
@@ -191,7 +191,7 @@ def test_cohort_creation_all(caplog, test_db):
191191
patient_ids = set([item["subject_id"] for item in cohort.data])
192192
print(f"patient_ids: {patient_ids}", flush=True)
193193
assert_equal(len(patient_ids), 2)
194-
assert_equal(patient_ids, {'108', '110'})
194+
assert_equal(patient_ids, {"108", "110"})
195195

196196

197197
def test_cohort_creation_multiple_temporary_groups_with_no_operator(test_db):
@@ -214,7 +214,7 @@ def test_cohort_creation_multiple_temporary_groups_with_no_operator(test_db):
214214
patient_ids = set([item["subject_id"] for item in cohort.data])
215215
print(f"patient_ids: {patient_ids}", flush=True)
216216
assert_equal(len(patient_ids), 2)
217-
assert_equal(patient_ids, {'108', '110'})
217+
assert_equal(patient_ids, {"108", "110"})
218218

219219

220220
def test_cohort_creation_mixed_domains(test_db):
@@ -242,7 +242,7 @@ def test_cohort_creation_mixed_domains(test_db):
242242
patient_ids = set([item["subject_id"] for item in cohort.data])
243243
print(f"patient_ids: {patient_ids}", flush=True)
244244
assert_equal(len(patient_ids), 3)
245-
assert_equal(patient_ids, {'1', '2', '6'})
245+
assert_equal(patient_ids, {"1", "2", "6"})
246246
start_dates = [item["cohort_start_date"] for item in cohort.data]
247247
assert_equal(len(start_dates), 3)
248248
assert_equal(start_dates, [datetime.date(2020, 6, 1), datetime.date(2020, 6, 1), datetime.date(2018, 1, 1)])
@@ -356,10 +356,10 @@ def test_cohort_creation_negative_instance(test_db):
356356

357357
patient_ids = set([item["subject_id"] for item in cohort.data])
358358
assert_equal(len(patient_ids), 6) # Female patients 1, 2, 3, 5
359-
assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'})
359+
assert_equal(patient_ids, {"1", "2", "3", "5", "6", "7"})
360360

361361
# Verify dates for a specific patient (e.g., patient 1 with last diabetes diagnosis)
362-
patient_1 = next(item for item in cohort.data if item["subject_id"] == '1')
362+
patient_1 = next(item for item in cohort.data if item["subject_id"] == "1")
363363
assert_equal(
364364
patient_1["cohort_start_date"],
365365
datetime.date(2020, 6, 1),
@@ -392,10 +392,10 @@ def test_cohort_creation_offset(test_db):
392392

393393
patient_ids = set([item["subject_id"] for item in cohort.data])
394394
assert_equal(len(patient_ids), 6) # Female patients 1, 2, 3, 5
395-
assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'})
395+
assert_equal(patient_ids, {"1", "2", "3", "5", "6", "7"})
396396

397397
# Verify dates for a specific patient (e.g., patient 1 with offset)
398-
patient_1 = next(item for item in cohort.data if item["subject_id"] == '1')
398+
patient_1 = next(item for item in cohort.data if item["subject_id"] == "1")
399399
# Diabetes on 2020-06-01: -730 days = 2018-06-02, +180 days = 2020-11-28
400400
assert_equal(
401401
patient_1["cohort_start_date"],
@@ -435,10 +435,10 @@ def test_cohort_creation_negative_instance_offset(test_db):
435435

436436
patient_ids = set([item["subject_id"] for item in cohort.data])
437437
assert_equal(len(patient_ids), 6)
438-
assert_equal(patient_ids, {'1', '2', '3', '5', '6', '7'})
438+
assert_equal(patient_ids, {"1", "2", "3", "5", "6", "7"})
439439

440440
# Verify dates for a specific patient (e.g., patient 1 with last diabetes and offset)
441-
patient_1 = next(item for item in cohort.data if item["subject_id"] == '1')
441+
patient_1 = next(item for item in cohort.data if item["subject_id"] == "1")
442442
# Last diabetes on 2020-06-01: +180 days = 2020-11-28
443443
assert_equal(
444444
patient_1["cohort_start_date"],

0 commit comments

Comments (0)