Skip to content

Commit 24e0187

Browse files
committed
Fix BindingDB live-data parser parity
1 parent 07ca727 commit 24e0187

6 files changed

Lines changed: 98 additions & 31 deletions

File tree

orion/metadata_aggregation.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ def _apply_reducer(
3030
row: dict[str, Any],
3131
item: Any,
3232
) -> None:
33+
when_spec = reducer_spec.get("when")
34+
if when_spec is not None and not evaluate_transform(when_spec, row=row, item=item):
35+
return
36+
3337
reducer_op = reducer_spec["op"]
3438

3539
if reducer_op == "collect_list":

orion/metadata_transforms.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -170,15 +170,23 @@ def evaluate_transform(
170170
if _is_missing(value):
171171
return None
172172
if isinstance(value, (int, float)):
173-
return float(value)
174-
normalized = str(value).strip().replace(",", "")
175-
for operator in spec.get("reject_operators", []):
176-
if normalized.startswith(operator):
177-
return None
178-
for operator in spec.get("strip_operators", ["<"]):
179-
if normalized.startswith(operator):
180-
normalized = normalized[len(operator):]
181-
return float(normalized)
173+
parsed_value = float(value)
174+
else:
175+
normalized = str(value).strip().replace(",", "")
176+
for operator in spec.get("reject_operators", []):
177+
if normalized.startswith(operator):
178+
return None
179+
for operator in spec.get("strip_operators", ["<"]):
180+
if normalized.startswith(operator):
181+
normalized = normalized[len(operator):]
182+
parsed_value = float(normalized)
183+
minimum_exclusive = spec.get("minimum_exclusive")
184+
if minimum_exclusive is not None and parsed_value <= float(minimum_exclusive):
185+
return None
186+
minimum_inclusive = spec.get("minimum_inclusive")
187+
if minimum_inclusive is not None and parsed_value < float(minimum_inclusive):
188+
return None
189+
return parsed_value
182190

183191
if op == "aggregate_value":
184192
if aggregate is None:

parser_specs/BINDING-DB/parser.yaml

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,7 @@ row_filters:
4444
- exists: chain1_swissprot_primary_id
4545

4646
emit:
47-
nodes:
48-
- id:
49-
op: field
50-
name: ligand_id
51-
- id:
52-
op: field
53-
name: protein_id
47+
nodes: []
5448
edges: []
5549

5650
aggregate:
@@ -76,20 +70,45 @@ aggregate:
7670
path: value
7771
reject_operators: [">"]
7872
strip_operators: ["<"]
73+
minimum_exclusive: 0
7974
publications:
8075
op: collect_unique
76+
when:
77+
op: parse_qualified_float
78+
value:
79+
op: item
80+
path: value
81+
reject_operators: [">"]
82+
strip_operators: ["<"]
83+
minimum_exclusive: 0
8184
value:
8285
op: prefix_if_present
8386
field: pmid
8487
prefix: "PMID:"
8588
pubchem_assay_ids:
8689
op: collect_unique
90+
when:
91+
op: parse_qualified_float
92+
value:
93+
op: item
94+
path: value
95+
reject_operators: [">"]
96+
strip_operators: ["<"]
97+
minimum_exclusive: 0
8798
value:
8899
op: prefix_if_present
89100
field: pubchem_aid
90101
prefix: "PUBCHEM.AID:"
91102
patent_ids:
92103
op: collect_unique
104+
when:
105+
op: parse_qualified_float
106+
value:
107+
op: item
108+
path: value
109+
reject_operators: [">"]
110+
strip_operators: ["<"]
111+
minimum_exclusive: 0
93112
value:
94113
op: prefix_if_present
95114
field: patent_number
@@ -114,6 +133,13 @@ aggregate:
114133
filters:
115134
- non_empty: supporting_affinities_nm
116135
emit:
136+
nodes:
137+
- id:
138+
op: group_key
139+
index: 0
140+
- id:
141+
op: group_key
142+
index: 1
117143
edges:
118144
- subject:
119145
op: group_key

parsers/BINDING/src/loadBINDINGDB.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
1+
import csv
12
import os
23
import enum
34
import math
45
import json
56
import requests
67

8+
from io import TextIOWrapper
79
from zipfile import ZipFile
810
from requests.adapters import HTTPAdapter, Retry
911

@@ -32,11 +34,14 @@ class BD_EDGEUMAN(enum.IntEnum):
3234
def negative_log(concentration_nm): ### This function converts nanomolar concentrations into log-scale units (pKi/pKd/pIC50/pEC50). ###
3335
return -(math.log10(concentration_nm*(10**-9)))
3436

35-
def generate_zipfile_rows(zip_file_path, file_inside_zip, delimiter='\\t'):
37+
def generate_zipfile_rows(zip_file_path, file_inside_zip, delimiter='\t'):
3638
with ZipFile(zip_file_path, 'r') as zip_file:
37-
with zip_file.open(file_inside_zip, 'r') as file:
38-
for line in file:
39-
yield str(line).split(delimiter)
39+
with zip_file.open(file_inside_zip, 'r') as raw_file:
40+
text_file = TextIOWrapper(raw_file, encoding='utf-8', newline='')
41+
reader = csv.reader(text_file, delimiter=delimiter)
42+
for row in reader:
43+
if row:
44+
yield row
4045

4146

4247
##############
@@ -138,6 +143,8 @@ def parse_data(self) -> dict:
138143
break
139144
if n%100000 == 0:
140145
self.logger.debug(f'processed {n} rows so far...')
146+
if len(row) <= BD_EDGEUMAN.UNIPROT_TARGET_CHAIN.value:
147+
continue
141148
ligand = row[BD_EDGEUMAN.PUBCHEM_CID.value]
142149
protein = row[BD_EDGEUMAN.UNIPROT_TARGET_CHAIN.value]
143150
if (ligand == '') or (protein == ''): # Check if Pubchem or UniProt ID is missing.

tests/resources/metadata_parser/bindingdb/parser.yaml

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -44,17 +44,7 @@ row_filters:
4444
- exists: chain1_swissprot_primary_id
4545

4646
emit:
47-
nodes:
48-
- id:
49-
op: field
50-
name: ligand_id
51-
categories:
52-
- biolink:SmallMolecule
53-
- id:
54-
op: field
55-
name: protein_id
56-
categories:
57-
- biolink:Protein
47+
nodes: []
5848
edges: []
5949

6050
aggregate:
@@ -78,20 +68,39 @@ aggregate:
7868
value:
7969
op: item
8070
path: value
71+
minimum_exclusive: 0
8172
publications:
8273
op: collect_unique
74+
when:
75+
op: parse_qualified_float
76+
value:
77+
op: item
78+
path: value
79+
minimum_exclusive: 0
8380
value:
8481
op: prefix_if_present
8582
field: pmid
8683
prefix: "PMID:"
8784
pubchem_assay_ids:
8885
op: collect_unique
86+
when:
87+
op: parse_qualified_float
88+
value:
89+
op: item
90+
path: value
91+
minimum_exclusive: 0
8992
value:
9093
op: prefix_if_present
9194
field: pubchem_aid
9295
prefix: "PUBCHEM.AID:"
9396
patent_ids:
9497
op: collect_unique
98+
when:
99+
op: parse_qualified_float
100+
value:
101+
op: item
102+
path: value
103+
minimum_exclusive: 0
95104
value:
96105
op: prefix_if_present
97106
field: patent_number
@@ -107,6 +116,17 @@ aggregate:
107116
name: average_affinity_nm
108117
precision: 2
109118
emit:
119+
nodes:
120+
- id:
121+
op: group_key
122+
index: 0
123+
categories:
124+
- biolink:SmallMolecule
125+
- id:
126+
op: group_key
127+
index: 1
128+
categories:
129+
- biolink:Protein
110130
edges:
111131
- subject:
112132
op: group_key

tests/test_metadata_driven_parser.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,11 @@ def row(pubchem_cid, protein, ki="", ic50="", kd="", ec50="", pmid="", aid="", p
6363
header,
6464
row("111", "P11111", ki="100", pmid="12345", aid="7001", patent="PAT-1"),
6565
row("111", "P11111", ki="10", pmid="23456", aid="7002", patent="PAT-1"),
66+
row("111", "P11111", ki="0", pmid="34567", aid="7003"),
6667
row("111", "P11111", ic50="200", pmid="12345", aid="7001"),
6768
row("222", "P22222", ec50="50", pmid="34567", aid="8001", patent="PAT-2"),
6869
row("", "P99999", ki="25", pmid="99999", aid="9999", patent="PAT-X"),
70+
["malformed", "row"],
6971
]
7072

7173
tsv_content = "\n".join("\t".join(row_values) for row_values in rows) + "\n"

0 commit comments

Comments
 (0)