forked from corinnabrungs/msn_tree_library
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathchembl_client.py
More file actions
149 lines (129 loc) · 5.35 KB
/
chembl_client.py
File metadata and controls
149 lines (129 loc) · 5.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import logging
import numpy as np
import pandas as pd
from chembl_webresource_client.new_client import new_client as chembl
import synonyms
from date_utils import create_expired_entries_dataframe, iso_datetime_now
from meta_constants import MetaColumns
from pandas_utils import (
notnull,
isnull,
update_dataframes,
make_str_floor_to_int_number,
get_unique_list,
notnull_not_empty,
isnull_or_empty,
)
from drug_utils import map_clinical_phase_to_number
from tqdm import tqdm
import datetime as dt
def get_chembl_mol(chembl_id=None, inchikey=None):
    """Fetch one ChEMBL molecule record by ChEMBL ID or, failing that, by InChIKey.

    The ID lookup is tried first. When it yields nothing or raises, the
    InChIKey is used to filter molecules by their standard InChIKey and the
    first hit is returned.

    :param chembl_id: ChEMBL identifier to look up; may be null or empty.
    :param inchikey: InChIKey used as fallback; may be null or empty.
    :return: the molecule record delivered by the ChEMBL client, or ``None``
        when both identifiers are missing, nothing was found, or the query
        failed.
    """
    try:
        if isnull_or_empty(chembl_id) and isnull_or_empty(inchikey):
            return None

        if notnull_not_empty(chembl_id):
            try:
                comp = chembl.molecule.get(chembl_id)
            except Exception as e:
                # A failing ID lookup must not skip the InChIKey fallback below.
                logging.warning("Error during chembl query: %s", e)
                comp = None
            if comp:
                return comp

        compounds = None
        if notnull_not_empty(inchikey):
            compounds = chembl.molecule.filter(
                molecule_structures__standard_inchi_key=inchikey
            )

        if not compounds:
            logging.info(
                "NO ChEMBL FOR: chemblid: {} or inchikey: {}".format(
                    chembl_id, inchikey
                )
            )
            return None
        return compounds[0]
    except Exception as e:
        # Best effort: a failed query is reported and treated as "no result".
        # The exception needs a %s placeholder; without one the logging module
        # cannot format the record and the error text is lost.
        logging.warning("Error during chembl query: %s", e)
        return None
def chembl_search_id_and_inchikey(
    df, refresh_expired_entries_after: dt.timedelta = dt.timedelta(days=90)
) -> pd.DataFrame:
    """Annotate rows with ChEMBL data looked up by ChEMBL ID or InChIKey.

    Only the rows selected by ``create_expired_entries_dataframe`` (using
    ``MetaColumns.date_chembl_search`` and ``refresh_expired_entries_after``)
    are queried. Rows for which ``get_chembl_mol`` returns nothing are left
    out of the update. The annotated rows are combined with ``df`` through
    ``update_dataframes``.

    A ``chembl_id`` column is added to ``df`` in place when it is missing; the
    ``inchikey`` column is read and must already exist.

    :param df: data frame holding the compounds to annotate.
    :param refresh_expired_entries_after: handed to
        ``create_expired_entries_dataframe`` together with the search-date column.
    :return: the result of ``update_dataframes(filtered, df)`` without the
        temporary ``result_column``.
    """
    logging.info("Search ChEMBL by chemblid or inchikey")
    if "chembl_id" not in df.columns:
        df["chembl_id"] = None

    # restrict the work to rows that are old or were never searched before
    filtered = create_expired_entries_dataframe(
        df, MetaColumns.date_chembl_search, refresh_expired_entries_after
    )

    identifier_pairs = zip(filtered["chembl_id"], filtered["inchikey"])
    filtered["result_column"] = [
        get_chembl_mol(cid, ikey)
        for cid, ikey in tqdm(identifier_pairs, total=len(filtered))
    ]

    # keep only the rows that produced a ChEMBL record
    has_record = filtered["result_column"].notnull()
    filtered = filtered[has_record].copy()
    compounds = filtered["result_column"]

    def from_compound(key):
        # one value per remaining row, read from the ChEMBL record
        return [record[key] for record in compounds]

    # refresh date
    filtered[MetaColumns.date_chembl_search] = iso_datetime_now()
    filtered["chembl_id"] = from_compound("molecule_chembl_id")
    # (compound_name is intentionally not extended with pref_name here)
    for column, key in (
        ("prodrug", "prodrug"),
        ("availability", "availability_type"),
    ):
        filtered[column] = from_compound(key)

    # The raw phase is stored in the column first and mapped afterwards, so the
    # mapping receives the values exactly as the data frame holds them.
    filtered["chembl_clinical_phase"] = from_compound("max_phase")
    filtered["chembl_clinical_phase"] = list(
        map(map_clinical_phase_to_number, filtered["chembl_clinical_phase"])
    )

    filtered["withdrawn"] = from_compound("withdrawn_flag")
    approval_years = pd.array(from_compound("first_approval"), dtype=pd.Int64Dtype())
    filtered[MetaColumns.first_approval] = approval_years
    filtered = make_str_floor_to_int_number(filtered, MetaColumns.first_approval)

    for column, key in (
        ("oral", "oral"),
        ("parenteral", "parenteral"),
        ("topical", "topical"),
        ("natural_product", "natural_product"),
        ("usan_stem_definition", "usan_stem_definition"),
        ("chembl_indication", "indication_class"),
        ("chembl_atc_classifications", "atc_classifications"),
    ):
        filtered[column] = from_compound(key)

    # properties sometimes None
    props = from_compound("molecule_properties")

    def from_props(key):
        return [prop[key] if notnull(prop) else None for prop in props]

    for column, key in (
        ("molecular_species", "molecular_species"),
        ("chembl_alogp", "alogp"),
        ("chembl_cx_logp", "cx_logp"),
    ):
        filtered[column] = from_props(key)

    # withdrawn_class / withdrawn_reason / withdrawn_year / withdrawn_country
    # are no longer read: this was changed by the ChEMBL api.

    # add new synonyms
    new_synonyms = list(map(extract_synonyms, compounds))
    filtered = synonyms.add_synonyms_columns(
        filtered, new_synonyms=new_synonyms, prepend=False
    )

    # combine new data with old rows that were not processed
    combined = update_dataframes(filtered, df)
    return combined.drop(columns=["result_column"], errors="ignore")
def extract_synonyms(compound) -> list:
    """Collect the whitespace-stripped synonyms of a ChEMBL molecule record.

    :param compound: molecule record providing a ``molecule_synonyms`` entry,
        each element of which carries a ``molecule_synonym`` string; may be null.
    :return: an empty list when the record or its synonym entry is null or
        empty, otherwise the stripped synonyms passed through
        ``get_unique_list`` (presumably de-duplicating; helper not shown here).
    """
    if not isnull(compound):
        entries = compound["molecule_synonyms"]
        if not (isnull(entries) or len(entries) == 0):
            stripped = []
            for entry in entries:
                stripped.append(entry["molecule_synonym"].strip())
            return get_unique_list(stripped)
    return []