forked from corinnabrungs/msn_tree_library
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbroadinstitute_client.py
More file actions
67 lines (54 loc) · 2.16 KB
/
broadinstitute_client.py
File metadata and controls
67 lines (54 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd
import logging
from tqdm import tqdm
import pandas_utils
from date_utils import iso_datetime_now
from meta_constants import MetaColumns
from pandas_utils import (
left_merge_retain_index,
add_column_prefix,
update_dataframes,
create_missing_columns,
read_dataframe,
)
from drug_utils import map_clinical_phase_to_number
tqdm.pandas()
def broad_list_search(df):
    """Annotate ``df`` with data from the Broad Institute drug list.

    Rows are matched on ``MetaColumns.split_inchikey``. When that column is
    missing but ``MetaColumns.inchikey`` is present, the split key column is
    derived with ``split_inchikey`` and written into ``df`` itself (the
    caller's dataframe gains this column).

    The drug list is read from ``data/broad_institute_drug_list.csv``
    (download from: https://clue.io/repurposing#download-data). Its columns
    are prefixed with ``broad_`` before being handed to ``update_dataframes``.
    ``broad_clinical_phase`` is converted with
    ``map_clinical_phase_to_number`` and ``MetaColumns.date_broad_drug_list``
    is set to the current timestamp on every row that had a non-null split
    key, whether or not the drug list contained a match.

    :param df: dataframe to annotate; expected to carry
        ``MetaColumns.split_inchikey`` or ``MetaColumns.inchikey``.
    :return: the result of ``update_dataframes(results, df)``, or ``df``
        unchanged when there is no key column or no non-null key to search.
    """
    from rdkit_mol_identifiers import split_inchikey
    from drug_utils import map_clinical_phase_to_number

    key_column = MetaColumns.split_inchikey

    if key_column not in df:
        if MetaColumns.inchikey not in df:
            # Without any inchikey information there is nothing to match on.
            # Treat it like the "no usable keys" case below instead of
            # failing with a KeyError on the column lookup.
            logging.warning(
                "Skipping broad institute drug list search: no inchikey column found"
            )
            return df
        df[key_column] = [
            split_inchikey(inchikey) for inchikey in df[MetaColumns.inchikey]
        ]

    # only merge on id column where id is notnull
    results = df.loc[df[key_column].notnull(), [key_column]].copy()
    if results.empty:
        return df

    logging.info("Search broad institute list of drugs by first block of inchikey")
    # download from: https://clue.io/repurposing#download-data
    prefix = "broad_"
    broad_df = read_dataframe("data/broad_institute_drug_list.csv")
    broad_df[key_column] = [
        split_inchikey(inchikey) for inchikey in broad_df["InChIKey"]
    ]
    broad_df = broad_df.drop(columns=["InChIKey"])
    # need unique split_inchikey rows for broad to merge later
    broad_df = broad_df.drop_duplicates(subset=key_column)

    results = left_merge_retain_index(results, broad_df, on=key_column)
    results = add_column_prefix(results, prefix, columns_to_keep=key_column)

    # converting the clinical phases to numbers (remove phase,
    # preclinic (as 0.5), or launched)
    phase_column = f"{prefix}clinical_phase"
    # the drug list may lack a clinical phase column; make sure it exists
    results = create_missing_columns(results, [phase_column])
    results[phase_column] = [
        map_clinical_phase_to_number(phase) for phase in results[phase_column]
    ]

    results[MetaColumns.date_broad_drug_list] = iso_datetime_now()
    return update_dataframes(results, df)