-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTargetScan_DB_Parser.py
More file actions
133 lines (126 loc) · 8.22 KB
/
TargetScan_DB_Parser.py
File metadata and controls
133 lines (126 loc) · 8.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import requests
import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import load_workbook
import time
import re
import os
import sys
import datetime
# Record the start time so the total run time can be reported at the end.
time_start = datetime.datetime.now()
print (f"\nTime of start - {time_start.isoformat(sep=' ', timespec='seconds')}")

# Column layout of the output sheet, in the order the columns are written.
OUTPUT_COLUMNS = [
    'miRNA',
    'Ortholog of target gene',
    'Gene name',
    '3P-seq tags + 5',
    'Conservative total sites',
    'Conserved 8mer sites',
    'Conserved 7mer-m8 sites',
    'Conserved 7mer-A1 sites',
    'Poorly conserved sites total',
    'Poorly conserved 8mer sites',
    'Poorly conserved 7mer-m8 sites',
    'Poorly conserved 7mer-A1 sites',
    '6mer sites',
    'Representative miRNA',
    'Cumulative weighted context++ score',
    'Target total context++ score',
]

# Output column -> source column for downloaded tables that have 17 columns
# once 'Representative transcript' has been dropped ("large" tables).
LARGE_TABLE_COLUMNS = {
    'Ortholog of target gene': 'Ortholog of target gene',
    'Gene name': 'Gene name',
    '3P-seq tags + 5': '3P-seq tags + 5',
    'Conservative total sites': 'Conserved sites total',
    'Conserved 8mer sites': 'Conserved 8mer sites',
    'Conserved 7mer-m8 sites': 'Conserved 7mer-m8 sites',
    'Conserved 7mer-A1 sites': 'Conserved 7mer-A1 sites',
    'Poorly conserved sites total': 'Poorly conserved sites total',
    'Poorly conserved 8mer sites': 'Poorly conserved 8mer sites',
    'Poorly conserved 7mer-m8 sites': 'Poorly conserved 7mer-m8 sites',
    'Poorly conserved 7mer-A1 sites': 'Poorly conserved 7mer-A1 sites',
    '6mer sites': '6mer sites',
    'Representative miRNA': 'Representative miRNA',
    'Cumulative weighted context++ score': 'Cumulative weighted context++ score',
    'Target total context++ score': 'Total context++ score',
}

# Same mapping for tables that have 12 columns after the drop ("small"
# tables). They carry no "Poorly conserved ..." columns, so those output
# columns stay empty for such miRNAs.
SMALL_TABLE_COLUMNS = {
    'Ortholog of target gene': 'Ortholog of target gene',
    'Gene name': 'Gene name',
    '3P-seq tags + 5': '3P-seq tags + 5',
    'Conservative total sites': 'Total sites',
    'Conserved 8mer sites': '8mer sites',
    'Conserved 7mer-m8 sites': '7mer-m8 sites',
    'Conserved 7mer-A1 sites': '7mer-A1 sites',
    '6mer sites': '6mer sites',
    'Representative miRNA': 'Representative miRNA',
    'Cumulative weighted context++ score': 'Cumulative weighted context++ score',
    'Target total context++ score': 'Total context++ score',
}

# Column count of the downloaded table (after the drop) -> mapping to apply.
COLUMN_MAPS = {17: LARGE_TABLE_COLUMNS, 12: SMALL_TABLE_COLUMNS}

TARGETSCAN_CGI_BASE = 'http://www.targetscan.org/cgi-bin/targetscan/vert_72/'
TARGETSCAN_SITE_BASE = 'http://www.targetscan.org/'

# Seconds to wait for each HTTP response. Without a timeout a single stalled
# connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 60


def _download_target_table(miRNA):
    """Download the TargetScan table for *miRNA* and return its filtered rows.

    Follows the 'Download table' link on the miRNA result page, then the
    .xlsx link on the download page, and reads the workbook through a
    temporary file named ``{miRNA}_output.xlsx`` in the working directory.

    Returns the table without the 'Representative transcript' column and
    restricted to rows whose 'Total context++ score' is <= -0.5.

    Raises:
        IndexError: one of the two expected links is not on the page.
        requests.exceptions.RequestException: an HTTP request failed or
            timed out.
    """
    url_TargetScan = (
        f"{TARGETSCAN_CGI_BASE}targetscan.cgi"
        f"?species=Rat&gid=&mir_sc=&mir_c=&mir_nc=&mir_vnc=&mirg={miRNA}"
    )
    # Main result page for this miRNA.
    resp = requests.get(url_TargetScan, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(resp.text, 'html.parser')
    download_link_1 = soup.find_all(href=re.compile('species'), string='Download table')[0]

    # Page that lists the downloadable files.
    resp_download = requests.get(
        TARGETSCAN_CGI_BASE + download_link_1.get('href'), timeout=REQUEST_TIMEOUT
    )
    soup = BeautifulSoup(resp_download.text, 'html.parser')
    # The dot is escaped so only a literal ".xlsx" matches.
    download_link_2 = soup.find_all(href=re.compile(r'\.xlsx'))[0]

    # The .xlsx table itself.
    download_table = requests.get(
        TARGETSCAN_SITE_BASE + download_link_2.get('href'), timeout=REQUEST_TIMEOUT
    )

    tmp_path = f"{miRNA}_output.xlsx"
    with open(tmp_path, 'wb') as tmp_file:
        tmp_file.write(download_table.content)
    try:
        table = pd.read_excel(tmp_path)
    finally:
        # Remove the temporary download even when it cannot be parsed.
        os.remove(tmp_path)

    table = table.drop(columns=['Representative transcript'])
    return table.loc[table['Total context++ score'] <= -0.5]


def _build_output_rows(miRNA, table):
    """Rearrange a filtered TargetScan *table* into the output column layout.

    The mapping is chosen by the number of columns in *table* (17 or 12).
    Any other column count yields an empty frame and prints a warning.
    Output columns without a source column are left empty.
    """
    column_map = COLUMN_MAPS.get(len(table.columns))
    if column_map is None:
        print(f"Unexpected table layout for {miRNA} ({len(table.columns)} columns)...")
        return pd.DataFrame(columns=OUTPUT_COLUMNS)
    rows = pd.DataFrame({out_col: table[src_col] for out_col, src_col in column_map.items()})
    rows['miRNA'] = miRNA
    return rows.reindex(columns=OUTPUT_COLUMNS)


with pd.ExcelWriter('TargetScan_output_table.xlsx') as writer:
    print('Writing to the outfile...')
    # An empty frame writes only the header row of the output sheet.
    pd.DataFrame(columns=OUTPUT_COLUMNS).to_excel(
        writer, sheet_name='TargetScan', header=True, index=False
    )
    in_values = pd.read_excel('total_m_mi.xlsx', sheet_name='total miRNA')['miRNA']
    max_row = 1          # next free row in the sheet; row 0 holds the header
    empty_table = []     # miRNAs whose table contributed no rows
    no_page = []         # miRNAs that raised AttributeError
    no_table = []        # miRNAs for which a download link was missing
    for num, miRNA in enumerate(in_values, start=1):
        try:
            pd_df = _build_output_rows(miRNA, _download_target_table(miRNA))
            row_count = len(pd_df.index)
            print(f"Adding {row_count} rows for {miRNA}...")
            if row_count == 0:
                empty_table.append(miRNA)
            # Append below the rows written so far; the header is already there.
            pd_df.to_excel(writer, sheet_name='TargetScan', header=False, index=False, startrow=max_row)
            max_row += row_count
        except AttributeError:
            # NOTE(review): presumably raised when the page yields nothing
            # usable for this miRNA - confirm which call raises it.
            no_page.append(miRNA)
            print(f"No {miRNA} was found in TargetScan DB... Skipping...")
        except IndexError:
            # find_all(...) returned no match, so there is nothing to download.
            no_table.append(miRNA)
            print(f"No downloadable table was found for {miRNA}... Skipping...")
        except requests.exceptions.RequestException:
            # requests raises its own exception hierarchy; the builtin
            # ConnectionError would not match these failures.
            print(f"Connection error for {miRNA}. Skipping...")
        finally:
            # Pause between miRNAs to avoid hammering the server.
            time.sleep(1)
        print(f"{num}/{len(in_values)}", end='\n')
# Stamp the finish time first so the reported duration excludes the time
# spent printing the report below.
time_end = datetime.datetime.now()

# miRNAs that were processed but contributed no rows to the output sheet.
print('\nNo rows were added for this list of miRNA\'s: ')
print(''.join(str(name) + ' ' for name in empty_table), end='')

# miRNAs with no result page, followed by those with no download link,
# printed under one shared heading.
print('\nNo page or downloadable link has been found for: ')
print(''.join(str(name) + ' ' for name in no_page + no_table), end='')

finished_at = time_end.isoformat(sep=' ', timespec='seconds')
elapsed = time_end - time_start
print(f"\nTime of finish - {finished_at}. Time of executing - {elapsed}.")
print('\nDone!')