-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript1.py
More file actions
144 lines (117 loc) · 5.79 KB
/
script1.py
File metadata and controls
144 lines (117 loc) · 5.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import re
import csv
import pandas as pd
import requests
from tqdm import tqdm
def find_valid_pdb_codes(text):
pattern = r'\b(?:[a-z][0-9]|[0-9][a-z])[a-z0-9]{2}\b'
pdb_codes = re.findall(pattern, text)
return pdb_codes
def retrieve_uniprot_ids(pdb_codes):
uniprot_ids = {}
failed_pdb_codes = []
for code in tqdm(pdb_codes, desc="Fetching UniProt IDs"):
url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{code}"
response = requests.get(url)
if response.status_code == 200:
data = response.json()
if code in data and "UniProt" in data[code]:
uniprot_ids[code] = next(iter(data[code]["UniProt"])) # Get the first (and only) UniProt ID
else:
failed_pdb_codes.append(code)
else:
failed_pdb_codes.append(code)
return uniprot_ids, failed_pdb_codes
def retrieve_fasta_content(uniprot_ids):
fasta_content = ""
for uniprot_id in tqdm(uniprot_ids.values(), desc="Fetching FASTA"):
url = f"https://www.uniprot.org/uniprot/{uniprot_id}.fasta"
response = requests.get(url)
if response.status_code == 200:
fasta_content += response.text
return fasta_content
if __name__ == "__main__":
# Fetch the current working directory
current_directory = os.getcwd()
print("Current working directory:", current_directory)
# Prompt the user to enter only the file name
file_name = input("Enter the name of the file (txt or csv) containing PDB codes (e.g. input.txt): ")
# Join the current directory with the provided file name
file_path = os.path.join(current_directory, file_name)
# Print out the constructed file path
print("File path:", file_path)
# Verify if the file exists
if not os.path.exists(file_path):
print("File does not exist.")
exit()
# Read input text from the specified file
if file_name.endswith('.txt'):
with open(file_path, "r") as f:
example_text = f.read()
elif file_name.endswith('.csv'):
with open(file_path, "r") as f:
reader = csv.reader(f)
example_text = ' '.join([row[0] for row in reader])
else:
print("Invalid file format. Please provide a txt or csv file.")
exit()
# Prompt the user to enter the name of the query file
query_file_name = input("Enter the name of the fasta query file (e.g. query.fasta): ")
query_file_path = os.path.join(current_directory, query_file_name)
if not os.path.exists(query_file_path):
print("Query file does not exist.")
exit()
# Find valid PDB codes
valid_pdb_codes = find_valid_pdb_codes(example_text)
# Remove PDB duplicates
unique_pdb_codes = list(set(valid_pdb_codes))
# Retrieve UniProt IDs
uniprot_ids, failed_pdb_codes = retrieve_uniprot_ids(unique_pdb_codes)
# Retrieve fasta content
fasta_content = retrieve_fasta_content(uniprot_ids)
# Create DataFrame for successful retrievals
df = pd.DataFrame({'PDB code': list(uniprot_ids.keys()), 'UniProt ID': list(uniprot_ids.values())})
# Add sequence order
df['Order'] = range(1, len(df) + 1)
# Reorder columns
df = df[['Order', 'PDB code', 'UniProt ID']]
# Write successful results to CSV
output_complete_csv_file = "output_full_list.csv"
df.to_csv(output_complete_csv_file, index=False)
print(f"\nA total of {len(df)} unique PDB codes and corresponding UniProt IDs have been found and written to {output_complete_csv_file}")
# Write failed results to CSV
failed_csv_file = "out_uniprot_failed.csv"
with open(failed_csv_file, 'w', newline='') as csvfile:
fieldnames = ['PDB code', 'UniProt ID']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for pdb_code in failed_pdb_codes:
writer.writerow({'PDB code': pdb_code, 'UniProt ID': 'Failed to retrieve'})
print(f"A total of {len(failed_pdb_codes)} PDB codes failed to retrieve UniProt IDs and have been written to {failed_csv_file}")
# Write PDB codes to a second CSV file separated by commas
out_pdbs_csv_file = "out_pdbs.csv"
with open(out_pdbs_csv_file, 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(unique_pdb_codes)
print(f"\nPDB codes have been written to {out_pdbs_csv_file}")
# Write UniProt IDs to a third CSV file separated by commas
out_uniprotids_csv_file = "out_uniprotids.csv"
with open(out_uniprotids_csv_file, 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(uniprot_ids.values())
print(f"UniProt IDs have been written to {out_uniprotids_csv_file}")
# Write fasta content to a single output_all.fasta file
output_all_fasta_file = "output_all.fasta"
with open(output_all_fasta_file, 'w') as fasta_file:
fasta_file.write(fasta_content)
print(f"\nAll FASTA sequences have been concatenated and written to {output_all_fasta_file}")
# Run BLASTp with the query file and output_all.fasta
db_path = os.path.join(current_directory, output_all_fasta_file)
cmd_makeblastdb = f"makeblastdb -in {db_path} -dbtype prot"
os.system(cmd_makeblastdb)
# Run BLASTp with the query file and output_all.fasta
cmd_blastp = f"blastp -query {query_file_path} -db {output_all_fasta_file} -out output_blastp.txt -outfmt 7"
os.system(cmd_blastp)
output_blastp = "output_blastp.txt"
print(f"The alignment using BLASTp was completed successfully. \nPlease check details on {output_blastp} \n\nThank you for using my script! \nAuthor: Dr. Guilherme M. Silva - Harvard Medical School - BIDMC - guimsilva@gmail.com \n ")