-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathblast_script.py~
More file actions
executable file
·89 lines (64 loc) · 3.69 KB
/
blast_script.py~
File metadata and controls
executable file
·89 lines (64 loc) · 3.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/python
from multiprocessing import Pool
import time
import os
import sys
import argparse
# Copyright(C) 2013 David Ream
# Released under Biopython license. http://www.biopython.org/DIST/LICENSE
# Do not remove this comment
def do_parallel_blast(arg_tuple):
db, query_file, blast_result_folder, num_processors, eval_threshold = arg_tuple
out_file = "%s%s_prot.txt" % (blast_result_folder, db.split('/')[-1].split('.')[0])
cmd = "blastall -p tblastn -a %i -i %s -d %s -e %s -o %s -m 9" % (num_processors, query_file, db, eval_threshold, out_file)
os.system( cmd )
def check_options(parsed_args):
# section of code that deals determining the number of CPU cores that will be used by the program
if parsed_args.num_proc > os.sysconf("SC_NPROCESSORS_CONF"):
num_proc = os.sysconf("SC_NPROCESSORS_CONF")
elif parsed_args.num_proc < 1:
num_proc = 1
else:
num_proc = parsed_args.num_proc
if os.path.exists(parsed_args.infile):
infile = parsed_args.infile
else:
print "The file %s does not exist." % parsed_args.infile
sys.exit()
if os.path.exists(parsed_args.query):
query = parsed_args.query
else:
print "The file %s does not exist." % parsed_args.query
sys.exit()
# if the directory that the user specifies does not exist, then the program makes it for them.
folder = parsed_args.folder
if not os.path.isdir(parsed_args.folder):
os.makedirs(parsed_args.folder)
e_val = parsed_args.eval
return infile, query, folder, num_proc, e_val
def parallel_blast(infile, query, folder, num_proc, e_val = '1e-10'):
# you kinda have to trust me here, but having blast run on as many threads per CPU as you have total processors is fastest
# I have no idea why this is... ugh.
blast_arg_list = [(i.strip(), query, folder, num_proc, e_val) for i in open(infile).readlines()]
pool = Pool(processes = num_proc)
pool.map(do_parallel_blast, blast_arg_list)
def main():
start = time.time()
parser = argparse.ArgumentParser(description="Conduct a BLAST search over a list of BLAST searchable databases using a common query file. The program will save the results in a folder designated by the user or the default './blast_result/'.")
parser.add_argument("-i", "--infile", dest="infile", default='', metavar="FILE", required=True,
help="A file that contains the path to every organism database that you are interested in.")
parser.add_argument("-f", "--folder", dest="folder", metavar="FOLDER", default='./blast_result/',
help="Folder where the BLAST results will be stored. Default is the folder './blast_result/'.")
parser.add_argument("-q", "--query", dest="query", default='./blast_database_list.txt', metavar="FILE",
help="A file that contains the BLAST query for every gene of interest in the dataset.")
parser.add_argument("-n", "--num_proc", dest="num_proc", metavar="INT", default = os.sysconf("SC_NPROCESSORS_CONF"), type=int,
help="Number of processors that you want this script to run on. The default is every CPU that the system has.")
parser.add_argument("-e", "--eval", dest="eval", default='1e-10', metavar="FLOAT",
help="eval for the BLAST search.")
parsed_args = parser.parse_args()
infile, query, folder, num_proc, e_val = check_options(parsed_args)
parallel_blast(infile, query, folder, num_proc, e_val)
print time.time() - start
# ./blast_script.py -i blast_database_list.txt -f ./blast_result/ -q ./fasta_generation/protein_matches.fa
if __name__ == '__main__':
main()