-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathFilter
More file actions
executable file
·203 lines (178 loc) · 7.67 KB
/
Filter
File metadata and controls
executable file
·203 lines (178 loc) · 7.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#!/usr/bin/env python
'''
Filter program accepts the following three inputs:
(1) a UniProtKB/swissProt file
(2) a taxon id and
(3) an optional output file name
When the output file name is NOT given, it will construct an output file
name by combining a portion of the UniProtKB/SwissProt file name and the
taxon id supplied by the user. It extracts the protein sequences for the
species identified by the taxon id and writes them to the output file.
How to run this program:
For some input file uniprot_sprot.dat.2014_09 and taxon id 559292
> python Filter -I1=uniprot_sprot.dat.2014_09 -G=559292
Two output files will be created:
A target sequence file: uniprot_sprot.dat.2014_09.559292.tfa.1
This will contain the target sequences specific to the organism
559292.
A target id and protein name mapping file:
uniprot_sprot.dat.2014_09.559292.tfa.1.map
This will contain the mapping of target sequence ids and protein
names.
'''
import os
import sys
from os.path import basename
import ConfigParser as cp
import shutil
import subprocess
import ArgParser_Filter as ap
import Config
import Filter_sp_targets as ft
import FormatChecker as fc
import LocateDataset as ld
class bcolors:
    """
    Terminal escape sequences used to decorate console messages.

    Wrap a message as ``bcolors.<NAME> + message + bcolors.ENDC`` so that
    the decoration does not leak into subsequent output.
    """
    # Terminator: appended after a decorated message.
    ENDC = '\033[0m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Message colours, named after the kind of message they mark.
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
# Default configuration file name. Filter.__init__ hands this name to
# Config.read_config() to obtain the configuration entries
# (e.g. 'workdir'):
config_filename = '.cafarc'
class Filter:
    """
    Driver of the target generation tool.

    An instance collects the user arguments and the configuration entries,
    locates the UniProtKB/SwissProt input file and derives the names of the
    two output files (target sequences and id-to-protein-name map).
    process_data() then runs the whole filtering pipeline.
    """

    def __init__(self):
        """
        Prepares everything process_data() needs: parsed arguments,
        configuration, work space, input file location and output file
        names.
        """
        # Collect user arguments into a dictionary:
        self.parsed_dict = ap.parse_args()
        # Collect config file entries:
        self.ConfigParam = Config.read_config(config_filename)
        self.work_dir = self.ConfigParam['workdir']
        # Look for workspace, and if none exists create one:
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)
        # Locate the input file named by the user:
        self.t1_input_file = ld.locate_SwissProtfile(self.parsed_dict['t1'],
                                                     self.work_dir)
        # Create output file name for target sequences:
        self.output_filename = self.create_outfilename()
        # Create output file name for mapping between SwissProt
        # protein name and sequence id in the target sequence
        # file:
        self.output_map_filename = self.output_filename + '.map'

    def create_iterator(self, infile):
        """
        Creates and returns an iterator object for an input UniProt-GOA
        file along with a list of all field names contained in the
        UniProt-GOA file.

        The first record is inspected to pick the field list: 17 fields
        select GOA.GAF20FIELDS, anything else (including a file that
        yields no record at all) selects GOA.GAF10FIELDS.

        NOTE(review): no name GOA is imported in the part of this file
        that was reviewed and no method of this class calls
        create_iterator(); as it stands a call raises NameError until GOA
        is brought into scope - confirm whether this method is still
        needed.
        """
        # Probe only the first record, then release the probing handle:
        with open(infile, 'r') as probe_handle:
            first_record = next(iter(GOA.gafiterator(probe_handle)), None)
        if first_record is not None and len(first_record) == 17:
            GAFFIELDS = GOA.GAF20FIELDS
        else:
            GAFFIELDS = GOA.GAF10FIELDS
        # Hand back a fresh iterator positioned at the start of the file.
        # Its handle has to stay open for as long as the caller iterates.
        infile_handle = open(infile, 'r')
        iter_handle = GOA.gafiterator(infile_handle)
        return iter_handle, GAFFIELDS

    def create_outfilename(self):
        """
        Creates an output filename based on the output file prefix
        provided by the user and at the end returns the newly
        created output filename.

        When no prefix was given ('outfile' is the empty string) the
        prefix is '<input file base name>.<taxon id>.tfa'. The returned
        name lives in the work space and ends in the first numeric suffix
        ('.1', '.2', ...) for which no file exists yet, so earlier results
        are never overwritten.
        """
        if self.parsed_dict['outfile'] != '':
            ob = basename(self.parsed_dict['outfile'])
        else:
            # output file name is constructed by appending '.taxon id.tfa'
            # as extension
            ob = basename(self.parsed_dict['t1']) + '.%s.tfa' \
                % basename(self.parsed_dict['g'])
        index = 1
        output_filename = os.path.join(self.work_dir, ob + '.' + str(index))
        while os.path.exists(output_filename):
            index = index + 1
            output_filename = os.path.join(self.work_dir,
                                           ob + '.' + str(index))
        return output_filename

    def print_prolog(self):
        """
        Prints the welcome banner followed by every user supplied
        argument and its value.
        """
        print("*************************************************")
        print("Running Target Generation Tool !!!!!")
        print('Following is a list of user supplied inputs:')
        for arg in self.parsed_dict:
            print(arg + ': ' + str(self.parsed_dict[arg]))
        print('*********************************************\n')
        return None

    def print_epilog(self, target_count):
        """
        Prints the summary of the run: the names of the output files that
        exist, the number of target sequences (target_count) and a closing
        message. A warning is printed instead of the file list when
        neither output file exists.
        """
        if (os.path.exists(self.output_filename) or
                os.path.exists(self.output_map_filename)):
            print(bcolors.OKGREEN +
                  'The following output files are created: ' +
                  bcolors.ENDC)
            if os.path.exists(self.output_filename):
                print(' Target sequence file: ' +
                      basename(self.output_filename))
            if os.path.exists(self.output_map_filename):
                print(' Target id to protein name map file: ' +
                      basename(self.output_map_filename))
            print(' Note: ' + str(target_count) +
                  ' target sequences generated')
        else:
            print(bcolors.WARNING + 'No output file is created with the ' +
                  'given input parameters' + bcolors.ENDC)
        print(bcolors.OKGREEN +
              'Thank you for using Target Generation Tool' +
              bcolors.ENDC)
        return None

    def check_sprot_format(self, sprot_fname):
        """
        This method exits the Filter program on any of the
        following conditions:
        Case 1: if the file is empty
        Case 2: if the file is NOT in UniProtKB/SwissProt format.
        To check this it invokes check_sprot_format method
        of FormatChecker module.

        In both cases a warning is printed and the program exits with
        status 1 (SystemExit).
        """
        if os.stat(sprot_fname).st_size == 0:
            print(bcolors.WARNING + 'You submitted an empty file: ' +
                  sprot_fname + bcolors.ENDC)
            sys.exit(1)
        # Close the handle as soon as the format check is done:
        with open(sprot_fname, 'r') as sprot_handle:
            format_ok = fc.check_sprot_format(sprot_handle)
        if not format_ok:
            print(bcolors.WARNING + 'File format error: ' +
                  basename(sprot_fname) + bcolors.ENDC)
            print(bcolors.WARNING + 'File must be in UniProtKB/SwissProt ' +
                  'format' + bcolors.ENDC)
            sys.exit(1)

    def process_data(self):
        """
        This method invokes other methods to perform all tasks related
        to target generation.
        """
        # Print the welcome message:
        self.print_prolog()
        # Check UniProtKB/SwissProt file format:
        self.check_sprot_format(self.t1_input_file)
        # Filter out the target sequences from the UniProtKB/SwissProt file:
        print('Filtering sequences from ' +
              basename(self.t1_input_file) + ' ...')
        # The with-statement closes (and thereby flushes) the output files
        # before the summary is printed:
        with open(self.t1_input_file, 'r') as sprot_handle, \
                open(self.output_filename, 'w') as target_handle, \
                open(self.output_map_filename, 'w') as map_handle:
            target_count = ft.species_filter(sprot_handle,
                                             self.parsed_dict['g'],
                                             target_handle,
                                             map_handle,
                                             self.ConfigParam['exp_eec'])
        # Print the summary of running this program:
        self.print_epilog(target_count)
        return None
if __name__ == '__main__':
    # The program name alone on the command line means there is nothing
    # to filter: show the usage text. Anything else runs the tool.
    if len(sys.argv) != 1:
        target_filter = Filter()       # Arguments are parsed in here
        target_filter.process_data()   # Filter out the target sequences
    else:
        print(sys.argv[0] + ':')
        print(__doc__)
    sys.exit(0)