CAFA-Toolset/Filter at master · arkatebi/CAFA-Toolset · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#!/usr/bin/env python
'''
    Filter program accepts the following three inputs:
           (1) a UniProtKB/swissProt file
           (2) a taxon id and
           (3) an optional output file name

    When the output file name is NOT given, it will construct an output file
    name by combining a portion of the UniProtKB/SwissProt file name and the
    taxon id supplied by the user. It extracts the protein sequences for the
    species identified by the taxon id and writes them to the output file.

    How to run this program:
        For some input file uniprot_sprot.dat.2014_09 and taxon id 559292

       > python Filter -I1=uniprot_sprot.dat.2014_09 -G=559292

    Two output files will be created:
        A target sequence file: uniprot_sprot.dat.2014_09.559292.tfa.1
            This will contain the target sequences specific to the organism
            559292.
        A target id and protein name mapping file:
            uniprot_sprot.dat.2014_09.559292.tfa.1.map
            This will contain the mapping of target sequence ids and protein
            names.
'''
import os
import sys
from os.path import basename

import ConfigParser as cp
import shutil
import subprocess

import ArgParser_Filter as ap
import Config
import Filter_sp_targets as ft
import FormatChecker as fc
import LocateDataset as ld

class bcolors:
    # Terminal escape sequences that the tool wraps around its status
    # messages. Each message is expected to end with ENDC so the styling
    # does not leak into subsequent output.

    # Text attributes:
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Message categories:
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'

    # Reset sequence appended after a styled message:
    ENDC = '\033[0m'

# Default configuration file name. Filter.__init__ hands this name to
# Config.read_config() to obtain the tool's settings (for example the
# 'workdir' entry):
config_filename = '.cafarc'

class Filter:
    """
    Driver of the target generation tool.

    An instance collects the user arguments and the configuration entries,
    locates the UniProtKB/SwissProt input file, derives the names of the
    two output files and, through process_data(), delegates the actual
    filtering to Filter_sp_targets.species_filter().
    """

    def __init__(self):
        """
        Gathers the command-line arguments and configuration entries,
        makes sure the work space exists and settles all file names used
        by the later processing steps.
        """
        # Collect user arguments into a dictionary:
        self.parsed_dict = ap.parse_args()

        # Collect config file entries:
        self.ConfigParam = Config.read_config(config_filename)
        self.work_dir = self.ConfigParam['workdir']

        # Look for workspace, and if none exists create one:
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)

        # Locate the input file named by the user:
        self.t1_input_file = ld.locate_SwissProtfile(self.parsed_dict['t1'],
                                                     self.work_dir)

        # Output file name for the target sequences:
        self.output_filename = self.create_outfilename()

        # Output file name for the mapping between SwissProt protein
        # name and sequence id in the target sequence file:
        self.output_map_filename = self.output_filename + '.map'

    def create_iterator(self, infile):
        """
        Creates and returns an iterator object for an input UniProt-GOA
        file along with a list of all field names contained in the
        UniProt-GOA file.

        The first record is inspected to choose the field list: a record
        with 17 fields selects GOA.GAF20FIELDS, anything else (including
        a file that yields no record at all) selects GOA.GAF10FIELDS.

        The handle behind the returned iterator is intentionally left
        open, because the iterator reads from it; the caller is
        responsible for exhausting or discarding it.
        """
        # NOTE(review): GOA is not among the imports visible in this file;
        # presumably it is Biopython's Bio.UniProt.GOA -- confirm and
        # import it before calling this method.

        # Peek at the first record with a throw-away handle so that the
        # iterator handed back to the caller starts from the beginning.
        with open(infile, 'r') as probe_handle:
            first_record = next(iter(GOA.gafiterator(probe_handle)), None)

        if first_record is not None and len(first_record) == 17:
            GAFFIELDS = GOA.GAF20FIELDS
        else:
            GAFFIELDS = GOA.GAF10FIELDS

        infile_handle = open(infile, 'r')
        iter_handle = GOA.gafiterator(infile_handle)
        return iter_handle, GAFFIELDS

    def create_outfilename(self):
        """
        Creates an output filename based on the output file prefix
        provided by the user and at the end returns the newly
        created output filename.

        When no prefix is given, the prefix is built from the base name
        of the input file followed by '.<taxon id>.tfa'. A numeric
        suffix ('.1', '.2', ...) is appended, using the first index for
        which no file of that name exists in the work space yet, so an
        earlier result is never overwritten.
        """
        if self.parsed_dict['outfile']:
            ob = basename(self.parsed_dict['outfile'])
        else:
            # Output file name is constructed by appending
            # '.taxon id.tfa' as extension:
            ob = '%s.%s.tfa' % (basename(self.parsed_dict['t1']),
                                basename(self.parsed_dict['g']))

        index = 1
        output_filename = os.path.join(self.work_dir, '%s.%d' % (ob, index))
        while os.path.exists(output_filename):
            index += 1
            output_filename = os.path.join(self.work_dir,
                                           '%s.%d' % (ob, index))
        return output_filename

    def print_prolog(self):
        """
        Prints the start-up banner followed by every user supplied
        argument and its value.
        """
        print ("*************************************************")
        print ("Running Target Generatin Tool !!!!!")
        print ('Following is a list of user supplied inputs:')
        for arg in self.parsed_dict:
            print (arg + ': ' + str(self.parsed_dict[arg]))
        print ('*********************************************\n')
        return None

    def print_epilog(self, target_count):
        """
        Prints the closing summary: which of the two output files exist
        on disk and the value of target_count, or a warning when neither
        output file exists.
        """
        seq_file_exists = os.path.exists(self.output_filename)
        map_file_exists = os.path.exists(self.output_map_filename)

        if seq_file_exists or map_file_exists:
            print(bcolors.OKGREEN + 'The following output files are created: ' +
                  bcolors.ENDC)
            if seq_file_exists:
                print('    Target sequence file: ' +
                      basename(self.output_filename))
            if map_file_exists:
                print('    Target id to protein name map file: ' +
                      basename(self.output_map_filename))
            print('    Note: ' + str(target_count) +
                  ' target sequences generated')
        else:
            print(bcolors.WARNING + 'No output file is created with the ' +
                  'given input parameters' + bcolors.ENDC)
        print(bcolors.OKGREEN + 'Thank you for using Target Generation Tool' +
              bcolors.ENDC)
        return None

    def check_sprot_format(self, sprot_fname):
        """
        This method exits the Filter program on any of the
        following conditions:
            Case 1: if the file is empty
            Case 2: if the file is NOT in UniProtKB/SwissProt format.
                    To check this it invokes check_sprot_format method
                    of FormatChecker module.
        In both cases a warning is printed and the exit status is 1.
        """
        if os.stat(sprot_fname).st_size == 0:
            print(bcolors.WARNING + 'You submitted an empty file: ' +
                  sprot_fname + bcolors.ENDC)
            sys.exit(1)

        # Close the handle as soon as the format check is done instead of
        # leaving it to the garbage collector.
        with open(sprot_fname, 'r') as sprot_handle:
            format_ok = fc.check_sprot_format(sprot_handle)

        if not format_ok:
            print(bcolors.WARNING + 'File format error: ' +
                  basename(sprot_fname) + bcolors.ENDC)
            print(bcolors.WARNING + 'File must be in UniProtKB/SwissProt ' +
                  'format' + bcolors.ENDC)
            sys.exit(1)

    def process_data(self):
        """
        This method invokes other methods to perform all tasks related
        to target generation: it prints the prolog, validates the input
        file, runs the species filter and prints the epilog.
        """
        # Print the welcome message:
        self.print_prolog()

        # Check UniProtKB/SwissProt file format:
        self.check_sprot_format(self.t1_input_file)

        # Filter out the target sequences from the UniProtKB/SwissProt file:
        print('Filtering sequences from ' +
              basename(self.t1_input_file) + ' ...')

        # All three handles are closed (and the outputs flushed) before
        # the epilog inspects the output files.
        with open(self.t1_input_file, 'r') as sprot_handle, \
                open(self.output_filename, 'w') as seq_handle, \
                open(self.output_map_filename, 'w') as map_handle:
            target_count = ft.species_filter(sprot_handle,
                                             self.parsed_dict['g'],
                                             seq_handle,
                                             map_handle,
                                             self.ConfigParam['exp_eec'])

        # Print the summary of running this program:
        self.print_epilog(target_count)
        return None

if __name__ == '__main__':
    # Arguments were supplied: run the tool. Otherwise only the program
    # name is on the command line, so show the usage text (the module
    # docstring) instead.
    if len(sys.argv) != 1:
        target_tool = Filter()       # Create an instance of Filter class
        target_tool.process_data()   # Filter out the target sequences
    else:
        print (sys.argv[0] + ':')
        print(__doc__)
    sys.exit(0)