CAFA-Toolset/Verify at master · arkatebi/CAFA-Toolset · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
#!/usr/bin/env python
'''
    The Benchmark Verification Tool verifies the benchmark sets created by
    Benchmark Creation Tool.

    The program takes three inputs: (1) a tab delimited annotation
    file at time t1, (2) a tab delimited annotation file at time t2,
    and (3) one of the benchmark file names created by the Benchmark
    Creation Tool using the the first and the second input files given
    here. Simplest way to run this program is:

      python Verify -I1=gene_association.goa_ref_yeast.23
                    -I2=gene_association.goa_ref_yeast.52
                    -I3=gene_association.goa_ref_yeast.52-23.benchmark_LK_bpo.1

    This tool will verify the correctness of all the benchmark files of the
    specific version as the third input that are available in the workspace.

    This tool creates some intermediate files and removes them after the
    verification is completed.

    Complete usage directions of this program can be obtained through the
    following command:

    python Verify --help
'''
import os
import sys
import shutil
import subprocess
from os.path import basename
from Bio.UniProt import GOA

import ArgParser_Benchmark as ap
import Config
import CreateBenchmark as cb
import LocateDataset as ld
import FormatChecker as fc
import GOAParser_cafa as gc
import PaperTermFrequency as ptf
import verifyBenchmark as vb

class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

# Default configuration file name:
config_filename = '.cafarc'
# Default benchmark file name suffices:
bmSuffix_LK_bpo = '.benchmark_LK_bpo.'
bmSuffix_LK_cco = '.benchmark_LK_cco.'
bmSuffix_LK_mfo = '.benchmark_LK_mfo.'
bmSuffix_NK_bpo = '.benchmark_NK_bpo.'
bmSuffix_NK_cco = '.benchmark_NK_cco.'
bmSuffix_NK_mfo = '.benchmark_NK_mfo.'

class Verify:
    def __init__(self):
        # Obtain user supplied argument values in a dictionary:
        self.parsed_dict = ap.parse_args('verify')
        # Collect config file entries:
        self.ConfigParam = Config.read_config(config_filename)

        t1 = self.parsed_dict['t1'] # Retreive file name at time t1
        t2 = self.parsed_dict['t2'] # Retreive file name at time t2
        t3 = self.parsed_dict['t3'] # Retreive a benchmark file name

        # Retreive work directory name:
        self.work_dir = (self.ConfigParam['workdir']).rstrip('/')
        if not os.path.exists(self.work_dir):
            print ('Work space not found. Check ' + config_filename + \
                ' file for correct assignment of work directory.')
            sys.exit(1)
        self.t1_input_file = ld.locate_GOAfile(t1, self.work_dir)

        self.t2_input_file = ld.locate_GOAfile(t2, self.work_dir)

        # Names for SIX benchmark files: bpo, cco, and mfo
        # for LK and NK benchmark types:
        self.get_benchmark_filenames()

        # Locate the benchmark files:
        if (self.locate_benchmark_files()):
            print ('No benchmark file in the specified version is found.')
            print('Verify Program quitting ...')
            print ('****************************************************')
            sys.exit(1)

        # Names for THREE files to store non-EXP and EXP type entries:
        # (These files will be deleted once the calculation is done)

        # File name for entries in t1 file with non-EXP evidence codes:
        self.t1_iea_name = self.t1_input_file + '.iea'

        # File name for entries in t1 file with EXP evidence codes:
        self.t1_exp_name = self.t1_input_file + '.exp'

        # File name for entries in t2 file with EXP evidence codes:
        self.t2_exp_name = self.t2_input_file + '.exp'

        # Name for GO ID frequency per pubmed id for t2 file:
        # (This file will be deleted once the calculations are completed)
        self.t2_ptf_file =  self.t2_input_file + \
                            '_with_annotations_per_paper.txt'

    def get_benchmark_filenames(self):
        """
        This method initializes the benchmark file names based on the user
        input file name.
        """
        t3_basename = basename(self.parsed_dict['t3']).strip()
        bmVersion = t3_basename.split('.')[-1]
        fnPrefix = t3_basename[0:len(t3_basename)-(len(bmVersion) + \
                                                  len(bmSuffix_LK_bpo))]

        self.benchmark_LK_bpo = fnPrefix + bmSuffix_LK_bpo + bmVersion
        self.benchmark_LK_cco = fnPrefix + bmSuffix_LK_cco + bmVersion
        self.benchmark_LK_mfo = fnPrefix + bmSuffix_LK_mfo + bmVersion
        self.benchmark_NK_bpo = fnPrefix + bmSuffix_NK_bpo + bmVersion
        self.benchmark_NK_cco = fnPrefix + bmSuffix_NK_cco + bmVersion
        self.benchmark_NK_mfo = fnPrefix + bmSuffix_NK_mfo + bmVersion
        return None

    def create_iterator(self, infile):
        """
        This method creates an iterator object for the input UniProt-GOA file
        and returns it along with a list of all field names contained in the
        UniProt-GOA file. The UniProt-GOA file can either be in GAF 1.0 or
        GAF 2.0 file format.
        """
        infile_handle = open(infile, 'r')
        iter_handle = GOA.gafiterator(infile_handle)
        for ingen in iter_handle:
            if len(ingen) == 17:
                GAFFIELDS = GOA.GAF20FIELDS
                break
            else:
                GAFFIELDS = GOA.GAF10FIELDS
                break
        infile_handle = open(infile, 'r')
        iter_handle = GOA.gafiterator(infile_handle)
        return iter_handle, GAFFIELDS

    def locate_benchmark_files(self):
        """
        This method tries to find all the benchmark files. If no benchmark
        file is found, it returns True, otherwise it returns False.
        """
        noneFound = True
        if(ld.locate_benchmark_file(self.benchmark_LK_bpo, self.work_dir)):
            noneFound = False

        if(ld.locate_benchmark_file(self.benchmark_LK_cco, self.work_dir)):
            noneFound = False

        if(ld.locate_benchmark_file(self.benchmark_LK_mfo, self.work_dir)):
            noneFound = False

        if(ld.locate_benchmark_file(self.benchmark_NK_bpo, self.work_dir)):
            noneFound = False

        if(ld.locate_benchmark_file(self.benchmark_NK_cco, self.work_dir)):
            noneFound = False

        if(ld.locate_benchmark_file(self.benchmark_NK_mfo, self.work_dir)):
            noneFound = False

        return noneFound

    def create_intermediate_files(self):
        """
        This method creates all the necessary intermediate files
        that are needed to verify the benchmark sets.
        """

        # Create paper-term freq file for t2 file:
        ann_conf = ptf.paper_term_freq( open(self.t2_input_file,'r'),
                                        open(self.t2_ptf_file,'w'),
                                        self.parsed_dict)

        # Create an iterator object for filtering t2 file:
        iter_handle, GAFFIELDS = self.create_iterator(self.t2_input_file)

        # Create tax_id_name_mapping for filtering t2 file:
        tax_id_name_mapping = gc.parse_tax_file(self.ConfigParam['tax_file'])

        # Create t2_exp_name file:
        # Filter t2 file for all proteins with EXP evidence:
        print 'Parsing t2 file: ' + basename(self.t2_input_file) + ' ...'
        t2_exp_handle = open(self.t2_exp_name, 'w')
        for ingen in iter_handle: # Iterate through t2 file entries
            retval = gc.record_has_forBenchmark(ingen,
                                                ann_conf,
                                                self.parsed_dict,
                                                tax_id_name_mapping,
                                                self.ConfigParam['exp_eec'],
                                                GAFFIELDS)
            # If retval is TRUE, write out the record to t2.exp file:
            if retval:
                GOA.writerec(ingen, t2_exp_handle, GAFFIELDS)
        t2_exp_handle.close()

        # If t2.exp is empty, program quits:
        if os.stat(self.t2_exp_name).st_size == 0:
            print('Your benchmark set will be empty with the ' + \
                  'parameters provided. Quiting ...')
            sys.exit(1)

        # Create t1.iea_name and t1.exp_name files:
        # Filter t1 file to create t1.iea and t1.exp files
        iter_handle, GAFFIELDS = self.create_iterator(self.t1_input_file)
        print 'Parsing t1 file: ' + basename(self.t1_input_file) + ' ...'
        gc.t1_filter(iter_handle, self.t1_iea_name, self.t1_exp_name,
                    self.t2_exp_name, GAFFIELDS, self.ConfigParam['exp_eec'])
        return None

    def delete_intermediate_files(self):
        """
        This method deletes all the intermediate files created previously.
        It also deletes any empty files found in the workspace.
        """
        print 'Cleaning working directory ...'
        # Delete t1.iea, t1.exp, and t2.exp files:
        os.remove(self.t1_iea_name)
        os.remove(self.t1_exp_name)
        os.remove(self.t2_exp_name)
        # Delete paper term frequency file for t2 file:
        os.remove(self.t2_ptf_file)
        # Delete any empty files from the workspace (subdirectories included):
        for root, dirs, files in os.walk(self.work_dir):
            for fname in files:
                if os.path.getsize(root + '/' + fname) == 0:
                    os.remove(root + '/' + fname)
            break
        return None

    def check_gaf_format(self, goa_fname):
        """
        This method exits the Benchmark program on any of the
        following conditions:
            Case 1: if the file is empty
            Case 2: if the file is NOT in GAF format. To check this
                    it invokes check_gaf_format method of
                    FormatChecker module.
        """
        if os.stat(goa_fname).st_size == 0:
            print bcolors.WARNING + 'You submitted an empty file: ' + goa_fname + \
                  bcolors.ENDC
            sys.exit(1)
        elif not fc.check_gaf_format(open(goa_fname, 'r')):
            print bcolors.WARNING + 'File format error: ' + \
                  basename(goa_fname) + bcolors.ENDC
            print bcolors.WARNING + 'File must be in GAF 1.0 or GAF 2.0 ' + \
                'format' + bcolors.ENDC
            sys.exit(1)

    def print_prolog(self):
        print '*************************************************'
        print 'Running Benchmark Verification Tool !!!!!'
        print 'Following is a list of user supplied inputs:'
        for arg in self.parsed_dict:
            print arg + ': ' + str(self.parsed_dict[arg])
        print '*********************************************\n'
        return None

    def print_epilog(self):
        print bcolors.OKGREEN + \
            'Thank you for using Benchmark Verification Tool' + \
            bcolors.ENDC
        return None

    def verify_LK_benchmark(self, benchmark_filename, ontType):
        if (not os.path.exists(self.work_dir + '/' + benchmark_filename)):
           print (benchmark_filename + ':\n' + \
                  '\t\tfile does not exist.')
           return None
        if (os.stat(self.work_dir + '/' + \
            benchmark_filename).st_size != 0):
            # Checking file format
            fmt_flg = fc.check_benchmark_format(open(self.work_dir + '/' + \
                                                benchmark_filename, 'r'))
            # Checking whether benchmark creation was successful
            if (fmt_flg):
                err_msg = vb.verify_LK_benchmark(open(self.t1_iea_name, 'r'),
                                                 open(self.t1_exp_name, 'r'),
                                                 open(self.t2_exp_name, 'r'),
                                                 open(self.work_dir + '/' + \
                                                      benchmark_filename, 'r'),
                                                 ontType)
                if (not err_msg):
                    print(benchmark_filename + ':\n' + \
                          '\t\tno error in benchmark creation.')
                else:
                    print(benchmark_filename + ':\n' + err_msg)
                return None
            else:
                print(benchmark_filename + ':\n' + \
                      '\t\tfile is NOT in CORRECT format')
                return None
        else:
            print(benchmark_filename + ':\n' + \
                  '\t\tfile size is zero')
            return None

    def verify_NK_benchmark(self, benchmark_filename, ontType):
        if (not os.path.exists(self.work_dir + '/' + benchmark_filename)):
           print (benchmark_filename + ':\n' + \
                  '\t\tfile does not exist.')
           return None
        if (os.stat(self.work_dir + '/' + \
            benchmark_filename).st_size != 0):
            # Checking file format:
            fmt_flg = fc.check_benchmark_format(open(self.work_dir + '/' + \
                                                benchmark_filename, 'r'))
            # Checking whether benchmark creation was successful:
            if (fmt_flg):
                err_msg = vb.verify_NK_benchmark(open(self.t1_iea_name, 'r'),
                                                 open(self.t1_exp_name, 'r'),
                                                 open(self.t2_exp_name, 'r'),
                                                 open(self.work_dir + '/' + \
                                                      benchmark_filename, 'r'),
                                                 ontType)
                if (not err_msg):
                    print(benchmark_filename + ':\n' + \
                          '\t\tno error in benchmark creation.')
                else:
                    print(benchmark_filename + ':\n' + err_msg)
                return None
            else:
                print(benchmark_filename + ':\n' + \
                      '\t\tfile is NOT in CORRECT format')
                return None
        else:
            print(benchmark_filename + ':\n' + \
                  '\t\tfile size is zero')
            return None

    def process_data(self):
        """
        This method is the entry point of Benchmark Verification Tool.
        It invokes other methods for
            (1) file format checking
            (2) creating intermediate files
            (3) verifying benchmark sets
            (4) deleting intermediate files
        """

        # Print the welcome message and user argument list:
        self.print_prolog()

        # File format check for t1 file:
        self.check_gaf_format(self.t1_input_file)
        # File format check for t2 file:
        self.check_gaf_format(self.t2_input_file)

        # Create necessary intermediate files:
        self.create_intermediate_files()

        # Verifying benchmark sets:
        print 'Verifying benchmark sets ...'
        # Verify LK-benchmark sets:
        self.verify_LK_benchmark(self.benchmark_LK_bpo, 'BPO')
        self.verify_LK_benchmark(self.benchmark_LK_cco, 'CCO')
        self.verify_LK_benchmark(self.benchmark_LK_mfo, 'MFO')

        # Verify NK-benchmark sets:
        self.verify_NK_benchmark(self.benchmark_NK_bpo, 'BPO')
        self.verify_NK_benchmark(self.benchmark_NK_cco, 'CCO')
        self.verify_NK_benchmark(self.benchmark_NK_mfo, 'MFO')

        # Delete intermediate files:
        self.delete_intermediate_files()

        # Print summary of running this program:
        self.print_epilog()
        return None

if __name__ == '__main__':
    if len(sys.argv) == 1:
        print(sys.argv[0] + ':')
        print(__doc__)
    else:
        # Create an instance of Verify class
        vm = Verify()
        # Process data and verify the benchmark sets
        vm.process_data()
    sys.exit(0)