CAFA-Toolset/testBenchmark at master · arkatebi/CAFA-Toolset · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python

'''
    The testBenchmark program runs the Benchmark and Verify twin programs
    repeatedly on a set of UniProt-GOA annotation file pairs.

    How to run this program:

       python testBenchmark -I=dataset-testbm.txt -V=1 -O=output-testbm.txt

    dataset-testbm.txt lists the pairs of UniProt-GOA annotation file names
        at time points t1 and t2.
    -V=1 tells testBenchmark to execute the Verify program on benchmark sets of
        version 1.
    output-testbm.txt will have all the messages coming from running
        Benchmark and Verify programs.
'''

import os
import sys
import subprocess
from os.path import basename
import urllib2

import ArgParser_testBenchmark as ap
import Config

config_filename = '.cafarc' # Default configuration file name

class testBenchmark:
    '''
    Runs the Benchmark and Verify programs on pairs of UniProt-GOA files.

    The test dataset file lists one file name per line; consecutive
    non-blank lines form a (t1, t2) pair. Files missing from the work
    directory are fetched from the UniProt-GOA archive before the twin
    programs are executed, and every command and its standard output is
    collected in a single output file.
    '''

    def __init__(self):
        '''
        Read the command-line arguments and the configuration file, derive
        the input/output paths inside the work directory, and create the
        work directory when it is missing.
        '''
        self.parsed_dict = ap.parse_args()
        self.ConfigParam = Config.read_config(config_filename)
        self.work_dir = (self.ConfigParam['workdir'].rstrip('/'))

        # Only the base names of the user-supplied paths are used: both
        # files always live directly inside the work directory.
        self.testData_filename = self.work_dir + '/' + basename(self.parsed_dict['input1'])
        self.output_filename = self.work_dir + '/' + basename(self.parsed_dict['output1'])
        # The benchmark version number that will be verified
        self.bmVersion = self.parsed_dict['version']

        self.goa_arc = 'ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/old'

        # Create the work directory, if it does not exist
        if not os.path.exists(self.work_dir):
            os.makedirs(self.work_dir)

    def download_testDataset(self, testDataset_fh):
        '''
        Download every listed file that is not yet in the work directory.

        testDataset_fh: iterable of lines, one file name per line. Blank
            lines are ignored. A name without a '.' separated second field
            cannot be mapped to an archive folder and is reported and
            skipped instead of raising IndexError.

        Returns None. Failed downloads are reported on standard output.
        '''
        for line in testDataset_fh:
            data_name = line.strip()
            if not data_name:
                continue
            if os.path.isfile(self.work_dir + '/' + data_name):
                continue
            fields = data_name.split('.')
            if len(fields) < 2:
                print('Cannot derive the archive folder for ' + data_name)
                continue
            # Organism specific folder name at UniProt-GOA archive:
            folder_name = fields[1].split('_')[-1]
            # Organism specific archived file name at UniProt-GOA archive:
            fname = data_name + '.gz'
            # Organism specific URL at UniProt-GOA archive:
            url = self.goa_arc + '/' + folder_name.upper()
            # Download the file from the UniProt-GOA archive:
            print('Downloading ' + fname + ' ... ')
            if (not self.download(url, fname)):
                print('Downloading failed for ' + fname)
        return None

    def download(self, url, fname):
        '''
        Fetch url + '/' + fname into the work directory and decompress it
        with 'gzip -d'.

        Returns True when the file was fetched and gzip exited with
        status 0; False when the URL could not be opened or the
        decompression failed.
        '''
        try:
            response = urllib2.urlopen(url + '/' + fname)
        except urllib2.URLError:
            # HTTPError is a subclass of URLError, so this covers both.
            return False
        target = self.work_dir + '/' + fname
        try:
            # The payload is gzip data: it must be written in binary mode.
            with open(target, 'wb') as out_fh:
                out_fh.write(response.read())
        finally:
            response.close()
        status = os.system('gzip -d ' + target)
        return status == 0

    def _log_and_run(self, cmd):
        '''
        Append the command line to the output file, then run it through
        the shell (the command itself redirects its output to that file).
        '''
        with open(self.output_filename, 'a') as fh:
            fh.write('\n' + cmd + '\n')
        subprocess.call(cmd, shell=True)

    def exec_twinToolset(self, input1, input2):
        '''
        Run Benchmark on the (input1, input2) pair and then Verify on the
        benchmark file name derived from that pair and the configured
        benchmark version. Standard output of both programs is appended
        to the output file. Returns None.
        '''
        # Create benchmark files:
        cmd_benchmark = 'python' + ' ' + 'Benchmark' + ' ' + '-I1=' + \
                        input1 + ' ' + '-I2=' + input2 + ' >> ' + \
                        self.output_filename
        self._log_and_run(cmd_benchmark)

        # Verify the benchmark files that are just created:
        input3 = basename(input2) + '-' + (basename(input1).split('.'))[-1] + \
                 '.benchmark_LK_bpo.' + str(self.bmVersion)
        cmd_verify = 'python' + ' ' + 'Verify' + ' ' + '-I1=' + input1 + ' ' + \
                     '-I2=' + input2 + ' ' + '-I3=' + input3 + ' >> ' + \
                     self.output_filename
        self._log_and_run(cmd_verify)
        return None

    def run_test(self, testDataset_fh):
        '''
        Execute the twin programs on every consecutive pair of file names.

        testDataset_fh: iterable of lines, one file name per line. Blank
            lines are ignored. When the number of names is odd, the last
            one has no partner: it is reported and skipped rather than
            letting StopIteration escape.

        Returns None.
        '''
        names = [line.strip() for line in testDataset_fh]
        names = [name for name in names if name]
        if len(names) % 2:
            print('Ignoring unpaired entry: ' + names[-1])
        for idx in range(0, len(names) - 1, 2):
            input1 = self.work_dir + '/' + names[idx]
            input2 = self.work_dir + '/' + names[idx + 1]
            self.exec_twinToolset(input1, input2)
        return None

    def process_test(self):
        '''
        Run the whole test: download missing data files, start a fresh
        output file with a header line, execute the twin programs on all
        pairs, and append a closing line. Returns None.
        '''
        with open(self.testData_filename, 'r') as dataset_fh:
            self.download_testDataset(dataset_fh)
        # 'w' truncates any output left over from a previous run.
        with open(self.output_filename, 'w') as out_fh:
            out_fh.write('This is the output from running ' +
                         'testBenchmark program' + '\n')
        with open(self.testData_filename, 'r') as dataset_fh:
            self.run_test(dataset_fh)
        with open(self.output_filename, 'a') as out_fh:
            out_fh.write('End of running testBenchmark ' +
                         'program' + '\n')
        return None

if __name__ == '__main__':
    # With at least one command-line argument the test is run; a bare
    # invocation only prints the program name and the usage text.
    if len(sys.argv) != 1:
        testBenchmark().process_test()
    else:
        print(sys.argv[0] + ':')
        print(__doc__)
    # Both paths terminate with a success status.
    sys.exit(0)