TADs-analysis/predictions_Logo_library.py at master · aerijman/TADs-analysis · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
import os,sys,re
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
#sys.path.append(os.path.abspath('../../new/'))
from predictions_library import *

myHOME = os.path.abspath('..')

start, end = "\033[1m", "\033[0;0m" # this is to print bold

def initialize_notebook_and_load_datasets():
    '''
        function initialize notebook and load datasets.
        input: nothing
        output: data
    '''
    # load datasets
    positives = open_fastas(myHOME + '/data/ARG3_O1_O2_clustered_rescored.fasta_bin14_threshold10.0_37923positives.fasta')
    negatives = open_fastas(myHOME + '/data/ARG3_O1_O2_clustered_rescored.fasta_bg_37923negatives.fasta')
    positives['TAD']=1
    negatives['TAD']=0
    data = pd.concat([pd.DataFrame(i) for i in [positives, negatives]], axis=0)

    # get aa frequencies
    for i in aa:
        data[i] = np.array([seq.count(i)/30. for seq in data.index])

    # function to extract distribution of sequence in bins
    def fasta_header_extract(fileName):
        headers, sequences = [], []
        for i in open(fileName):
            if i[0]=='>':
                tmp = i.strip().split('_')
                headers.append(tmp)
                continue
            else:
                sequences.append(i.strip())
        values = np.array(headers)[:,1:6].astype(int)
        return pd.DataFrame(values, index=sequences, columns = ['bg','bin1','bin2','bin3','bin4'])

    # from data, take out sequences with no reads in bins2,3 and 4 (bin distribution is extracted from the fasta headers)
    positives_bins = fasta_header_extract(myHOME + '/data/ARG3_O1_O2_clustered_rescored.fasta_bin14_threshold10.0_37923positives.fasta')
    negatives = open_fastas(myHOME + '/data/ARG3_O1_O2_clustered_rescored.fasta_bg_37923negatives.fasta')

    positives_out = positives_bins[(positives_bins['bin2']==0) & (positives_bins['bin3']==0) & (positives_bins['bin4']==0)].index
    data = data.loc[ set(data.index) - set(positives_out)]

    return data


def generate_split_index(data):
    '''
        function generate indices for train, test, validation split
        input: data
        output: _train, _test, _valid are indices of data, splitted 80,10,10
    '''
    idx = np.arange(data.shape[0])
    np.random.seed=0
    np.random.shuffle(idx)

    _10 = len(idx)
    _8, _9 = int(0.8*_10), int(0.9*_10)

    _train = data.index[idx[:_8]]
    _test = data.index[idx[_8:_9]]
    _valid = data.index[idx[_9:]]

    return _train, _test, _valid


def calculate_charge(seq):
    '''
        calculates the charge/residue of the sequence
    '''
    charge = seq.count('K') + seq.count('R') + seq.count('H') * 0.5 \
             - seq.count('D') - seq.count('E')

    return (charge/len(seq))


def open_fasta(fasta):
    '''
        function open fasta file
        input: fasta file
        output: sequence
    '''
    sequence = []
    for i in open(fasta):
        # obviate header
        if i[0]=='>': continue
        # else take line and later join
        sequence.append(i.strip('\n'))

    return ''.join(sequence)

def open_ss(ss):
    '''
        function open .horiz files
        input: .horiz file
        output: sequence of ss elements
    '''
    sequence = []
    for i in open(ss):
        # jump lines that are not relevant
        if i[:4]!='Pred': continue
        sequence.append(i[6:].strip('\n'))

    return ''.join(sequence).replace('C','-')


def ohe_single(sequence, ss, **kwargs): #kwargs are aa, ss_psipred_set):
    '''
        ohe a single protein
        input: protein sequence and secondary structure (psipred format). list of aa and set of ss
               AS USED WITH THE TRAINING DATA!!!
        output: one hot encoded data of the protein as input for the neural network
    '''
    # if aa and ss_psipred lists where not provided, here are defined
    if 'aa_list' not in kwargs.keys():
        print('using local aa_list')
        aa_list = ['R','H','K','D','E','S','T','N','Q','A','V','L','I','M','F' ,'Y', 'W', 'C','G','P'] # (or just aa from predictions_library)
    else:
        aa_list = kwargs['aa_list']

    if 'ss_list' not in kwargs.keys():
        print('using local ss_list')
        ss_list = ['E', 'H', '-']
    else:
        ss_list = kwargs['ss_list']

    # translate ss into ohe
    categorical_ss = np.zeros(shape=(len(ss),3))
    # go over each position
    for n,i in enumerate(ss):
        # fill 1 at the position that matches the index in ss_psipred_set
        position = ss_list.index(i)
        categorical_ss[n, position] = 1

    # translate sequence into ohe
    categorical_seq = np.zeros(shape=(len(sequence),20))
    # go over each positoin
    for n,i in enumerate(sequence):
        # fill 1 at position that matches index in aa
        position = aa.index(i)
        categorical_seq[n, position] = 1

    # return merged matrix
    return np.hstack([categorical_seq, categorical_ss]).reshape(1, len(ss), 23, 1)


def translate_dna2aa(dna):
    '''
        function translate nucleotide sequence into aminoacid sequence
        input: dna sequence as a string
        output: aa sequence as a string
    '''
    # codons list
    gene_code = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'}

    codon_seq = [dna[i:i+3] for i in np.arange(0,len(dna),3)]
    aa_seq = [gene_code[i] for i in codon_seq]

    return ''.join(aa_seq)


def read_wig(wig):
    '''
        function read wiggle file
        input: wiggle file
        output: tuple with position as key and counts as values
    '''
    positions, counts = [],[]
    f = open(wig)
    try:
        while True:
            line = next(f)
            _, position, count = line.strip('\n').split('\t')
            positions.append(position)
            counts.append(count)

    except StopIteration:
        print('done! ' + wig + ' file loaded')

    return pd.DataFrame(np.array(counts, 'float'), index=np.array(positions, 'int'), columns=['counts'])


def make_m_dict(Stark_data_annot, gfp_p, gfp_m, FOLD_CHANGE_THRESHOLD=2, MINIMUM_READS=100):
    '''
        function generate a table of measurements from Stark's (or other) data
        INPUT: Stark_data_annot (including gene name, start and end positions in gfp_p and gfp_m)
               gfp_p: wig file with positions and counts for sorting GFP positive
               gfp_m: wig file with positions and counts for sorting GFP negative
               FOLD_CHANGE_THRESHOLD: minimal fold change gfp_p/gfp_m accepted to generate data
               MINIMUM_READS: minimum absolute counts in gfp_p accepted to generate data
        OUTPUT: table (key: gene name
                       values: counts over the length of each protein)
    '''
    k,v=[],[]
    for TF_name in Stark_data_annot.index:

        # extract gene values from plus and minus sets
        start, end = Stark_data_annot.loc[TF_name, ['start','end']]
        plus = gfp_p.iloc[start:end].counts.values
        minus = gfp_m.iloc[start:end].counts.values

        # averaged values every 3 nt to plot together with protein
        plus = np.array([np.mean(plus[i:i+3]) for i in np.arange(0, len(plus), 3)])
        minus = np.array([np.mean(minus[i:i+3]) for i in np.arange(0, len(minus), 3)])

        # take values of tADs whose plus/minus >fold_change_threshold and plus >minimum_reads counts
        tAD = np.nan_to_num(plus/minus)
        tAD = np.array([k if k>FOLD_CHANGE_THRESHOLD and i>MINIMUM_READS else 0 for i,k in zip(plus, tAD)])
        tAD = np.nan_to_num((tAD - np.min(tAD)) / (np.max(tAD) - np.min(tAD)))

        k.append(TF_name)
        v.append(tAD)

    # finally define m_dict, a dictionary for m
    m_dict = dict(zip(k,v))

    return m_dict

def make_p_dict(Stark_data_annot, deep_model, fastas_folder='../data/Stark_tADs/fastas/', horiz_folder='../data/Stark_tADs/horiz/'):
    '''
    function generate table (dictionary) of key=f, values=predictions from best NN model
    INPUT: Stark_data_annot (I could use f instead, but I keep this to make it equal to make_m_dict)
           fastas_folder directory
           horiz_folder directory
    OUTPUT:table (keys=gene names, values=prediction scores over the length of each protein)
    '''
    k,v = [],[]
    for TF_name in Stark_data_annot.index:

        # if ss file is not here yet, skip
        if not os.path.exists(horiz_folder+ TF_name + '.horiz'): continue

        # open fasta and horiz files and generate ohe
        seq = open_fasta(fastas_folder+ TF_name + '.fasta')
        ss = open_ss(horiz_folder+ TF_name + '.horiz')
        single = ohe_single(seq,ss, aa_list=aa, ss_list=ss_psipred_set)

        # predict using deep_model
        predictions = []
        for i in range(0, len(ss)-30):
            region = deep_model.predict(single[0, i:i+30, :, 0].reshape(1,30,23,1))[0][0]
            predictions.append(region)

        k.append(TF_name)
        v.append(np.array(predictions))

    # finally define p_dict, a dictionary for p
    p_dict = dict(zip(k,v))

    return p_dict


def build_distribution(f, p_dict, n_iter=10000):
    '''
        function build distribution of correlation coefficients of two vectors
        as one of them is permutated in each iteration.
        INPUT: f (index of genes from table)
               p_dict (table. keys=gene names (as in f), values=(counts over the length of the protein))
               m = vector of concatenated counts from m_dict
        OUTPUT: list of correlation coefficients.
    '''
    # build distribution of correlations
    corr_distrib = []
    for n in range(n_iter):

        # permutation of p
        k = f[:]
        np.random.shuffle(k)
        p_permut = np.concatenate([p_dict[i] for i in k])

        # compute correlation
        corr_distrib.append(np.corrcoef(m, p_permut)[0][1])

    return corr_distrib


def calc_typeI_error(corr_values, point, **kwargs):
    '''
        function makes distribution out of correlation values (corr_values)
        and calculate the area under "normal" curve as extreme or more extreme than point,
        that is the probability that a number falls on the right of the point in that curve(
        if point is positive, else to the left of the point).
        INPUT = corr_values (list of values of correlation) and point (single sample to test)
        optional = bins (bins=100) and plot (plot=True)
        OUTPUT = type I error or area under the curve from point to the right or
                 point to the left if point is negative value.
    '''
    # allow user to input bins number
    if 'bins' in kwargs.keys(): bins=kwargs['bins']
    else: bins=100

    # make histogram of corr_values
    y,x = np.histogram(corr_values, bins=bins)

    # have to make x.shape=y.shape
    x = x[:-1]

    # allow user to plot the distribution
    if 'plot' in kwargs.keys():
        if kwargs['plot']==True:
            yc = np.convolve(y, np.ones(10)/10, "same")
            if point>0:
                _right = np.where(x>=point)
                _left = np.where(x<point)
                plt.plot(x, yc, c="k")
                plt.fill_between(x[_right], yc[_right], color='g', alpha=0.4)
                plt.fill_between(x[_left], yc[_left], color='b', alpha=0.3)
            else:
                _left = np.where(x<=point)
                _right = np.where(x>point)
                plt.plot(x, yc, c="k")
                plt.fill_between(x[_right], yc[_right], color='b', alpha=0.3)
                plt.fill_between(x[_left], yc[_left], color='g', alpha=0.4)

    # get coordinates of "values as or more extreme than point"
    if point<0:
        idx = np.where(x<=point)
    else:
        idx = np.where(x>=point)

    # measure total area and area from point to it's right
    total_area = np.sum(y)
    area_type_I_error = np.sum(y[idx])

    # typeI error
    probaTypeI = area_type_I_error *1. / total_area

    return probaTypeI


# create matrix to store results. This matrix should contain for each position, all possible 20 aa with their
# corresponding tAD probabilities. Dims = [seq_position, aa.index] = predictions
def compute_mutagenesis(ohe_data, refId, deep_model):
    all_samples = ohe_data.shape[0]
    #results = np.zeros(shape=(all_samples,30,20))
    results = np.zeros(shape=(30,20))

    # go over all samples and mutate every position in a ohe setting.
    #sample = ohe_data[bestIdx[0]]#ohe_data[10].copy()
    sample = ohe_data[refId]

    # first of all measure tAD probaboility in the original sequence
    prediction = deep_model.predict(sample.reshape(np.append(1, sample.shape)))
    print('original_prediction: {}'.format(prediction[0][0]))
    # list of amino acids in ohe format
    original_seq = np.where(sample[:,:20,0]==1)[1] #ohe_data[0,:,:20,0]==1)[1]

    # start filling results with original_seq
    for n in range(30):
        results[n,original_seq[n]]=prediction[0][0]

    # go over all positions in the sequence
    for position in range(30):  #len(original_seq)
        # ohe_aminoacid in current position
        original_position = original_seq[position]
        # list all possible ohe aminoacids
        ohe_positions = list(np.arange(20))
        # remove the original aa from the list
        ohe_positions.remove(original_position)

        # copy into new instance to avoid overwriting and to restore all other positions to their
        # original values
        this_sample = sample.copy()
        # start mutation in that position
        for mutation in ohe_positions:
            # reset all aa in position to 0
            this_sample[position,:20,0]=0
            # make mutation
            this_sample[position,mutation,0]=1
            # predict the mutant tAD probability
            tmp = deep_model.predict(this_sample.reshape(np.append(1, sample.shape)))

            # If there is a radical change, let me know
            if prediction > 0.5 and tmp[0][0] < 0.5 or prediction < 0.5 and tmp[0][0] > 0.5:
                # print the original sequence
                SEQUENCE = ''.join([aa[S] for S in original_seq])
                print(SEQUENCE)
                # print which mutation at which position
                print('{}/{} at position {} -> score {} into {}'.format(aa[original_position], aa[mutation], position, prediction, tmp[0][0]))
                #print(position, aa.index(mutation))

            #results[sample_number, position, mutation] = prediction
            #result.append([aa[mutation], aa[original_position]])
            results[position,mutation]=tmp[0][0]

    return prediction, results