-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
124 lines (99 loc) · 3.37 KB
/
utils.py
File metadata and controls
124 lines (99 loc) · 3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# Utils for COMP 5970 Project 5
import json
import os
# Absolute path of this module, and the folder that contains it
this_script = os.path.abspath(__file__)
parent_directory = os.path.split(this_script)[0]
def read_fasta(file_path, dir=None):
    """
    Reads a fasta sequence

    The first line of the file is treated as the title line and discarded.
    Every remaining line is stripped of surrounding whitespace and the pieces
    are concatenated in file order; later lines are not inspected for
    additional headers.

    :param file_path: name or path of the fasta file; None returns None
    :param dir: optional directory that file_path is joined onto
    :return: A fasta sequence as a string, in upper case (None if file_path
        is None)
    """
    if file_path is None:
        return None
    if dir:
        file_path = os.path.join(dir, file_path)
    with open(file_path, 'r') as f:
        # Ignore the title line
        f.readline()
        # Join once rather than growing a string with += on every line
        sequence = ''.join(line.strip() for line in f)
    return sequence.upper()
def read_directory_contents(path, file_extension):
    """
    Lists all entries ending with the file_extension in the directory path

    :param path: directory to list
    :param file_extension: suffix every returned name must end with
    :return: list of entry names (not joined with path), in the order
        os.listdir reports them
    :raises NotADirectoryError: if path is not an existing directory
    """
    if not os.path.isdir(path):
        # NotADirectoryError is a subclass of Exception, so callers that
        # caught the generic Exception raised here before still work.
        raise NotADirectoryError('Not a valid directory!')
    # Keep only the names with the requested extension
    return [name for name in os.listdir(path) if name.endswith(file_extension)]
def read_pssm(file_path, dir=None):
    """
    Reads in the right half of a PSSM file

    The file is expected to hold an optional blank line, a title line, a
    column-label line and then one line per matrix row. Reading stops at the
    first blank (or whitespace-only) line after the rows.

    :param file_path: name or path of the PSSM file
    :param dir: optional directory that file_path is joined onto
    :return: list of dictionaries, where each dict is a row of the matrix.
        Keys are amino acids, values are the value in the row of the matrix
        (as int). Each dict also has a 'this-acid' key holding the second
        token of its row.
    :raises IndexError: if a row has fewer value tokens than there are labels
    :raises ValueError: if a value token is not an integer
    """
    if dir:
        file_path = os.path.join(dir, file_path)
    pssm = []
    with open(file_path, 'r') as f:
        # Ignore the title line; if the file opens with a blank line the
        # title is on the line after it.
        title = f.readline()
        if not title.strip():
            f.readline()
        # Get the list of amino acids on the top: the labels of the right
        # half are everything after the first 20 tokens.
        headers = f.readline().split()[20:]
        # Now, read each line of the matrix into a dictionary
        for line in f:
            # A blank or whitespace-only line ends the matrix
            if not line.strip():
                break
            tokens = line.split()
            row = {'this-acid': tokens[1]}
            # Skip the first 22 tokens (the two leading row tokens plus 20
            # columns) and leave out the last two tokens of the row.
            values = tokens[22:-2]
            for column, acid in enumerate(headers):
                row[acid] = int(values[column])
            pssm.append(row)
    # Returns a list of dictionaries, where each dict is a row of the matrix
    return pssm
def read_tmalign(file_path, dir=None):
    """
    Reads in a tmalign file

    Scans for the first line starting with 'TM-score', parses the score on
    that line and on the line directly after it, and averages the two.

    :param file_path: name or path of the tmalign output file
    :param dir: optional directory that file_path is joined onto
    :return: mean of the two TM scores, as a float
    :raises ValueError: if the file has no line starting with 'TM-score'
        (the previous loop never terminated in that case)
    """
    if dir:
        file_path = os.path.join(dir, file_path)
    with open(file_path, 'r') as f:
        lines = iter(f)
        # Skip the header; stop at end of file instead of looping forever
        first = next((ln for ln in lines if ln.startswith('TM-score')), None)
        if first is None:
            raise ValueError(
                "No line starting with 'TM-score' in {}".format(file_path))
        tm_score_1 = get_tm_score_from_tmalign(first)
        # The second score is read from the very next line ('' at end of file)
        tm_score_2 = get_tm_score_from_tmalign(next(lines, ''))
    return (tm_score_1 + tm_score_2) / 2.0


def get_tm_score_from_tmalign(line):
    """
    Parses the score out of one 'TM-score' line

    :param line: a line of text whose second whitespace-separated token is
        the score
    :return: that second token converted to float
    :raises IndexError: if the line has fewer than two tokens
    :raises ValueError: if the second token is not a number
    """
    line_parts = line.split()
    return float(line_parts[1])
def write_model(model, file_name='model.json', dir=parent_directory):
    """
    Dumps a JSON-serialisable object into a file

    :param model: object handed to json.dump
    :param file_name: name of the file to write
    :param dir: directory that file_name is joined onto; a falsy value
        leaves file_name as given. Defaults to the directory of this module.
    :return: None
    """
    target = os.path.join(dir, file_name) if dir else file_name
    with open(target, 'w') as handle:
        json.dump(model, handle)
def read_model(file_name='model.json', dir=parent_directory):
    """
    Loads the JSON object stored in a model file

    :param file_name: name of the file to read
    :param dir: directory that file_name is joined onto; a falsy value
        leaves file_name as given. Defaults to the directory of this module.
    :return: Loaded contents of model file
    """
    source = os.path.join(dir, file_name) if dir else file_name
    with open(source, 'r') as handle:
        return json.load(handle)