-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathp.py
More file actions
executable file
·133 lines (108 loc) · 5.1 KB
/
p.py
File metadata and controls
executable file
·133 lines (108 loc) · 5.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from multiprocessing import Pool
import numpy
import time
import sys
import os
import re
import Bio
from Bio import SeqIO,SeqFeature
from Bio.SeqRecord import SeqRecord
#import custom_db
numToFactor = 9766
DB_FOLDER = './db_folder/'
def convert_genbank(genbank_path, db_directory = DB_FOLDER):
record_list = []
seq_record = SeqIO.parse(open(genbank_path), "genbank").next()
nc_number = seq_record.name
# genbank information about the sequence found in the file.
accession = seq_record.id
comment = seq_record.annotations['comment']
data_file_division = seq_record.annotations['data_file_division'] # this is not very useful, since it is historical and does not reflect current taxonomy
date = seq_record.annotations['date'] # this should be the date that the file was updated last. uesful if i wish to autoupdate for more current versions of a genome
gi = seq_record.annotations['gi'] # i do not plan on using this, but it may be handly to have, so i will include anyway
key_words = seq_record.annotations['keywords']
organism = seq_record.annotations['organism'].replace(' ', '_')
sequence_version = seq_record.annotations['sequence_version'] # this is an integer
taxonomy = seq_record.annotations['taxonomy'] # this will be a list
source_folder = genbank_path.split('/')[len(genbank_path.split('/'))-2]
# this block of code determines the type of sequence [chromosome, plasmid] and if there are more than one chromosome in the organism.
# In the case of mutiple chromosomes, the code will assign a name to each, based on the information provided in the genbank file.
description = seq_record.description
if re.search("hromosome", description) is not None: # we have a segmented chromosome (more than one chromosome)
seq_type = 'chromosome'
seq_name = description.split('hromosome')[1].split(',')[0].strip()
elif re.search("lasmid", description) is not None: # we have a plasmid
seq_type = 'plasmid'
seq_name = description.split('lasmid')[1].split(',')[0].strip()
else:
seq_type = 'chromosome'
seq_name = 'complete_genome'
print 'path', genbank_path, 'source folder', source_folder, 'organism ', organism
# loop over the genbank file
for fnum, feature in enumerate(seq_record.features):
if feature.type == 'CDS':
start = feature.location._start.position
stop = feature.location._end.position
strand = feature.strand
#print 'keys = ', feature.qualifiers.keys()
try:
locus = feature.qualifiers['locus_tag'][0]
except:
locus = feature.qualifiers['gene'][0]
seq = seq_record.seq[start:stop]
if len(seq) == 0:
pass
else:
if 'gene' in feature.qualifiers:
gene = feature.qualifiers['gene'][0]
record_list.append(SeqRecord(seq, id="ref|%s|%s|%s|%s|%s|%s|%s" % (accession, locus, gene, str(start), str(stop), str(strand), organism),
description = ''))
else:
record_list.append(
SeqRecord(seq, id="ref|%s|%s|%s|%s|%s|%s|%s" % (accession, locus, 'unknown', str(start), str(stop), str(strand), organism),
description = ''))
outpath = os.path.splitext(os.path.basename(genbank_path))[0] + ".ffc"
SeqIO.write(record_list, open(db_directory + outpath,"w"), "fasta")
def isFactor(x):
result = None
div = (numToFactor / x)
if div*x == numToFactor:
result = (x,div)
return result
def testFunction(org):
seq_record = SeqIO.parse(open(org), "genbank").next()
accession = seq_record.annotations['accessions'][0]
organism = seq_record.annotations['organism'].replace(' ', '_')
for fnum, feature in enumerate(seq_record.features):
if feature.type == 'CDS':
start = feature.location._start.position
stop = feature.location._end.position
print "Accession:", accession
#this function will return all of the files that are in a directory. os.walk is recursive traversal.
def returnRecursiveDirFiles(root_dir):
result = []
for path, dir_name, flist in os.walk(root_dir):
for f in flist:
fname = os.path.join(path, f)
if os.path.isfile(fname):
result.append(fname)
return result
if __name__ == '__main__':
root_folder = './genbank_folders/'
#root_folder = './sequences/'
flist = []
flist = returnRecursiveDirFiles(root_folder)
print flist
#flist = [path + i for i in os.listdir(path)]
#print flist
start = time.time()
pool = Pool(processes=4)
#result = pool.map(testFunction, flist)
result = pool.map(convert_genbank, flist)
#possibleFactors = range(1,int(numpy.floor(numpy.sqrt(numToFactor)))+1)
#print 'Checking ', possibleFactors
#result = pool.map(isFactor, possibleFactors)
#cleaned = [x for x in result if not x is None]
#print 'Factors are', cleaned
end = time.time()
print end - start