-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathREFSEQ_parseProteinFile.py
More file actions
86 lines (60 loc) · 2.88 KB
/
REFSEQ_parseProteinFile.py
File metadata and controls
86 lines (60 loc) · 2.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# Parse REFSEQ FASTA files and load sequence data
# into the database
import Config
import sys, string, argparse
import MySQLdb
import Database
import glob
from classes import Refseq
# Process Command Line Input
argParser = argparse.ArgumentParser( description = 'Parse a specific protein file.' )
argParser.add_argument( '-f', help = 'Protein File', dest = 'proteinFile', required=True, action='store' )
inputArgs = vars( argParser.parse_args( ) )
filename = inputArgs['proteinFile']
with Database.db as cursor :
refseq = Refseq.Refseq( Database.db, cursor )
accessionHash = refseq.buildAccessionMappingHash( )
organismHash = refseq.buildOrganismMappingHash( )
print "Working on File: " + filename
sequences = { }
with open( filename, "r" ) as file :
currentInfo = { }
for line in file.readlines( ) :
line = line.strip( )
# Skip Blank Lines
if len( line ) <= 0 :
continue
if ">" == line[0] :
if len( currentInfo ) > 0 :
sequences[currentInfo['ACCESSION']] = currentInfo
currentInfo = { }
splitHeader = line.split( "|" )
currentInfo["GI"] = splitHeader[1].strip( )
currentInfo["DESC"] = splitHeader[4].strip( )
currentInfo["DESC"] = currentInfo["DESC"][:currentInfo["DESC"].rfind( "[" )].strip( )
accessionFull = splitHeader[3].strip( ).split( "." )
currentInfo["ACCESSION"] = accessionFull[0]
currentInfo["VERSION"] = accessionFull[1]
currentInfo["SEQUENCE"] = []
else :
currentInfo["SEQUENCE"].append( line.upper( ).strip( ) )
# Load the last sequence from the file
if len( currentInfo ) > 0 :
sequences[currentInfo['ACCESSION']] = currentInfo
currentInfo = { }
for accession,sequenceInfo in sequences.items( ) :
accession = accession.strip( )
sequence = "".join( sequenceInfo["SEQUENCE"] )
sequenceLength = len( sequence )
taxID = organismHash[sequenceInfo["GI"]]
if accession in accessionHash :
print "UPDATING SEQUENCE"
cursor.execute( "UPDATE " + Config.DB_NAME + ".refseq SET refseq_gi=%s, refseq_sequence=%s, refseq_length=%s, refseq_description=%s, refseq_version=%s, refseq_modified=NOW( ), refseq_status='active', organism_id=%s WHERE refseq_id=%s", [sequenceInfo['GI'], sequence, sequenceLength, sequenceInfo["DESC"], sequenceInfo["VERSION"], taxID, accessionHash[accession]] )
else :
print "INSERTING NEW SEQUENCE"
cursor.execute( "INSERT INTO " + Config.DB_NAME + ".refseq VALUES( '0', %s, %s, %s, %s, %s, %s, %s, NOW( ), 'active' )", [accession, sequenceInfo["GI"], sequence, sequenceLength, sequenceInfo["DESC"], sequenceInfo["VERSION"], taxID] )
accessionHash[accession] = cursor.lastrowid
Database.db.commit( )
cursor.execute( "INSERT INTO " + Config.DB_STATS + ".update_tracker VALUES ( '0', 'REFSEQ_parseProteins', NOW( ) )" )
Database.db.commit( )
sys.exit( )