-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathUNIPROT_parseIsoforms.py
More file actions
79 lines (53 loc) · 2.26 KB
/
UNIPROT_parseIsoforms.py
File metadata and controls
79 lines (53 loc) · 2.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Parse the UniProt isoform FASTA file (Config.UP_ISOFORMS) and load
# its isoform sequence data into the database
import Config
import sys, string
import MySQLdb
import Database
import re
from gzip import open as gopen
from classes import UniprotKB
# Matches the third "|"-delimited field of a FASTA header line:
#   group 1 = entry name (two [A-Z0-9] runs joined by "_"),
#   group 2 = description, group 3 = optional "OS=..." part,
#   group 5 = optional gene name following "GN=".
# NOTE(review): re.VERBOSE makes the regex engine ignore the unescaped spaces
# written in this pattern, so they are not enforced as separators. Confirm
# whether the flag is intentional before editing the pattern.
descRE = re.compile( '^([A-Z0-9]+_{1}[A-Z0-9]+) (.*?) (OS=.*?)? (GN=(.*?))?$', re.VERBOSE )


def _parseHeader( header ) :
    """Build a fresh isoform record from a FASTA header line (one starting with ">").

    The second "|"-delimited field is split on "-" into ACCESSION and ISOFORM.
    The third field is matched against descRE to fill NAME, DESC and GENE
    (GENE is "" when no "GN=" part was matched). SEQUENCE starts as an empty
    list that the caller appends sequence lines to.

    Raises ValueError, quoting the header, when the line does not have that
    shape, instead of failing later with an anonymous IndexError or
    AttributeError.
    """
    splitHeader = header.split( "|" )
    if len( splitHeader ) < 3 :
        raise ValueError( "FASTA header does not have three '|'-delimited fields: %r" % ( header, ) )

    splitAccession = splitHeader[1].split( "-" )
    if len( splitAccession ) < 2 :
        raise ValueError( "FASTA header accession has no '-<isoform>' suffix: %r" % ( header, ) )

    descMatches = descRE.search( splitHeader[2] )
    if descMatches is None :
        raise ValueError( "FASTA header description could not be parsed: %r" % ( header, ) )

    gene = descMatches.group( 5 )

    return {
        "ACCESSION" : splitAccession[0].strip( ),
        "ISOFORM" : splitAccession[1].strip( ),
        "NAME" : descMatches.group( 1 ).strip( ),
        "DESC" : descMatches.group( 2 ).strip( ),
        "GENE" : gene.strip( ) if gene is not None else "",
        "SEQUENCE" : [],
    }


def _storeIsoform( uniprotKB, accessionHash, organismHash, record ) :
    """Hand a completed record to uniprotKB.processIsoform when its accession is known.

    Empty records (nothing parsed yet) and records whose ACCESSION is not a
    key of accessionHash are silently ignored.
    """
    if not record :
        return

    accession = record["ACCESSION"]
    if accession not in accessionHash :
        return

    uniprotID = accessionHash[accession]
    organismID = organismHash[uniprotID]
    uniprotKB.processIsoform( uniprotID, organismID, record )


with Database.db as cursor :

    # Mark every existing isoform row inactive before processing the file
    cursor.execute( "UPDATE " + Config.DB_NAME + ".uniprot_isoforms SET uniprot_isoform_status='inactive'" )
    Database.db.commit( )

    uniprotKB = UniprotKB.UniprotKB( Database.db, cursor )
    accessionHash = uniprotKB.buildAccessionHash( )
    organismHash = uniprotKB.buildOrganismHash( )

    # NOTE(review): gzip's default mode is binary. The ">" comparison below
    # relies on lines being str, which holds on Python 2; under Python 3 this
    # would need text mode. Confirm the target interpreter.
    with gopen( Config.UP_ISOFORMS ) as isoformFile :
        currentInfo = { }

        # Iterate the file object directly so lines are streamed instead of
        # the whole decompressed file being held in memory by readlines()
        for line in isoformFile :
            line = line.strip( )

            # Skip Blank Lines
            if not line :
                continue

            if ">" == line[0] :
                # A new header closes the previous record, if there was one
                _storeIsoform( uniprotKB, accessionHash, organismHash, currentInfo )
                currentInfo = _parseHeader( line )
            else :
                # line is already stripped, so only the case needs normalising
                currentInfo["SEQUENCE"].append( line.upper( ) )

    # Load the last sequence from the file (no following header closes it)
    _storeIsoform( uniprotKB, accessionHash, organismHash, currentInfo )

    # Record this run in the update tracker table
    cursor.execute( "INSERT INTO " + Config.DB_STATS + ".update_tracker VALUES ( '0', 'UNIPROT_parseIsoforms', NOW( ) )" )
    Database.db.commit( )

sys.exit( )