-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathEG_parseAliases.py
More file actions
63 lines (45 loc) · 2.12 KB
/
EG_parseAliases.py
File metadata and controls
63 lines (45 loc) · 2.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Parse all aliases for genes from Entrez Gene that
# are relevant to the organisms we want loaded
# via the organisms table.
import Config
import sys, string
import MySQLdb
import Database
import gzip
from classes import EntrezGene
# Rebuild the gene_aliases table from the Entrez Gene gene_info dump.
#
# Steps, all performed through the shared Database.db connection:
#   1. Fetch the Entrez Gene IDs that are already loaded.
#   2. Truncate gene_aliases so the table is rebuilt from scratch.
#   3. Stream the gzipped gene_info file and, for every gene we already
#      know about, insert its official symbol, systematic name and synonyms.
#   4. Record this run in the update_tracker stats table.
#
# NOTE(review): this relies on Database.db's context manager yielding a
# cursor-like object (cursor.execute is called on it) - confirm against
# the Database module / MySQLdb version in use.
with Database.db as cursor :

    entrezGene = EntrezGene.EntrezGene( Database.db, cursor )

    # Used below with "in" and "[ ]": maps an Entrez Gene source ID to the
    # gene ID stored alongside each alias (presumably a dict - TODO confirm).
    existingEntrezGeneIDs = entrezGene.fetchExistingEntrezGeneIDs( )

    cursor.execute( "TRUNCATE TABLE " + Config.DB_NAME + ".gene_aliases" )
    Database.db.commit( )

    # The INSERT statements differ only in the alias type, so build them
    # once here instead of re-concatenating the SQL for every row.
    insertSQL = { }
    for aliasType in ( "entrez-official", "ordered locus", "synonym" ) :
        insertSQL[aliasType] = "INSERT INTO " + Config.DB_NAME + ".gene_aliases VALUES ( '0', %s, 'active', '" + aliasType + "', NOW( ), %s )"

    # Number of rows inserted since the last commit
    pendingInserts = 0

    with gzip.open( Config.EG_GENEINFO, 'r' ) as geneInfoFile :

        # Iterate the file object directly so the dump is streamed line by
        # line rather than loaded into memory in one go by readlines( ).
        for line in geneInfoFile :

            # gzip yields bytes on Python 3; decode so the str operations
            # below work. On Python 2 the line already is a str (no-op).
            if not isinstance( line, str ) :
                line = line.decode( "utf-8" )

            line = line.strip( )

            # Ignore blank lines (indexing line[0] on them would raise
            # IndexError) and the header line
            if not line or line.startswith( "#" ) :
                continue

            # Tab-separated record; columns 1-4 are read as source ID,
            # official symbol, systematic name and "|"-separated synonyms.
            # A record with fewer than five columns still raises IndexError
            # on purpose, so a malformed dump is not silently half-loaded.
            splitLine = line.split( "\t" )
            sourceID = splitLine[1].strip( )
            officialSymbol = splitLine[2].strip( )
            systematicName = splitLine[3].strip( )
            synonymField = splitLine[4].strip( )

            # Only genes that are already loaded get aliases
            if sourceID not in existingEntrezGeneIDs :
                continue

            currentGeneID = existingEntrezGeneIDs[sourceID]

            # A value of "-" marks an empty field in the dump
            if "-" != officialSymbol :
                cursor.execute( insertSQL["entrez-official"], [officialSymbol, currentGeneID] )
                pendingInserts += 1

            if "-" != systematicName :
                cursor.execute( insertSQL["ordered locus"], [systematicName, currentGeneID] )
                pendingInserts += 1

            if "-" != synonymField :
                officialLower = officialSymbol.lower( )
                systematicLower = systematicName.lower( )
                for synonym in synonymField.split( "|" ) :
                    synonym = synonym.strip( )
                    # Skip synonyms that merely repeat (case-insensitively)
                    # the official symbol or the systematic name
                    if synonym.lower( ) not in ( officialLower, systematicLower ) :
                        cursor.execute( insertSQL["synonym"], [synonym, currentGeneID] )
                        pendingInserts += 1

            # Commit in batches. Counting every actual INSERT and comparing
            # with >= (instead of testing "count % N == 0") guarantees a
            # commit once N rows are pending, even though a single gene can
            # add several rows and step over an exact multiple of N.
            if pendingInserts >= Config.DB_COMMIT_COUNT :
                Database.db.commit( )
                pendingInserts = 0

    # Record that this update step ran, then flush any remaining inserts
    cursor.execute( "INSERT INTO " + Config.DB_STATS + ".update_tracker VALUES ( '0', 'EG_parseAliases', NOW( ) )" )
    Database.db.commit( )

sys.exit( )