-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathEG_parseExternals.py
More file actions
61 lines (43 loc) · 2.33 KB
/
EG_parseExternals.py
File metadata and controls
61 lines (43 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Parse all external database ids for genes from Entrez Gene that
# are relevant to the organisms we want loaded
# via the organisms table.
import Config
import sys, string
import MySQLdb
import Database
import gzip
from classes import EntrezGene
def _parseDbxref( dbxref ) :
    """Split one dbXref entry of the form "SOURCE:IDENTIFIER".

    The identifier is upper-cased and any HGNC:/MGI:/RGD: prefix that is
    repeated inside it is removed; the source name is upper-cased.

    Returns a ( identifier, source ) tuple, or None when the entry has no
    ":" separator and therefore cannot be interpreted.
    """
    source, separator, identifier = dbxref.partition( ":" )
    if not separator :
        return None

    identifier = identifier.upper( ).replace( "HGNC:", "" ).replace( "MGI:", "" ).replace( "RGD:", "" )
    return identifier.strip( ), source.strip( ).upper( )


# NOTE(review): relies on Database.db's context manager yielding a cursor
# (older MySQLdb connection behaviour) -- confirm against the Database module.
with Database.db as cursor :

    entrezGene = EntrezGene.EntrezGene( Database.db, cursor )
    existingEntrezGeneIDs = entrezGene.fetchExistingEntrezGeneIDs( )

    # Rebuild from scratch: drop every external id except the legacy GRID
    # ones, then let MySQL reclaim the freed space before the bulk insert.
    cursor.execute( "DELETE FROM " + Config.DB_NAME + ".gene_externals WHERE gene_external_source != 'GRID LEGACY'" )
    Database.db.commit( )

    cursor.execute( "OPTIMIZE TABLE " + Config.DB_NAME + ".gene_externals" )
    Database.db.commit( )

    processedCount = 0
    with gzip.open( Config.EG_GENEINFO, 'r' ) as geneInfo :
        # Iterate the handle directly so the (potentially very large)
        # decompressed file is streamed instead of being read into memory.
        for line in geneInfo :

            # gzip yields bytes on Python 3; this is a no-op on Python 2.
            if not isinstance( line, str ) :
                line = line.decode( "utf-8" )

            line = line.strip( )

            # Ignore blank lines and the header line
            if not line or line.startswith( "#" ) :
                continue

            splitLine = line.split( "\t" )

            # Columns 1 (gene id) and 5 (dbXrefs) are both required
            if len( splitLine ) < 6 :
                continue

            sourceID = splitLine[1].strip( )
            if sourceID not in existingEntrezGeneIDs :
                continue

            currentGeneID = existingEntrezGeneIDs[sourceID]

            # "-" is the placeholder for "no cross references"
            dbxrefField = splitLine[5].strip( )
            if "-" != dbxrefField :
                for dbxref in dbxrefField.split( "|" ) :
                    parsedXref = _parseDbxref( dbxref )
                    if parsedXref is None :
                        # A single malformed entry must not abort the whole
                        # load, since the table has already been emptied.
                        sys.stderr.write( "Skipping malformed dbxref '" + dbxref + "' for Entrez Gene ID " + sourceID + "\n" )
                        continue

                    cursor.execute( "INSERT INTO " + Config.DB_NAME + ".gene_externals VALUES ( '0', %s, %s, 'active', NOW( ), %s )", [parsedXref[0], parsedXref[1], currentGeneID] )

            # Every loaded gene also gets its own Entrez Gene id recorded,
            # both in plain form and with the ETG prefix.
            cursor.execute( "INSERT INTO " + Config.DB_NAME + ".gene_externals VALUES ( '0', %s, 'ENTREZ_GENE', 'active', NOW( ), %s )", [sourceID, currentGeneID] )
            cursor.execute( "INSERT INTO " + Config.DB_NAME + ".gene_externals VALUES ( '0', %s, 'ENTREZ_GENE_ETG', 'active', NOW( ), %s )", ['ETG' + str(sourceID), currentGeneID] )

            # Count every gene written so the periodic commit fires once per
            # Config.DB_COMMIT_COUNT genes, whether or not they had dbxrefs.
            processedCount = processedCount + 1
            if 0 == (processedCount % Config.DB_COMMIT_COUNT ) :
                Database.db.commit( )

    # Genes sourced from SGD carry their external id in the genes table
    # itself; copy those over. This commit also flushes any rows still
    # pending from the loop above.
    cursor.execute( "INSERT INTO " + Config.DB_NAME + ".gene_externals SELECT '0', gene_source_id, 'SGD', 'active', NOW( ), gene_id FROM " + Config.DB_NAME + ".genes WHERE gene_source='SGD'" )
    Database.db.commit( )

    # Record that this update step has been run
    cursor.execute( "INSERT INTO " + Config.DB_STATS + ".update_tracker VALUES ( '0', 'EG_parseExternals', NOW( ) )" )
    Database.db.commit( )

sys.exit( )