SynteractTurbo/npy_to_sql.py at main · Gleghorn-Lab/SynteractTurbo · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
import numpy as np
import sqlite3
import pandas as pd
import os
from tqdm.auto import tqdm


def convert_npy_to_sql(npy_path, db_path):
    """
    Convert .npy file containing pairs and measurements to SQLite database.

    Args:
        npy_path (str): Path to the .npy file
        db_path (str): Path where the SQLite database will be created
    """
    print(f"Loading data from {npy_path}...")

    # Load the .npy file
    loaded_data = np.load(npy_path, allow_pickle=True).item()
    measurements = loaded_data['measurements']
    pairs = loaded_data['pairs']

    print(f"Found {len(pairs)} pairs with {len(measurements)} measurements")

    # Create SQLite database
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Create table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS protein_pairs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            protein1 TEXT NOT NULL,
            protein2 TEXT NOT NULL,
            score INTEGER NOT NULL
        )
    ''')

    # Create indexes for faster querying
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_protein1 ON protein_pairs(protein1)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_protein2 ON protein_pairs(protein2)')
    cursor.execute('CREATE INDEX IF NOT EXISTS idx_both_proteins ON protein_pairs(protein1, protein2)')

    print("Inserting data into database...")

    # Insert data in batches for better performance
    batch_size = 10000
    data_to_insert = []

    for i, (pair, score) in tqdm(enumerate(zip(pairs, measurements)), total=len(pairs), desc="Inserting data"):
        protein1, protein2 = pair
        data_to_insert.append((protein1, protein2, int(score)))

        # Insert in batches
        if len(data_to_insert) >= batch_size:
            cursor.executemany(
                'INSERT INTO protein_pairs (protein1, protein2, score) VALUES (?, ?, ?)',
                data_to_insert
            )
            data_to_insert = []

    # Insert remaining data
    if data_to_insert:
        cursor.executemany(
            'INSERT INTO protein_pairs (protein1, protein2, score) VALUES (?, ?, ?)',
            data_to_insert
        )

    conn.commit()
    conn.close()

    print(f"Database created successfully at {db_path}")


def query_protein_pairs(db_path, protein_name, min_score=None, max_score=None):
    """
    Query the database for all pairs involving a specific protein.

    Args:
        db_path (str): Path to the SQLite database
        protein_name (str): The protein name to search for
        min_score (int, optional): Minimum score threshold
        max_score (int, optional): Maximum score threshold

    Returns:
        pandas.DataFrame: DataFrame containing all pairs and scores involving the protein
    """
    conn = sqlite3.connect(db_path)

    # Build the query
    query = '''
        SELECT protein1, protein2, score
        FROM protein_pairs
        WHERE protein1 = ? OR protein2 = ?
    '''
    params = [protein_name, protein_name]

    # Add score filters if provided
    if min_score is not None:
        query += ' AND score >= ?'
        params.append(min_score)

    if max_score is not None:
        query += ' AND score <= ?'
        params.append(max_score)

    query += ' ORDER BY score DESC'

    # Execute query and return as DataFrame
    df = pd.read_sql_query(query, conn, params=params)
    conn.close()

    return df


def get_database_stats(db_path):
    """
    Get basic statistics about the database.

    Args:
        db_path (str): Path to the SQLite database

    Returns:
        dict: Dictionary containing database statistics
    """
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Total number of pairs
    cursor.execute('SELECT COUNT(*) FROM protein_pairs')
    total_pairs = cursor.fetchone()[0]

    # Unique proteins
    cursor.execute('SELECT COUNT(DISTINCT protein1) + COUNT(DISTINCT protein2) FROM protein_pairs')
    # More accurate count of unique proteins
    cursor.execute('''
        SELECT COUNT(DISTINCT protein) FROM (
            SELECT protein1 as protein FROM protein_pairs
            UNION
            SELECT protein2 as protein FROM protein_pairs
        )
    ''')
    unique_proteins = cursor.fetchone()[0]

    # Score statistics
    cursor.execute('SELECT MIN(score), MAX(score), AVG(score) FROM protein_pairs')
    min_score, max_score, avg_score = cursor.fetchone()

    conn.close()

    return {
        'total_pairs': total_pairs,
        'unique_proteins': unique_proteins,
        'min_score': min_score,
        'max_score': max_score,
        'avg_score': round(avg_score, 2) if avg_score else None
    }


def search_proteins_by_pattern(db_path, pattern):
    """
    Search for proteins matching a pattern.

    Args:
        db_path (str): Path to the SQLite database
        pattern (str): SQL LIKE pattern (use % as wildcard)

    Returns:
        list: List of unique proteins matching the pattern
    """
    conn = sqlite3.connect(db_path)

    query = '''
        SELECT DISTINCT protein FROM (
            SELECT protein1 as protein FROM protein_pairs WHERE protein1 LIKE ?
            UNION
            SELECT protein2 as protein FROM protein_pairs WHERE protein2 LIKE ?
        ) ORDER BY protein
    '''

    df = pd.read_sql_query(query, conn, params=[pattern, pattern])
    conn.close()

    return df['protein'].tolist()


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Convert .npy file to SQLite database")
    parser.add_argument("--npy_path", type=str, required=True, help="Path to the .npy file")
    parser.add_argument("--db_path", type=str, required=True, help="Path to the SQLite database")
    args = parser.parse_args()

    npy_path = args.npy_path
    db_path = args.db_path

    if not os.path.exists(db_path):
        print("Converting .npy file to SQLite database...")
        convert_npy_to_sql(npy_path, db_path)
    else:
        print(f"Database already exists at {db_path}")

    print("\nDatabase Statistics:")
    stats = get_database_stats(db_path)
    for key, value in stats.items():
        print(f"  {key}: {value}")

    # Example queries
    print("\n" + "="*50)
    print("EXAMPLE USAGE:")
    print("="*50)

    # Get a sample protein for demonstration
    conn = sqlite3.connect(db_path)
    sample_protein = pd.read_sql_query('SELECT protein1 FROM protein_pairs LIMIT 1', conn)['protein1'].iloc[0]
    conn.close()

    print(f"\nExample 1: Query all pairs for protein '{sample_protein}':")
    result = query_protein_pairs(db_path, sample_protein)
    print(f"Found {len(result)} pairs involving this protein")
    print(result.head())

    result.to_csv('csv_path.csv', index=False)

    min_score = 50
    print(f"\nExample 2: Query pairs for '{sample_protein}' with score >= {min_score}:")
    result_filtered = query_protein_pairs(db_path, sample_protein, min_score=min_score)
    print(f"Found {len(result_filtered)} with score >= {min_score}")
    if len(result_filtered) > 0:
        print(result_filtered.head())