-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_sample_data.py
More file actions
128 lines (101 loc) · 4.19 KB
/
create_sample_data.py
File metadata and controls
128 lines (101 loc) · 4.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import numpy as np
import pandas as pd
from sklearn.preprocessing import scale
import os
# Set a random seed for reproducibility
np.random.seed(42)
def generate_sample_data(n_samples=500):
"""
Generate synthetic mosquito genomic data with different features for classification.
Parameters:
-----------
n_samples : int
Number of samples to generate
Returns:
--------
pandas.DataFrame
DataFrame containing synthetic genomic data
"""
# Define the species to include
species = ['Anopheles_gambiae', 'Aedes_aegypti', 'Culex_pipiens', 'Anopheles_stephensi']
n_species = len(species)
# Assign roughly equal number of samples to each species
samples_per_species = n_samples // n_species
# Create labels
labels = np.concatenate([np.full(samples_per_species, species[i]) for i in range(n_species)])
# In case there's a remainder due to integer division
if len(labels) < n_samples:
remainder = n_samples - len(labels)
labels = np.concatenate([labels, np.full(remainder, species[0])])
# Shuffle the labels
np.random.shuffle(labels)
# Generate features
# 1. Gene expression data (10 genes with different expression patterns per species)
gene_expressions = np.zeros((n_samples, 10))
# Define mean expression patterns for each species
gene_means = {
'Anopheles_gambiae': np.random.normal(5, 1, 10),
'Aedes_aegypti': np.random.normal(7, 1, 10),
'Culex_pipiens': np.random.normal(3, 1, 10),
'Anopheles_stephensi': np.random.normal(6, 1, 10)
}
# Generate expressions for each sample based on its species
for i, species_name in enumerate(labels):
gene_expressions[i] = np.random.normal(gene_means[species_name], 1)
# Scale gene expressions
gene_expressions = scale(gene_expressions)
# 2. SNP data (20 SNPs)
snp_data = np.zeros((n_samples, 20))
# Different SNP patterns for each species
snp_probs = {
'Anopheles_gambiae': np.random.uniform(0.2, 0.4, 20),
'Aedes_aegypti': np.random.uniform(0.5, 0.7, 20),
'Culex_pipiens': np.random.uniform(0.1, 0.3, 20),
'Anopheles_stephensi': np.random.uniform(0.4, 0.6, 20)
}
# Generate SNP data (0, 1, or 2 representing homozygous reference, heterozygous, homozygous alternate)
for i, species_name in enumerate(labels):
for j in range(20):
p = snp_probs[species_name][j]
# Simplified model of genotypes
probs = [(1-p)**2, 2*p*(1-p), p**2] # Probabilities for 0, 1, 2
snp_data[i, j] = np.random.choice([0, 1, 2], p=probs)
# 3. Morphological characteristics (5 features)
morphology = np.zeros((n_samples, 5))
# Means for morphological features for each species
morph_means = {
'Anopheles_gambiae': [2.5, 0.8, 1.2, 3.3, 0.5],
'Aedes_aegypti': [3.1, 0.6, 1.5, 2.8, 0.7],
'Culex_pipiens': [2.8, 0.7, 1.0, 3.0, 0.4],
'Anopheles_stephensi': [2.6, 0.9, 1.3, 3.2, 0.6]
}
# Generate morphological data
for i, species_name in enumerate(labels):
morphology[i] = np.random.normal(morph_means[species_name], 0.1)
# Combine all features
all_features = np.hstack((gene_expressions, snp_data, morphology))
# Create column names
columns = (
[f'gene_expr_{i}' for i in range(1, 11)] +
[f'snp_{i}' for i in range(1, 21)] +
['body_length', 'wing_width', 'proboscis_length', 'leg_length', 'thorax_width']
)
# Create the DataFrame
df = pd.DataFrame(all_features, columns=columns)
df['species'] = labels
return df
def main():
print("Generating synthetic mosquito genomic dataset...")
df = generate_sample_data(n_samples=500)
# Print dataset summary
print(f"Dataset shape: {df.shape}")
print("\nSpecies distribution:")
print(df['species'].value_counts())
# Save to CSV
output_path = 'mosquito_genomic_data.csv'
df.to_csv(output_path, index=False)
print(f"\nDataset saved to {os.path.abspath(output_path)}")
print(f"First 5 rows of the dataset:")
print(df.head())
if __name__ == "__main__":
main()