ylaboratory · miraesunnykim · Apr 25, 2025 · Apr 25, 2025 · May 8, 2025 · May 8, 2025
diff --git a/.gitignore b/.gitignore
@@ -13,5 +13,6 @@ _*
 _*/**
 /data
 /raw
+/download
 figures**/**
-download/**
+
diff --git a/README.md b/README.md
@@ -6,8 +6,8 @@ training and evaluation can be found on [HuggingFace](https://huggingface.co/dat
 
 ## Citation
 
-> Ontology-aware DNA methylation classification with a curated atlas of human tissues and cell types.
-Kim M, Dannenfelser R, Cui Y, Allen G, and Yao V. (2025) BioRxiv. 
+> Ontology-aware DNA methylation classification with a curated atlas of human tissues and cell types.  
+Kim M, Dannenfelser R, Cui Y, Allen G, and Yao V. (2025) bioRxiv. https://doi.org/10.1101/2025.04.18.649618
 
 ## Getting started
 
@@ -36,7 +36,7 @@ downstream analysis and classification scripts.
 cd methylation-classification
 mkdir download
 cd download
-huggingface-cli download ylab/methyl-classification
+huggingface-cli download ylab/methyl-classification --repo-type dataset --local-dir .
 ```
 
 To run the feature selection part of our pipeline we require

diff --git a/annotation/cv_split.dill b/annotation/cv_split.dill
diff --git a/src/1-stratify-into-folds.py b/src/1-stratify-into-folds.py
@@ -1,3 +1,14 @@
+"""
+1-stratify-into-folds.py
+This script sets up the data for analysis.
+    - input: 
+        - parquet files of preprocessed M-values and metadata
+    - output: 
+        - pickled files of [training M-values, training metadata]
+        - pickled files of [full ontology, training ontology]
+
+** To replicate published results, use split indices in ./../annotation/cv_splits.dill
+"""
 import dill
 import os
 from sklearn.model_selection import StratifiedGroupKFold
@@ -7,42 +18,41 @@
 import pandas as pd
 import networkx as nx
 
-# Path to downloaded parquet files from huggingface
-DOWNLOAD_PATH = './../download'
-# Path to load/save the annotations/ontologies for easy load
-ANNOTATION_PATH = './../annotation'
-# Path to save the processed data for easy load
-PREPROCESSED_PATH = './../data/GEO/preprocessed'
+random.seed(9)
+
+### Relative paths ###
+DOWNLOAD_PATH = './../download' # Path to downloaded parquet files from huggingface
+ANNOTATION_PATH = './../annotation' # Path to load/save the annotations/ontologies for easy load
+PREPROCESSED_PATH = './../data/GEO/preprocessed' # Path to save the processed data for easy load
 if not os.path.exists(PREPROCESSED_PATH):
     os.makedirs(PREPROCESSED_PATH, exist_ok=True)
+
+### Ontologies ###
+# Load ontologies using edgelist and save as dill
+full_ontology = nx.read_edgelist(f'{ANNOTATION_PATH}/full_ontology.edgelist', create_using=nx.DiGraph)
+training_ontology = nx.read_edgelist(f'{ANNOTATION_PATH}/training_ontology.edgelist', create_using=nx.DiGraph)
+
+with open(f'{ANNOTATION_PATH}/ontologies.dill', 'wb') as f:
+    dill.dump([full_ontology, training_ontology], f)
 
+### Training data into cross-validation splits ###
 # Load the data using pyarrow and save as dill for faster reading
 files = {
-    "Mv": f"{DOWNLOAD_PATH}/training_mvalues.parquet",
-    "meta": f"{DOWNLOAD_PATH}/training_meta.parquet",
+    "Mv": f"{DOWNLOAD_PATH}/train.parquet",
+    "meta": f"{DOWNLOAD_PATH}/metadata.parquet",
 }
 Mv, meta = [pq.read_table(path).to_pandas() for path in files.values()]
+Mv = Mv.set_index('Sample') if 'Sample' in Mv.columns else Mv
+meta = meta.set_index('Sample') if 'Sample' in meta.columns else meta
+meta = meta.loc[Mv.index]
 
-random.seed(9)
+# Random shuffle while being grouped by Dataset
 groups = meta['Dataset'].unique()
 random.shuffle(groups)
-
 meta = meta.reset_index().rename(columns={'index':'Sample'}).set_index("Dataset").loc[groups].reset_index().set_index("Sample")
-Mv = Mv.T.loc[meta.index]
-
+Mv = Mv.loc[meta.index]
 print(Mv.shape, meta.shape)
 
-with open(f'{PREPROCESSED_PATH}/training.dill', 'wb') as f:
-    dill.dump([Mv, meta], f)
-
-
-# Load ontologies using edgelist and save as dill
-full_ontology = nx.read_edgelist(f'{DOWNLOAD_PATH}/full_ontology.edgelist', create_using=nx.DiGraph)
-training_ontology = nx.read_edgelist(f'{DOWNLOAD_PATH}/training_ontology.edgelist', create_using=nx.DiGraph)
-
-with open(f'{ANNOTATION_PATH}/ontologies.dill', 'wb') as f:
-    dill.dump([full_ontology, training_ontology], f)
-
 
 # Stratify into folds grouped by GSE
 sgkf = StratifiedGroupKFold(n_splits=3, shuffle=False, random_state=None)
@@ -51,7 +61,7 @@
 fold_Mvs = dict()
 fold_selectors = dict()
 
-for i, (train_index, test_index) in enumerate(sgkf.split(Mv, meta['training.ID'], meta['Dataset'])):
+for i, (train_index, test_index) in enumerate(sgkf.split(X=Mv, y=meta['training.ID'], groups=meta['Dataset'])):
     print(f"fold {i}:")
     print(f"\ttrain: len={len(train_index)}, groups={len(meta['Dataset'][train_index].unique())}")
     print(f"\tvalidation:  len={len(test_index)}, groups={len(meta['Dataset'][test_index].unique())}")
@@ -64,10 +74,36 @@
     print(f"\ttrain and validation shapes: {rest_Mv.shape, holdout_Mv.shape}")
 
     fold_Mvs[i] = [rest_Mv, rest_meta, holdout_Mv, holdout_meta]
+
+# OR replicate using cv_split.dill
+print(f"\nusing cv_splits...")
+cv_split = dill.load(open(f"{ANNOTATION_PATH}/cv_split.dill", "rb"))
+Mv = Mv.loc[cv_split['all']]
+meta = meta.loc[Mv.index]
+for fold in range(3):
+    train_index = cv_split[fold]['train']
+    test_index = cv_split[fold]['validation']
+
+    print(f"fold {fold}:")
+    print(f"\ttrain: len={len(train_index)}, groups={len(meta.loc[train_index]['Dataset'].unique())}")
+    print(f"\tvalidation:  len={len(test_index)}, groups={len(meta.loc[test_index]['Dataset'].unique())}")
 
+    rest_Mv = Mv.loc[train_index]
+    rest_meta = meta.loc[train_index]
+    holdout_Mv = Mv.loc[test_index]
+    holdout_meta = meta.loc[test_index]
+
+    print(f"\ttrain and validation shapes: {rest_Mv.shape, holdout_Mv.shape}")
+
+    fold_Mvs[fold] = [rest_Mv, rest_meta, holdout_Mv, holdout_meta]
+
+# Double check for leakage 
 print('\n...print if any overlap GSE between training and validation (should be none)...')
 for fold, fold_data in fold_Mvs.items():
     print(f"fold{fold}: {set(fold_data[1]['Dataset']).intersection(fold_data[3]['Dataset'])}")
 
+# Save data as dill for faster loading
+with open(f'{PREPROCESSED_PATH}/training.dill', 'wb') as f:
+    dill.dump([Mv, meta], f)
 with open(f'{PREPROCESSED_PATH}/training_folds.dill', 'wb') as f:
     dill.dump(fold_Mvs, f)
diff --git a/src/3-differential-methylation-prediction.py b/src/3-differential-methylation-prediction.py
@@ -32,12 +32,6 @@
 random.seed(RANDOM_SEED)
 np.random.seed(RANDOM_SEED)
 
-def load_data():
-    """Load methylation data."""
-    Mv_location = f"{PREPROCESSED_PATH}/training.dill"
-    logger.info(f"Loading Mv and meta from {Mv_location}")
-    return dill.load(open(Mv_location, 'rb'))
-
 def load_fold_data():
     """Load cross-validation fold data."""
     return dill.load(open(f'{PREPROCESSED_PATH}/training_folds.dill', 'rb'))
@@ -81,8 +75,6 @@ def batch_correlation(holdout_Mv, rest_Mv, rest_meta, probe_per_tissue):
 
 def main():
     # Load data
-    Mv, meta = load_data()
-    logger.info(f"{Mv.shape}, {meta.shape}")
     fold_Mvs = load_fold_data()
 
     # Store results

diff --git a/src/3-differential-methylation.py b/src/3-differential-methylation.py
@@ -31,12 +31,6 @@
 random.seed(RANDOM_SEED)
 np.random.seed(RANDOM_SEED)
 
-def load_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
-    """Load main methylation data."""
-    Mv_location = f"{PREPROCESSED_PATH}/training.dill"
-    print(f"loading Mv, meta from {Mv_location}")
-    return dill.load(open(Mv_location, 'rb'))
-
 def load_fold_data() -> Dict:
     """Load cross-validation fold data."""
     try:

diff --git a/src/4-final-model.py b/src/4-final-model.py
@@ -24,8 +24,6 @@
 DATA_PATH = f'./../data/GEO'
 # Path to easy load the preprocessed data
 PREPROCESSED_PATH = f'{DATA_PATH}/preprocessed'
-# Path to load/save the differential methylation results
-DIFFMETH_PATH  = f"{DATA_PATH}/diffmeth"
 # Path to load/save the minipatch results
 MINIPATCH_PATH  = f"{DATA_PATH}/minipatch"