Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ _*
_*/**
/data
/raw
/download
figures**/**
download/**

6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ training and evaluation can be found on [HuggingFace](https://huggingface.co/dat

## Citation

> Ontology-aware DNA methylation classification with a curated atlas of human tissues and cell types.
Kim M, Dannenfelser R, Cui Y, Allen G, and Yao V. (2025) BioRxiv.
> Ontology-aware DNA methylation classification with a curated atlas of human tissues and cell types.
Kim M, Dannenfelser R, Cui Y, Allen G, and Yao V. (2025) bioRxiv. https://doi.org/10.1101/2025.04.18.649618

## Getting started

Expand Down Expand Up @@ -36,7 +36,7 @@ downstream analysis and classification scripts.
cd methylation-classification
mkdir download
cd download
huggingface-cli download ylab/methyl-classification
huggingface-cli download ylab/methyl-classification --repo-type dataset --local-dir .
```

To run the feature selection part of our pipeline we require
Expand Down
Binary file added annotation/cv_split.dill
Binary file not shown.
84 changes: 60 additions & 24 deletions src/1-stratify-into-folds.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
"""
1-stratify-into-folds.py
This script sets up the data for analysis.
- input:
- parquet files of preprocessed M-values and metadata
- output:
- pickled files of [training M-values, training metadata]
- pickled files of [full ontology, training ontology]

** To replicate published results, use split indices in ./../annotation/cv_splits.dill
"""
import dill
import os
from sklearn.model_selection import StratifiedGroupKFold
Expand All @@ -7,42 +18,41 @@
import pandas as pd
import networkx as nx

# Path to downloaded parquet files from huggingface
DOWNLOAD_PATH = './../download'
# Path to load/save the annotations/ontologies for easy load
ANNOTATION_PATH = './../annotation'
# Path to save the processed data for easy load
PREPROCESSED_PATH = './../data/GEO/preprocessed'
random.seed(9)

### Relative paths ###
DOWNLOAD_PATH = './../download' # Path to downloaded parquet files from huggingface
ANNOTATION_PATH = './../annotation' # Path to load/save the annotations/ontologies for easy load
PREPROCESSED_PATH = './../data/GEO/preprocessed' # Path to save the processed data for easy load
if not os.path.exists(PREPROCESSED_PATH):
os.makedirs(PREPROCESSED_PATH, exist_ok=True)

### Ontologies ###
# Load ontologies using edgelist and save as dill
full_ontology = nx.read_edgelist(f'{ANNOTATION_PATH}/full_ontology.edgelist', create_using=nx.DiGraph)
training_ontology = nx.read_edgelist(f'{ANNOTATION_PATH}/training_ontology.edgelist', create_using=nx.DiGraph)

with open(f'{ANNOTATION_PATH}/ontologies.dill', 'wb') as f:
dill.dump([full_ontology, training_ontology], f)

### Training data into cross-validation splits ###
# Load the data using pyarrow and save as dill for faster reading
files = {
"Mv": f"{DOWNLOAD_PATH}/training_mvalues.parquet",
"meta": f"{DOWNLOAD_PATH}/training_meta.parquet",
"Mv": f"{DOWNLOAD_PATH}/train.parquet",
"meta": f"{DOWNLOAD_PATH}/metadata.parquet",
}
Mv, meta = [pq.read_table(path).to_pandas() for path in files.values()]
Mv = Mv.set_index('Sample') if 'Sample' in Mv.columns else Mv
meta = meta.set_index('Sample') if 'Sample' in meta.columns else meta
meta = meta.loc[Mv.index]

random.seed(9)
# Random shuffle while being grouped by Dataset
groups = meta['Dataset'].unique()
random.shuffle(groups)

meta = meta.reset_index().rename(columns={'index':'Sample'}).set_index("Dataset").loc[groups].reset_index().set_index("Sample")
Mv = Mv.T.loc[meta.index]

Mv = Mv.loc[meta.index]
print(Mv.shape, meta.shape)

with open(f'{PREPROCESSED_PATH}/training.dill', 'wb') as f:
dill.dump([Mv, meta], f)


# Load ontologies using edgelist and save as dill
full_ontology = nx.read_edgelist(f'{DOWNLOAD_PATH}/full_ontology.edgelist', create_using=nx.DiGraph)
training_ontology = nx.read_edgelist(f'{DOWNLOAD_PATH}/training_ontology.edgelist', create_using=nx.DiGraph)

with open(f'{ANNOTATION_PATH}/ontologies.dill', 'wb') as f:
dill.dump([full_ontology, training_ontology], f)


# Stratify into folds grouped by GSE
sgkf = StratifiedGroupKFold(n_splits=3, shuffle=False, random_state=None)
Expand All @@ -51,7 +61,7 @@
fold_Mvs = dict()
fold_selectors = dict()

for i, (train_index, test_index) in enumerate(sgkf.split(Mv, meta['training.ID'], meta['Dataset'])):
for i, (train_index, test_index) in enumerate(sgkf.split(X=Mv, y=meta['training.ID'], groups=meta['Dataset'])):
print(f"fold {i}:")
print(f"\ttrain: len={len(train_index)}, groups={len(meta['Dataset'][train_index].unique())}")
print(f"\tvalidation: len={len(test_index)}, groups={len(meta['Dataset'][test_index].unique())}")
Expand All @@ -64,10 +74,36 @@
print(f"\ttrain and validation shapes: {rest_Mv.shape, holdout_Mv.shape}")

fold_Mvs[i] = [rest_Mv, rest_meta, holdout_Mv, holdout_meta]

# OR replicate using cv_split.dill
print(f"\nusing cv_splits...")
cv_split = dill.load(open(f"{ANNOTATION_PATH}/cv_split.dill", "rb"))
Mv = Mv.loc[cv_split['all']]
meta = meta.loc[Mv.index]
for fold in range(3):
train_index = cv_split[fold]['train']
test_index = cv_split[fold]['validation']

print(f"fold {fold}:")
print(f"\ttrain: len={len(train_index)}, groups={len(meta.loc[train_index]['Dataset'].unique())}")
print(f"\tvalidation: len={len(test_index)}, groups={len(meta.loc[test_index]['Dataset'].unique())}")

rest_Mv = Mv.loc[train_index]
rest_meta = meta.loc[train_index]
holdout_Mv = Mv.loc[test_index]
holdout_meta = meta.loc[test_index]

print(f"\ttrain and validation shapes: {rest_Mv.shape, holdout_Mv.shape}")

fold_Mvs[fold] = [rest_Mv, rest_meta, holdout_Mv, holdout_meta]

# Double check for leakage
print('\n...print if any overlap GSE between training and validation (should be none)...')
for fold, fold_data in fold_Mvs.items():
print(f"fold{fold}: {set(fold_data[1]['Dataset']).intersection(fold_data[3]['Dataset'])}")

# Save data as dill for faster loading
with open(f'{PREPROCESSED_PATH}/training.dill', 'wb') as f:
dill.dump([Mv, meta], f)
with open(f'{PREPROCESSED_PATH}/training_folds.dill', 'wb') as f:
dill.dump(fold_Mvs, f)
8 changes: 0 additions & 8 deletions src/3-differential-methylation-prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,6 @@
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def load_data():
"""Load methylation data."""
Mv_location = f"{PREPROCESSED_PATH}/training.dill"
logger.info(f"Loading Mv and meta from {Mv_location}")
return dill.load(open(Mv_location, 'rb'))

def load_fold_data():
"""Load cross-validation fold data."""
return dill.load(open(f'{PREPROCESSED_PATH}/training_folds.dill', 'rb'))
Expand Down Expand Up @@ -81,8 +75,6 @@ def batch_correlation(holdout_Mv, rest_Mv, rest_meta, probe_per_tissue):

def main():
# Load data
Mv, meta = load_data()
logger.info(f"{Mv.shape}, {meta.shape}")
fold_Mvs = load_fold_data()

# Store results
Expand Down
6 changes: 0 additions & 6 deletions src/3-differential-methylation.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,6 @@
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def load_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
"""Load main methylation data."""
Mv_location = f"{PREPROCESSED_PATH}/training.dill"
print(f"loading Mv, meta from {Mv_location}")
return dill.load(open(Mv_location, 'rb'))

def load_fold_data() -> Dict:
"""Load cross-validation fold data."""
try:
Expand Down
2 changes: 0 additions & 2 deletions src/4-final-model.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
DATA_PATH = f'./../data/GEO'
# Path to easy load the preprocessed data
PREPROCESSED_PATH = f'{DATA_PATH}/preprocessed'
# Path to load/save the differential methylation results
DIFFMETH_PATH = f"{DATA_PATH}/diffmeth"
# Path to load/save the minipatch results
MINIPATCH_PATH = f"{DATA_PATH}/minipatch"

Expand Down
Loading