diff --git a/README.md b/README.md index d11963b..422c24c 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ Although clinical applications represent the next challenge in single-cell genom ![plot](./img/plot.png) -Current version for PILOT is 2.0.6 +Current version for PILOT is 2.0.7 ## Installation The easiest way to install PILOT and the required packages is using the following way: @@ -15,10 +15,9 @@ The easiest way to install PILOT and the required packages is using the followin ```terminal git clone https://github.com/CostaLab/PILOT.git cd PILOT -conda create --name PILOT python=3.11.5 r-base +conda create --name PILOT python=3.14.3 conda activate PILOT pip install . -conda install -c conda-forge rpy2 ``` Once you've completed these steps, you can proceed to run the tutorials and explore the features of PILOT. When doing so, remember to move to the tutorial folder, as all the work will be performed there: @@ -52,4 +51,3 @@ You can access the used data sets by PILOT in Part 1 [![DOI](https://zenodo.org/ - diff --git a/pilotpy/plot/ploting.py b/pilotpy/plot/ploting.py index e3885df..c3668d9 100644 --- a/pilotpy/plot/ploting.py +++ b/pilotpy/plot/ploting.py @@ -1,45 +1,59 @@ -import pandas as pd -import numpy as np -import anndata -import scanpy as sc import os -import seaborn as sns -import scipy -from matplotlib import pyplot as plt +import warnings +from pathlib import Path + +# Data Science +import numpy as np +import pandas as pd from scipy.stats import ttest_ind +from sklearn import metrics from statsmodels.stats.multitest import multipletests -from adjustText import adjust_text -from matplotlib.lines import Line2D + +# Single-Cell/Bio +import anndata +import elpigraph +import scanpy as sc from gprofiler import GProfiler -import pydiffmap -from sklearn import metrics from pydiffmap import diffusion_map -from sklearn.neighbors import NearestCentroid -from sklearn.metrics.cluster import rand_score -from scipy.spatial import distance -from sknetwork.clustering import Louvain -from sklearn.preprocessing import label_binarize -import time -import sys -from genericpath import isfile -from matplotlib.image import imread -import warnings -import ot -from logging import info, warn + +# Plotting/UI +import seaborn as sns +import textwrap as tw +from adjustText import adjust_text from cycler import cycler -warnings.filterwarnings('ignore') -import elpigraph -from scipy.stats import ttest_ind +from matplotlib import pyplot as plt from matplotlib.font_manager import FontProperties +from matplotlib.image import imread +from matplotlib.lines import Line2D + from ..tools.Gene_cluster_specific_functions import * +# Configurations +warnings.filterwarnings('ignore') -def trajectory(adata,n_evecs = 2, epsilon =1, alpha = 0.5,knn= 64, sample_col=1, clusters = 'status',label_act = False,colors=['#377eb8','#ff7f00','#e41a1c'],location_labels='center', figsize=(12,12),font_size=24,axes_line_width=1,axes_color='black',facecolor='white',point_size=100,cmap='viridis',fontsize_legend=24,alpha_trans=1,plot_titel = "Trajectory of the disease progression"): - - - - + + +def trajectory(adata, + n_evecs = 2, + epsilon =1, + alpha = 0.5, + knn= 64, + sample_col=1, + clusters = 'status', + label_act = False, + colors=['#377eb8','#ff7f00','#e41a1c'], + location_labels='center', + figsize=(12,12), + font_size=24, + axes_line_width=1, + axes_color='black', + facecolor='white', + point_size=100, + cmap='viridis', + fontsize_legend=24, + alpha_trans=1, + plot_titel = "Trajectory of the disease progression"): """ Find trajectories using Diffusion Maps and visualize the trajectory plot. @@ -95,18 +109,17 @@ def trajectory(adata,n_evecs = 2, epsilon =1, alpha = 0.5,knn= 64, sample_col=1, EMD=adata.uns['EMD']/adata.uns['EMD'].max() df=adata.uns['annot'] - path='Results_PILOT/plots' - - if not os.path.exists(path): - os.makedirs(path) - + plot_path=Path('Results_PILOT/plots') + plot_path.mkdir(parents=True, exist_ok=True) custom_cycler = (cycler(color=colors)) - plot_titel = plot_titel + plot_title = plot_titel - - mydmap = diffusion_map.DiffusionMap.from_sklearn(n_evecs = n_evecs, epsilon =epsilon, alpha = alpha, k=knn) + mydmap = diffusion_map.DiffusionMap.from_sklearn(n_evecs = n_evecs, + epsilon =epsilon, + alpha = alpha, + k=knn) embedding = mydmap.fit_transform(EMD) plt.rcParams.update({'font.size': font_size}) @@ -134,16 +147,15 @@ def trajectory(adata,n_evecs = 2, epsilon =1, alpha = 0.5,knn= 64, sample_col=1, k=k+1 ax.legend(loc=location_labels, fontsize=fontsize_legend) - plt.title(plot_titel) - plt.savefig(path+"/"+plot_titel+'.pdf') + plt.title(plot_title) + plt.savefig(plot_path / f"{plot_title}.pdf") plt.show() plt.close(fig) adata.uns['embedding']=embedding - - + def heatmaps(adata,figsize=(12,12),col_cluster=True,row_cluster=True,cmap='Blues_r',font_scale=2): """ @@ -173,16 +185,16 @@ def heatmaps(adata,figsize=(12,12),col_cluster=True,row_cluster=True,cmap='Blues annot=adata.uns['annot'] cost=adata.uns['cost'] - path='Results_PILOT/plots' + plot_path=Path('Results_PILOT/plots') - if not os.path.exists(path): - os.makedirs(path) + plot_path.mkdir(parents=True, exist_ok=True) + fig = plt.figure() # sns.set(font_scale=font_scale) sns.clustermap(cost[annot.cell_type.unique()],cmap=cmap,figsize=figsize,col_cluster=col_cluster,row_cluster=row_cluster); plt.title('Cost Matrix',loc='center') - plt.savefig(path+'/Cost_matrix.pdf') + plt.savefig(plot_path / 'Cost matrix.pdf') plt.close(fig) fig = plt.figure() @@ -190,11 +202,10 @@ def heatmaps(adata,figsize=(12,12),col_cluster=True,row_cluster=True,cmap='Blues emd=adata.uns['EMD_df'] sns.clustermap(emd,cmap=cmap,figsize=figsize,col_cluster=col_cluster,row_cluster=row_cluster) plt.title('Wasserstein distance',loc='center') - plt.savefig(path+'/Wasserstein distance.pdf') + plt.savefig(plot_path / 'Wasserstein distance.pdf') plt.close(fig) - def heatmaps_df(df, figsize=(12, 12), col_cluster=True, row_cluster=True, cmap='Blues_r'): """ Plot heatmaps of cost matrix and Wasserstein distances. @@ -218,12 +229,15 @@ def heatmaps_df(df, figsize=(12, 12), col_cluster=True, row_cluster=True, cmap=' Plots and saves heatmaps based on the input DataFrame. """ - path='Results_PILOT/plots' - sns.clustermap(df, - row_cluster=row_cluster,col_cluster=col_cluster,annot=False,cmap=cmap,figsize=figsize,xticklabels=True); - plt.savefig(path+"/"+'Proportions_of_cell_types_for_samples_over_trajectory.pdf') - - + plot_path=Path('Results_PILOT/plots') + sns.clustermap(df, + row_cluster=row_cluster, + col_cluster=col_cluster, + annot=False, + cmap=cmap, + figsize=figsize, + xticklabels=True); + plt.savefig(plot_path / 'Proportions of cell types for samples over trajectory.pdf') def fit_pricipla_graph(adata,NumNodes=20,source_node=0,show_text=True,Do_PCA=False,figsize=(12,12),X_color='r', Node_color='k', DimToPlot=[0, 1],facecolor='white',title='Principal graph'): @@ -262,11 +276,9 @@ def fit_pricipla_graph(adata,NumNodes=20,source_node=0,show_text=True,Do_PCA=Fal Fits an Elastic Principal Graph, plots it, and extracts pseudotime information. """ - path='Results_PILOT/plots' - - if not os.path.exists(path): - os.makedirs(path) - + plot_path=Path('Results_PILOT/plots' ) + plot_path.mkdir(parents=True, exist_ok=True) + emb=adata.uns['embedding'] pg_curve = elpigraph.computeElasticPrincipalTree(emb,NumNodes=NumNodes)[0] fig = plt.figure(figsize=figsize) @@ -274,7 +286,7 @@ def fit_pricipla_graph(adata,NumNodes=20,source_node=0,show_text=True,Do_PCA=Fal ax.set(facecolor = facecolor) elpigraph.plot.PlotPG(emb,pg_curve,Do_PCA=Do_PCA,show_text=show_text,DimToPlot=DimToPlot,Node_color=Node_color,X_color=X_color) plt.title(title) - plt.savefig(path+"/"+'Principal graph'+'.pdf') + plt.savefig(plot_path / 'Principal graph.pdf') plt.show() plt.close(fig) elpigraph.utils.getPseudotime(emb,pg_curve,source=source_node,target=None) @@ -282,29 +294,42 @@ def fit_pricipla_graph(adata,NumNodes=20,source_node=0,show_text=True,Do_PCA=Fal adata.uns['pseudotime']=pseudotime - - def clustering_emd(adata,res=0.3,metric='cosine',groupby_col='Leiden',swap_axes=False,cmap="Blues_r",dendrogram=True,show_gene_labels=True,var_group_rotation=45,figsize=[12,12],save=False,sorter_leiden=None): """ Perform clustering and visualization of EMD (Earth Mover's Distance) data in AnnData object. - Parameters: - adata (AnnData): Input AnnData object containing EMD data. - res (float): Resolution parameter for Leiden clustering. Default is 0.3. - metric (str): Distance metric for clustering. Default is 'cosine'. - groupby_col (str): Grouping variable for plotting. 'Leiden' groups by predicted clusters, 'status' groups by real labels. Default is 'Leiden'. - swap_axes (bool): Swap the axes in the heatmap. Default is False. - cmap (str): Colormap for the heatmap. Default is "Blues_r". - dendrogram (bool): Display dendrograms in the heatmap. Default is True. - show_gene_labels (bool): Show gene labels in the heatmap. Default is True. - var_group_rotation (int): Rotation angle for gene labels. Default is 45 degrees. - figsize (list): Size of the heatmap figure. Default is [12, 12]. - save (bool): Save the heatmap figure. Default is False. - sorter_leiden (list or None): Custom order for Leiden clusters. If not provided, the default order is used. + Parameters + ---------- + adata : AnnData + Input AnnData object containing EMD data. + res : float, optional + Resolution parameter for Leiden clustering. Default is 0.3. + metric : str, optional + Distance metric for clustering. Default is 'cosine'. + groupby_col : str, optional + Grouping variable for plotting. 'Leiden' groups by predicted clusters, 'status' groups by real labels. Default is 'Leiden'. + swap_axes : bool, optional + Swap the axes in the heatmap. Default is False. + cmap : str, optional + Colormap for the heatmap. Default is "Blues_r". + dendrogram : bool, optional + Display dendrograms in the heatmap. Default is True. + show_gene_labels : bool, optional + Show gene labels in the heatmap. Default is True. + var_group_rotation : int, optional + Rotation angle for gene labels. Default is 45 degrees. + figsize : list, optional + Size of the heatmap figure. Default is [12, 12]. + save : bool, optional + Save the heatmap figure. Default is False. + sorter_leiden : list or None, optional + Custom order for Leiden clusters. If not provided, the default order is used. - Returns: - proportion_df (DataFrame): DataFrame containing proportions of sub-clusters in each sample. + Returns + ------- + proportion_df : pd.DataFrame + DataFrame containing proportions of sub-clusters in each sample. """ EMD=adata.uns['EMD'] @@ -354,10 +379,6 @@ def clustering_emd(adata,res=0.3,metric='cosine',groupby_col='Leiden',swap_axes= return proportion_df - - - - def Sil_computing(EMD, real_labels, metric='cosine'): """ Compute the Silhouette score based on Wasserstein distances. @@ -410,11 +431,10 @@ def select_best_sil(adata,resolutions=[],marker='o',figsize=(10,10),facecolor="w best_res=start best_sil=0 if path==None: - if not os.path.exists('Results_PILOT/plots'): - os.makedirs('Results_PILOT/plots') - path_to_results='Results_PILOT/plots' + result_path=Path('Results_PILOT/plots') + result_path.mkdir(parents=True, exist_ok=True) else: - path_to_results=path + result_path=Path(path) sil_scores = [] number_of_clusters=[] EMD=adata.uns['EMD'] @@ -444,7 +464,7 @@ def select_best_sil(adata,resolutions=[],marker='o',figsize=(10,10),facecolor="w plt.xlabel('Resolution') plt.ylabel('Silhouette Score') plt.title('Silhouette Score vs. Resolution') - plt.savefig(path_to_results+'/silhouette_score_vs_resolution.pdf') + plt.savefig(result_path / 'Silhouette score VS resolution.pdf') plt.show() @@ -454,9 +474,10 @@ def select_best_sil(adata,resolutions=[],marker='o',figsize=(10,10),facecolor="w plt.xlabel('Resolution') plt.ylabel('Number of Clusters') plt.title('Number of Clusters vs. Resolution') - plt.savefig(path_to_results+'/number_of_clusters_vs_resolution.pdf') + plt.savefig(result_path / 'Number of clusters VS resolution.pdf') plt.show() + def cell_type_diff_two_sub_patient_groups(proportions: pd.DataFrame = None, cell_types: list = None, labels:str = 'Predicted_Labels', @@ -497,11 +518,10 @@ def cell_type_diff_two_sub_patient_groups(proportions: pd.DataFrame = None, """ if file_path==None: - if not os.path.exists('Results_PILOT/plots'): - os.makedirs('Results_PILOT/plots') - file_path='Results_PILOT/plots' + plot_path=Path('Results_PILOT/plots') + plot_path.mkdir(parents=True, exist_ok=True) else: - file_path=file_path + plot_path=Path(file_path) @@ -535,10 +555,9 @@ def cell_type_diff_two_sub_patient_groups(proportions: pd.DataFrame = None, ## as positive score shows cell types statistically significant on the first group ## and negative scrore shows cell types statistically significant on the second group - - if not os.path.exists('Results_PILOT/Diff_Expressions_Results'): - os.makedirs('Results_PILOT/Diff_Expressions_Results') - stats_bar_group12.to_csv('Results_PILOT/Diff_Expressions_Results' + "/Cell_type_diff_" + group1 + "_vs_" + group2 +".csv", + diff_result_path = Path('Results_PILOT/Diff_Expressions_Results') + diff_result_path.mkdir(parents=True, exist_ok=True) + stats_bar_group12.to_csv(diff_result_path / f"Cell_type_diff_{group1}_vs_{group2}.csv", header = True, index = None) # filter data based on a p-value threshold @@ -549,18 +568,20 @@ def cell_type_diff_two_sub_patient_groups(proportions: pd.DataFrame = None, plot_hor_vs_vert(stats_bar_group12, 1, x = 'score', y = 'cell_type', c = 'type', xlabel = 'statistic score', ylabel = None, rotation = None, tick_bottom = True, tick_left = False, - title = "Cell type rank " + group1 + " vs " + group2,fontsize=fontsize) + title = f"Cell type rank {group1} vs {group2}",fontsize=fontsize) fig.tight_layout() - plt.savefig(file_path + "/Cell_type_diff_" + group1 + "_vs_" + group2 +".pdf", + plt.savefig(plot_path / f"Cell type diff {group1} VS {group2}.pdf", facecolor = 'white') - - + def plot_cell_types_distributions(proportions: pd.DataFrame = None, cell_types: list = None, labels:str = 'Predicted_Labels', file_path: str = None, - figsize: tuple = (15, 7),label_order=None,label_colors=None,fontsize=24,rotation = 45): + figsize: tuple = (15, 7), + label_order=None, + label_colors=None, + fontsize=24,rotation = 45): """ @@ -588,11 +609,10 @@ def plot_cell_types_distributions(proportions: pd.DataFrame = None, """ if file_path==None: - if not os.path.exists('Results_PILOT/plots'): - os.makedirs('Results_PILOT/plots') - file_path='Results_PILOT/plots' + plot_path=Path('Results_PILOT/plots') + plot_path.mkdir(parents=True, exist_ok=True) else: - file_path=file_path + plot_path=file_path col_names = list(cell_types) col_names.append(labels) @@ -612,115 +632,153 @@ def plot_cell_types_distributions(proportions: pd.DataFrame = None, plt.xticks(fontsize = fontsize, rotation = rotation, ha = 'right', rotation_mode = 'anchor') plt.legend(fontsize = fontsize) fig.tight_layout() - plt.savefig(file_path + "/Cell_types_distributions.pdf", + plt.savefig(plot_path / "Cell types - distributions.pdf", facecolor = 'white') plt.show() - - -def gene_annotation_cell_type_subgroup(cell_type: str = None, - group: str = None, - source: str = None, - num_gos: int = 15, - figsize=(12,12), - font_size: int = 24, - bbox_inches: str = 'tight', - facecolor: str = 'white', - transparent: bool = False, - organism: str = 'hsapiens', - dpi: int = 100, - s: int = 200, - color: str = 'tab:blue'): +def gene_annotation_cell_type_subgroup( + data: pd.DataFrame = None, + cell_type: str = "Unknown", + group: str = None, + symbol: str = 'gene', + sig_col: str = 'significant_gene', + sources: list = None, + num_gos: int = 10, + figsize: tuple = (6, 4), + font_size: int = 12, + max_length: int = 50, + path_to_results: str = 'Results_PILOT', + my_pal: dict = None, + save_plot: bool = True, + organism: str = 'hsapiens', + marker_size: int = 300 +): """ - Perform Gene Ontology (GO) enrichment analysis and create a scatterplot of enriched terms. + Performs GO enrichment analysis for a specific cell-type subgroup and + visualizes the top enriched terms. - Parameters: + Parameters ---------- + data : pandas.DataFrame, optional + A dataframe containing gene significance labels. If None, the function + attempts to load genes from a CSV file based on the `path_to_results`. cell_type : str, optional - Specify cell type name to check its differential expression genes. The default is None. + The specific cell type being analyzed. Used for path naming and + plot titles. Default is "Unknown". group : str, optional - Name of patients sub-group of interest. The default is None. - source : str, optional - Specify the source of GO terms. The default is None. - num_gos: int, optional - Number of GO terms to plot. Default is 5. - figsize: tuple, optional - figsize. Default is (12,12). - font_size: int, optional - Font size for labels. Default is 24. - bbox_inches: str, optional - Bounding box for saving the plot. Default is 'tight'. - facecolor: str, optional - Background color of the figure. Default is 'white'. - transparent: bool, optional - Set to True for a transparent figure. Default is False. - organism: str, optional - The organism for GO analysis. Default is 'hsapiens'. - dpi: int, optional - Dots per inch for the saved plot image. Default is 100. - s: int, optional - Marker size for the scatterplot. Default is 200. - color: str, optional - Color of the scatterplot markers. Default is 'tab:blue'. + The specific group label (e.g., "Upregulated") to filter for in + `sig_col`. + symbol : str, optional + The column name in `data` containing gene symbols. Default is 'gene'. + sig_col : str, optional + The column name in `data` that defines the significance group. + Default is 'significant_gene'. + sources : list, optional + The g:Profiler data sources to query (e.g., ["GO:BP", "KEGG"]). + If None, queries all default sources. + num_gos : int, optional + The maximum number of top GO terms to display in the plot. Default is 10. + figsize : tuple, optional + The dimensions of the output plot. Default is (6, 4). + font_size : int, optional + The base font size for plot text elements. Default is 12. + max_length : int, optional + The character limit for wrapping GO term names on the y-axis. + Default is 50. + path_to_results : str, optional + The root directory for reading data and saving results. + Default is 'Results_PILOT'. + my_pal : dict, optional + A dictionary mapping group names to specific colors. + Default is None (defaults to 'tab:blue'). + save_plot : bool, optional + Whether to save the plot as a PDF and the full results as a CSV. + Default is True. + organism : str, optional + The organism ID for g:Profiler (e.g., 'hsapiens', 'mmusculus'). + Default is 'hsapiens'. + marker_size : int, optional + The size of the dots in the scatter plot. Default is 300. - Returns: - -------- - None - Saves the scatterplot of enriched GO terms as a PDF file. + Returns + ------- + None or str + Displays a plot and saves files if `save_plot` is True. + Returns an error string if no data is found or enrichment fails. """ - - path_to_results='Results_PILOT' - group_genes = pd.read_csv(path_to_results +'/Diff_Expressions_Results/'+cell_type+"/significant_genes_"+cell_type+"_"+group+".csv", - index_col=0) + base_path = Path(path_to_results) + go_path = base_path / 'Diff_Expressions_Results' / cell_type / 'GO_analysis' - - gp = GProfiler(return_dataframe=True) - if list(group_genes['0'].values): - gprofiler_results = gp.profile(organism = organism, - query = list(group_genes['0'].values)) + # Plotting - color palette + if my_pal and group in my_pal: + plot_color = my_pal[group] else: - return "Genes list is empty!" - - - if(gprofiler_results.shape[0] == 0): - return "Not enough information!" + plot_color = 'tab:blue' - - if(gprofiler_results.shape[0] < num_gos): - num_gos = gprofiler_results.shape[0] - - if source: - gprofiler_results = gprofiler_results[gprofiler_results['source']==source] - - - - selected_gps = gprofiler_results.head(num_gos)[['name', 'p_value']] - - selected_gps['nlog10'] = -np.log10(selected_gps['p_value'].values) + # Get query genes + if data is not None: + query_genes = data.loc[data[sig_col] == group, symbol].dropna().unique().tolist() + else: + # Load from CSV if no DataFrame provided + file_path = base_path / 'Diff_Expressions_Results' / cell_type / f"Significant_genes_{cell_type}_{group}.csv" + if not file_path.exists(): + return f"Error: No data provided and file not found at {file_path}" + + df_load = pd.read_csv(file_path, index_col=0) + query_genes = df_load['0'].dropna().tolist() if '0' in df_load.columns else df_load.index.dropna().tolist() - plt.figure(figsize = figsize, dpi = dpi) - plt.style.use('default') - sns.scatterplot(data = selected_gps, x= "nlog10", y= "name", s = s, color = color) + if not query_genes: + return f"No significant genes found for group: {group}" - plt.title('GO enrichment in ' + cell_type + ' associated with ' + group, fontsize = font_size) + # GProfiler Analysis + gp = GProfiler(return_dataframe=True) + gprofiler_results = gp.profile( + organism=organism, + query=query_genes, + no_evidences=False, + sources=sources + ) + + if gprofiler_results is None or gprofiler_results.empty: + return "Not enough GO information found for these genes." + + # Data preparation for plotting + num_gos = min(num_gos, gprofiler_results.shape[0]) + selected_gps = gprofiler_results.head(num_gos).copy() + selected_gps['nlog10'] = -np.log10(selected_gps['p_value'].values) - plt.xticks(size = font_size) - plt.yticks(size = font_size) + # Wrap labels for better fit + selected_gps['name'] = selected_gps['name'].apply(lambda x: "\n".join(tw.wrap(x, max_length))) - plt.ylabel("GO Terms", size = font_size) - plt.xlabel("-$log_{10}$ (P-value)", size = font_size) - - if not os.path.exists(path_to_results+'/Diff_Expressions_Results/'+cell_type+'/GO_analysis/'): - os.makedirs(path_to_results+'/Diff_Expressions_Results/'+cell_type+'/GO_analysis/') - path_to_results=path_to_results+'/Diff_Expressions_Results/'+cell_type+'/GO_analysis/' - plt.savefig(path_to_results+ group + ".pdf", bbox_inches = bbox_inches, facecolor=facecolor, transparent=transparent) - gprofiler_results.to_csv(path_to_results+group+'_'+cell_type+"_all_gprofiler_results.csv") + # Plotting + plt.figure(figsize=figsize, dpi=100) + plt.style.use('default') + sns.scatterplot(data=selected_gps, x="nlog10", y="name", s=marker_size, color=plot_color) + plt.title(f'GO enrichment in {cell_type} ({group})\n(n={len(query_genes)} genes)', fontsize=font_size + 2) + plt.xticks(size=font_size) + plt.yticks(size=font_size) + plt.ylabel("GO Terms", size=font_size) + plt.xlabel("$-log_{10}$ (P-value)", size=font_size) + + # Saving + if save_plot: + go_path.mkdir(parents=True, exist_ok=True) + plt.savefig(go_path / f"{group}_GO_plot.pdf", bbox_inches='tight', facecolor='white') + gprofiler_results.to_csv(go_path / f"{group}_full_GO_results.csv") + + plt.show() -def exploring_specific_genes(cluster_name='cell_type',font_size=24,gene_list=[],fig_size=(64, 56),p_value=0.01,create_new_plot_folder=True,fc_ther=0.5): +def exploring_specific_genes(cluster_name='cell_type', + font_size=24, + gene_list=[], + fig_size=(64, 56), + p_value=0.01, + create_new_plot_folder=True, + fc_ther=0.5): """ Explore specific genes within a cluster to analyze their patterns in comparison to other cell types. @@ -746,14 +804,13 @@ def exploring_specific_genes(cluster_name='cell_type',font_size=24,gene_list=[], Returns: Show the genes for the interested cell types """ - path='Results_PILOT/' - file_name = "/Whole_expressions.csv" - cluster_names = [os.path.splitext(f)[0] for f in listdir(path + '/cells/') \ - if isfile(join(path + '/cells/', f))] + result_path=Path('Results_PILOT') + file_name = "Whole_expressions.csv" + cluster_names = [f.stem for f in (result_path / 'cells').iterdir() if f.is_file()] - all_stats_extend = pd.read_csv(path + "/gene_clusters_stats_extend.csv", sep = ",") + all_stats_extend = pd.read_csv(result_path / "gene_clusters_stats_extend.csv", sep = ",") - with open(path + '/genes_dictionary.pkl', 'rb') as handle: + with open(result_path / 'genes_dictionary.pkl', 'rb') as handle: gene_dict = pickle.load(handle) pline = np.linspace(1, 20, 20) @@ -761,15 +818,14 @@ def exploring_specific_genes(cluster_name='cell_type',font_size=24,gene_list=[], filtered_all_stats_extend=filtered_all_stats_extend[filtered_all_stats_extend['cluster'].isin([cluster_name])] - plot_stats_by_pattern(cluster_names, filtered_all_stats_extend, gene_dict, pline, path, file_name,font_size=font_size,p_value=p_value,create_new_plot_folder=create_new_plot_folder,fc_ther=fc_ther) + plot_stats_by_pattern(cluster_names, filtered_all_stats_extend, gene_dict, pline, result_path, file_name,font_size=font_size,p_value=p_value,create_new_plot_folder=create_new_plot_folder,fc_ther=fc_ther) # Load the PNG image file if create_new_plot_folder: - image_path =path+'/plot_genes_for_'+str(cluster_name)+'/'+str(cluster_name) + ".png" # Replace with the actual path to your PNG image + image_path = result_path+'/plot_genes_for_'+str(cluster_name)+'/'+str(cluster_name) + ".png" # Replace with the actual path to your PNG image image = imread(image_path) - else: - - image_path =path+'plots_gene_cluster_differentiation/'+cluster_name+'.png' # Replace with the actual path to your PNG image + else: + image_path =result_path+'plots_gene_cluster_differentiation/'+cluster_name+'.png' # Replace with the actual path to your PNG image image = imread(image_path) # Set the size of the figure @@ -780,10 +836,20 @@ def exploring_specific_genes(cluster_name='cell_type',font_size=24,gene_list=[], ax.axis('off') # Turn off axis labels and ticks plt.show() - - - -def go_enrichment(df,num_gos=20,source=None,cell_type='cell_type',fontsize=32,s=200, figsize = (15,12),color = 'tab:blue',dpi=100,bbox_inches = 'tight', facecolor='white', transparent=False,organism='hsapiens'): + +def go_enrichment(df, + num_gos=20, + source=None, + cell_type='cell_type', + fontsize=32, + s=200, + figsize = (15,12), + color = 'tab:blue', + dpi=100, + bbox_inches = 'tight', + facecolor='white', + transparent=False, + organism='hsapiens'): """ Perform Gene Ontology (GO) enrichment analysis and create a scatterplot of enriched terms. @@ -823,15 +889,15 @@ def go_enrichment(df,num_gos=20,source=None,cell_type='cell_type',fontsize=32,s= Saves the scatterplot of enriched GO terms as a PDF file. """ - path='Results_PILOT/' + result_path=Path('Results_PILOT/') df_sorted =df gp = GProfiler(return_dataframe=True) gprofiler_results = gp.profile(organism = organism, query = list(df_sorted['gene'].values)) - if not os.path.exists(path+'GO/'+cell_type+'/'): - os.makedirs(path+'GO/'+cell_type+'/') - gprofiler_results.to_csv(path+'GO/'+cell_type+'/'+cell_type+"_all_gprofiler_results.csv") + go_path = result_path / 'GO' / cell_type + go_path.mkdir(parents=True, exist_ok=True) + gprofiler_results.to_csv(result_path+'GO/'+cell_type+'/'+cell_type+"_all_gprofiler_results.csv") if(gprofiler_results.shape[0] < num_gos): num_gos = gprofiler_results.shape[0] @@ -855,13 +921,12 @@ def go_enrichment(df,num_gos=20,source=None,cell_type='cell_type',fontsize=32,s= plt.ylabel("GO Terms", size = fontsize) plt.xlabel("-$log_{10}$ (P-value)", size = fontsize) - #if not os.path.exists(path+'GO/'): - # os.makedirs(path+'GO/') #plt.savefig(path+'GO/'+cell_type+".pdf", bbox_inches = 'tight', facecolor='white', transparent=False) - if not os.path.exists(path+'GO/'+cell_type+'/'): - os.makedirs(path+'GO/'+cell_type+'/') - plt.savefig(path+'GO/'+cell_type+'/'+cell_type+".pdf", bbox_inches = bbox_inches, facecolor=facecolor, transparent=transparent) + save_path = Path(result_path / "GO" / cell_type) + save_path.mkdir(parents=True, exist_ok=True) + plt.savefig(save_path / f"{cell_type}.pdf", bbox_inches = bbox_inches, facecolor=facecolor, transparent=transparent) + def plt_gene_cluster_differentiation(cellnames=['healthy_CM','Myofib'],font_size=22,p_value=0.01,fc_ther=0.5): """ Generate and save plots showcasing gene expression patterns for selected cell clusters. @@ -882,26 +947,32 @@ def plt_gene_cluster_differentiation(cellnames=['healthy_CM','Myofib'],font_size Generates and saves scatterplots of gene expression patterns for selected clusters. """ - file_name = "/Whole_expressions.csv" - path='Results_PILOT/' - all_stats_extend=pd.read_csv(path+'gene_clusters_stats_extend.csv') - with open(path + '/genes_dictionary.pkl', 'rb') as handle: + file_name = "Whole_expressions.csv" + result_path=Path('Results_PILOT') + all_stats_extend=pd.read_csv(result_path / 'gene_clusters_stats_extend.csv') + with open(result_path / 'genes_dictionary.pkl', 'rb') as handle: gene_dict = pickle.load(handle) pline = np.linspace(1, 20, 20) plot_stats_by_pattern(cluster_names=cellnames, all_stats_extend=all_stats_extend, gene_dict=gene_dict, - pline=pline, path_to_results=path, file_name=file_name,font_size=font_size, + pline=pline, path_to_results=result_path, file_name=file_name,font_size=font_size, p_value=p_value,fc_ther=fc_ther) + def qq_plot_gene(target, data, sorted_best, gene_name): """ Generate a QQ plot for a specific gene's performance. - Parameters: - target (pd.DataFrame): The target DataFrame containing gene labels. - data (pd.DataFrame): The data DataFrame containing gene data. - sorted_best (dict): A dictionary containing sorted gene results. - gene_name (str): The name of the gene to create the QQ plot for. + Parameters + ---------- + target : pd.DataFrame + The target DataFrame containing gene labels. + data : pd.DataFrame + The data DataFrame containing gene data. + sorted_best : dict + A dictionary containing sorted gene results. + gene_name : str + The name of the gene to create the QQ plot for. Returns: None: Displays the QQ plot for the specified gene's performance. @@ -926,27 +997,59 @@ def qq_plot_gene(target, data, sorted_best, gene_name): stats.probplot( (best_tf - predictions), dist="norm", plot=pylab) pylab.show() -def plot_best_matches_cell_types(target, data,df,sorted_best, scale_name, min_target=0, max_target=35,num=11,width=25,height=25,xlim=4,point_size=100,color_back=None,fontsize=28,alpha=1,cmap='viridis'): + +def plot_best_matches_cell_types(target, + data, + df, + sorted_best, + scale_name, + min_target=0, + max_target=35, + num=11, + width=25, + height=25, + xlim=4, + point_size=100, + color_back=None, + fontsize=28, + alpha=1, + cmap='viridis'): """ Plot the best-fitted models for cell types. Parameters: - target (pd.DataFrame): The target data for gene activity. - data (pd.DataFrame): The data containing cell type labels. - df (pd.DataFrame): The data frame containing sample information. - sorted_best (dict): A dictionary containing the best-fitted model results for each factor, sorted by R-squared or modified R-squared. - scale_name (str): The name of the scale. - min_target (int): The minimum target value. - max_target (int): The maximum target value. - num (int): Number of models to plot. - width (int): Width of the figure. - height (int): Height of the figure. - xlim (int): X-axis limit. - point_size (int): Size of data points. - color_back (str): Background color of the plot. - fontsize (int): Font size for the titles and labels. - alpha (float): Alpha value for data points. - cmap (str): Colormap for data points. + target : pd.DataFrame + The target data for gene activity. + data : pd.DataFrame + The data containing cell type labels. + df : pd.DataFrame + The data frame containing sample information. + sorted_best : dict + A dictionary containing the best-fitted model results for each factor, sorted by R-squared or modified R-squared. + scale_name : str + The name of the scale. + min_target : int, optional + The minimum target value. + max_target : int, optional + The maximum target value. + num : int, optional + Number of models to plot. + width : int, optional + Width of the figure. + height : int, optional + Height of the figure. + xlim : int, optional + X-axis limit. + point_size : int, optional + Size of data points. + color_back : str, optional + Background color of the plot. + fontsize : int, optional + Font size for the titles and labels. + alpha : float, optional + Alpha value for data points. + cmap : str, optional + Colormap for data points. Returns: None: This function generates the plot but does not return any value. @@ -1029,27 +1132,39 @@ def plot_best_matches_cell_types(target, data,df,sorted_best, scale_name, min_ta j += 1 - - def plot_best_matches(target, data,df, sorted_best, scale_name, plot_color='tab:orange',num=16,width=25,height=25,x_lim=4,fontsize=24,alpha=0.5,cmap='viridis',color_back=None): """ Plot the best-fitted models for different patterns. Parameters: - target (pd.DataFrame): The target data for gene activity. - data (pd.DataFrame): The data containing cell type labels. - df (pd.DataFrame): The data frame containing sample information. - sorted_best (dict): A dictionary containing the best-fitted model results for each factor, sorted by R-squared or modified R-squared. - scale_name (str): The name of the scale. - plot_color (str): Color of the plots. - num (int): Number of models to plot. - width (int): Width of the figure. - height (int): Height of the figure. - x_lim (int): X-axis limit. - fontsize (int): Font size for titles and labels. - alpha (float): Alpha value for data points. - cmap (str): Colormap for data points. - color_back (str): Background color of the plot. + target : pd.DataFrame + The target data for gene activity. + data : pd.DataFrame + The data containing cell type labels. + df : pd.DataFrame + The data frame containing sample information. + sorted_best : dict + A dictionary containing the best-fitted model results for each factor, sorted by R-squared or modified R-squared. + scale_name : str + The name of the scale. + plot_color : str, optional + Color of the plots. Default is 'tab:orange'. + num : int, optional + Number of models to plot. Default is 16. + width : int, optional + Width of the figure. Default is 25. + height : int, optional + Height of the figure. Default is 25. + x_lim : int, optional + X-axis limit. Drfault is 4. + fontsize : int, optional + Font size for titles and labels. Default is 24. + alpha : float, optional + Alpha value for data points. Default is 0.5. + cmap : str, optional + Colormap for data points. Default is 'viridis'. + color_back : str, optional + Background color of the plot. Default is None. Returns: None: This function generates the plot but does not return any value. @@ -1182,22 +1297,27 @@ def plot_best_matches(target, data,df, sorted_best, scale_name, plot_color='tab: counter=counter+1 - - - + def plot_two_genes(adata, sorted_best_WT, sorted_best_KO, gene_name, scale_name, plot_color1 = 'tab:blue', plot_color2 = 'tab:red'): """ Plot the gene activity for two different conditions (e.g., WT and KO) along with their best-fitted models. Parameters: - adata (AnnData): An AnnData object containing the data. - sorted_best_WT (dict): A dictionary containing the best-fitted model results for the wild-type (WT) condition. - sorted_best_KO (dict): A dictionary containing the best-fitted model results for the knock-out (KO) condition. - gene_name (str): The name of the gene for which the activity is plotted. - scale_name (str): The name of the scale. - plot_color1 (str): Color for the WT condition plot (default is 'tab:blue'). - plot_color2 (str): Color for the KO condition plot (default is 'tab:red'). + adata : AnnData + An AnnData object containing the data. + sorted_best_WT : dict + A dictionary containing the best-fitted model results for the wild-type (WT) condition. + sorted_best_KO : dict + A dictionary containing the best-fitted model results for the knock-out (KO) condition. + gene_name : str + The name of the gene for which the activity is plotted. + scale_name : str + The name of the scale. + plot_color1 : str, optional + Color for the WT condition plot. Default is 'tab:blue'. + plot_color2 : str, optional + Color for the KO condition plot. Default is 'tab:red'. Returns: None: This function generates the plot but does not return any value. @@ -1253,12 +1373,18 @@ def plot_one_gene(target, data, sorted_best, gene_name, scale_name, plot_color): Plot the gene activity and its best-fitted model for a single gene. Parameters: - target (pd.DataFrame): A Pandas DataFrame containing the target gene expression data. - data (pd.DataFrame): A Pandas DataFrame containing the data. - sorted_best (dict): A dictionary containing the best-fitted model results for multiple genes. - gene_name (str): The name of the gene for which the activity is plotted. - scale_name (str): The name of the scale. - plot_color (str): Color for the plot. + target : pd.DataFrame + A Pandas DataFrame containing the target gene expression data. + data : pd.DataFrame + A Pandas DataFrame containing the data. + sorted_best : dict + A dictionary containing the best-fitted model results for multiple genes. + gene_name : str + The name of the gene for which the activity is plotted. + scale_name : str + The name of the scale. + plot_color : str + Color for the plot. Returns: None: This function generates the plot but does not return any value. @@ -1297,17 +1423,24 @@ def plot_one_gene(target, data, sorted_best, gene_name, scale_name, plot_color): size=14) ax.set_xlabel(pattern) + def plot_gene(target, data, sorted_best, gene_name, scale_name, plot_color): """ Plot gene expression data along with the best-fitted curve and statistical information. Parameters: - target (pd.DataFrame): A Pandas DataFrame containing gene expression data. - data (pd.DataFrame): A Pandas DataFrame containing additional data such as labels. - sorted_best (dict): A dictionary containing the best-fitted models for different genes. - gene_name (str): The name of the gene to plot. - scale_name (str): The name of the scale for the y-axis. - plot_color (str): The color to use for plotting. + target : pd.DataFrame + A Pandas DataFrame containing gene expression data. + data : pd.DataFrame + A Pandas DataFrame containing additional data such as labels. + sorted_best : dict + A dictionary containing the best-fitted models for different genes. + gene_name : str + The name of the gene to plot. + scale_name : str + The name of the scale for the y-axis. + plot_color : str + The color to use for plotting. Returns: None @@ -1505,7 +1638,7 @@ def plot_gene_specific(target, data, sorted_best, gene_name, scale_name, plot_co ) p.draw() - + def plot_gene_distribtion(target, gene_name): """ Plot the distribution of gene expression for a specific gene. @@ -1527,7 +1660,8 @@ def plot_gene_distribtion(target, gene_name): plt.yticks(size = 12, weight = 'bold') plt.xticks(size = 12, weight = 'bold') plt.show() - + + def plot_gene_density(target, data, sorted_best, gene_name, scale_name, plot_color): """ Plot the density distribution of gene expression for a specific gene. @@ -1555,6 +1689,7 @@ def plot_gene_density(target, data, sorted_best, gene_name, scale_name, plot_col title=gene_name, ylabels=False, colormap=cm.autumn_r) + def plot_pval_rsq_correlation(table, feature1, feature2, show_fit = True, log_transform = False): """ @@ -1608,6 +1743,7 @@ def plot_pval_rsq_correlation(table, feature1, feature2, show_fit = True, log_tr plt.yticks(size = 14) plt.show() + def plot_condition(target, data, sorted_best, condition_type, scale_name, plot_color): """ Plot the best-fitted models based on up or downregulation patterns in gene expression. @@ -1701,7 +1837,8 @@ def plot_condition(target, data, sorted_best, condition_type, scale_name, plot_c j += 1 i += 1 - + + def plot_6_best(target, data, sorted_best, scale_name, plot_color): """ Plot the 6 best-fitted models for each pattern type (linear, quadratic, linear_quadratic) and regulation direction (up, down) in gene expression. @@ -1755,12 +1892,10 @@ def plot_6_best(target, data, sorted_best, scale_name, plot_color): ax.set_xlabel(patt) k += 1 - + def plot_hor_vs_vert(data, subplot, x, y, c, xlabel, ylabel, rotation, tick_bottom, tick_left, title,fontsize=24): - - ''' Plot horizontal and vertical bar charts using Seaborn. @@ -1807,196 +1942,402 @@ def plot_hor_vs_vert(data, subplot, x, y, c, xlabel, ylabel, rotation, ax.tick_params(bottom=tick_bottom, left=tick_left) return None -def volcano_plot(scores, foldchanges, p_values, cell_type, feature1, feature2, fc_thr = 1, pv_thr = 1, - figsize = (20,20), output_path = None,n_p=5,n_n=5,font_size=18, marker='o', - color='w', - markersize=8, - font_weight_legend='normal', - size_legend=12,dpi=100 - - - ): - - """ - Generate a volcano plot to visualize gene expression significance. - Parameters: - scores : pandas Series - A pandas Series containing the expression scores for genes. - foldchanges : array-like - An array-like containing the fold changes for genes. - p_values : pandas Series - A pandas Series containing the p-values for genes. - cell_type : str - The name of the cell type being analyzed. - feature1 : str - The name of the first feature being compared. - feature2 : str - The name of the second feature being compared. - fc_thr : float, optional (default=1) - The threshold for log2FoldChange to determine significance. - pv_thr : float, optional (default=1) - The threshold for negative log10 of p-value to determine significance. - figsize : tuple, optional (default=(15, 15)) - The size of the plot figure. - output_path : str, optional (default=None) - The path to save the output plot. If None, the plot will be displayed. - n_p : int, optional (default=5) - The number of labels that the user wants to show over the plot for positive threshold. - n_n : int, optional (default=5) - The number of labels that the user wants to show over the plot for negative threshold. - font_size : int, optional (default=18) - Font size for the plot. - marker : str, optional (default='o') - Marker style for data points in the plot. - color : str, optional (default='w') - Color for data points in the plot. - markersize : int, optional (default=8) - Marker size for data points in the plot. - font_weight_legend : str, optional (default='normal') - Font weight for legend text. - size_legend : int, optional (default=12) - Font size for legend text. - dpi : int, optional - Dots per inch for the saved plot image. Default is 100. +def map_color( + df, + fc_thr, + pv_thr, + selected_labels_all, + my_pal=None, + feature1=None, + feature2=None, + hue_order=None, + palette=None +): + ''' + Assigns color categories, point shapes, and sizes for volcano plot visualization. + It also identifies "important" genes for specialized shape mapping and calculates point sizes based + on significance. - Returns: - None - """ + Parameters + ---------- + df : pd.DataFrame + The differential expression results. Must contain 'log2FoldChange', + 'nlog10' (negative log10 of the p-value), and 'symbol' columns. + fc_thr : float + The fold change threshold used to define 'higher' and 'lower' expression. + pv_thr : float + The significance threshold (in -log10 scale). + selected_labels_all : list or set + A collection of gene symbols considered "important" to be assigned + a specific shape in the plot. + my_pal : dict, optional + A dictionary mapping feature names to specific hex or named colors. + feature1 : str, optional + The name of the first experimental group (used to pull colors from my_pal). + feature2 : str, optional + The name of the second experimental group (used to pull colors from my_pal). + hue_order : list, optional + Explicit ordering of color categories for the plot legend. + palette : list, optional + List of colors corresponding to the categories in hue_order. - - - df = pd.DataFrame(columns=['log2FoldChange', 'nlog10', 'symbol']) - df['log2FoldChange'] = foldchanges - df['nlog10'] = -np.log10(p_values.values) - df['symbol'] = scores.index.values - - df.replace([np.inf, -np.inf], np.nan, inplace=True) - df.dropna(subset=["nlog10"], how="all", inplace=True) - + Returns + ------- + df : pd.DataFrame + The input DataFrame with added 'color', 'shape', and 'baseMean' columns. + hue_order : list + The final list of categories used for plot coloring. + palette : list + The final list of colors mapped to the hue_order. + ''' + def map_color(row, fc_thrr, pv_thrr): + fc = row["log2FoldChange"] + sig = row["nlog10"] >= pv_thrr + if not sig: + if fc >= fc_thrr: + return "higher" + if fc <= -fc_thrr: + return "lower" + return "no" + if fc >= fc_thrr: + if fc > 2 * fc_thrr: + return "very higher" + return "higher" + if fc <= -fc_thrr: + if fc < -2 * fc_thrr: + return "very lower" + return "lower" + return "mix" - selected_labels = df.loc[ (np.abs(df.log2FoldChange) >= fc_thr) & (df['nlog10'] >= pv_thr)]['symbol'].values - group1_selected_labels = df.loc[ (df.log2FoldChange <= -fc_thr) & (df['nlog10'] >= pv_thr)]['symbol'].values - pd.DataFrame(group1_selected_labels).to_csv(output_path + "/significant_genes_" + str(cell_type) + "_" + str(feature1) + ".csv") - - group2_selected_labels = df.loc[ (df.log2FoldChange >= fc_thr) & (df['nlog10'] >= pv_thr)]['symbol'].values - pd.DataFrame(group2_selected_labels).to_csv(output_path + "/significant_genes_" + str(cell_type) + "_" + str(feature2) + ".csv") - def map_shape(symbol): - if symbol in selected_labels: - return 'important' - return 'not' - - df['color'] = df[['log2FoldChange', 'symbol', 'nlog10']].apply(map_color, fc_thrr = fc_thr, pv_thrr = pv_thr, axis = 1) - df['shape'] = df.symbol.map(map_shape) - df['baseMean'] = df.nlog10*10 + if symbol in selected_labels_all: + return "important" + return "not" + + # Map color + df["color"] = df.apply(lambda r: map_color(r, fc_thrr=fc_thr, pv_thrr=pv_thr), axis=1) + + # Use my_pal if provided, else default palette + if my_pal is not None and feature1 in my_pal and feature2 in my_pal: + color1 = my_pal[feature1] + color2 = my_pal[feature2] + if hue_order is None: + hue_order = ["no", "very higher", "very lower"] + if palette is None: + palette = ["lightgrey", color2, color1] + else: + if hue_order is None: + hue_order = ["no", "very higher", "higher", "mix", "very lower", "lower"] + if palette is None: + palette = [ + "lightgrey", # non‑significant / no + "#d62a2b", # very higher + "#D62A2B7A", # higher (semi‑transparent) + "lightgrey", # mix + "#1f77b4", # very lower + "#1F77B47D" # lower (semi‑transparent) + ] + + # Map shape and baseMean + df["shape"] = df["symbol"].map(map_shape) + df["baseMean"] = df["nlog10"] * 10 + + return df, hue_order, palette + + +def volcano_plot( + # input modes (mutually exclusive) + data=None, # DataFrame mode (provide data, symbol_Col, fc_col, pval_col) + symbol_col=None, + fc_col=None, + pval_col=None, + scores=None, # Series/array mode (provide scores, foldchanges, p_values) + foldchanges=None, + p_values=None, + + # context + cell_type=None, + feature1=None, + feature2=None, + + # thresholds + fc_thr=1.0, + pv_thr=1.0, # on -log10 scale + + # labeling + label_mode="topN", # "all" or "topN" + n_p=5, + n_n=5, + + my_pal=None, # optional dict {feature: color}, otherwise use default palette + hue_order=None, + palette=None, + figsize=(20, 20), + font_size=18, + marker='o', + marker_edge_color='w', + markersize_legend=8, + font_weight_legend='normal', + size_legend=12, + dpi=100, + output_path=None, + save_prefix="Volcano", + save_csv=True, +): + """ + Generate a unified volcano plot from either a DataFrame or Series/array input. - - plt.figure(figsize = figsize, frameon=False, dpi=100) - plt.style.use('default') - - - #plt.xlim(-xlim, xlim) - ax = sns.scatterplot(data = df, x = 'log2FoldChange', y = 'nlog10', - hue = 'color', hue_order = ['no', 'very higher','higher', 'mix', 'very lower', 'lower'], - palette = ['lightgrey', '#d62a2b', '#D62A2B7A', - 'lightgrey', '#1f77b4', '#1F77B47D'], - style = 'shape', style_order = ['not', 'important'], - markers = ['o', 'o'], - size = 'baseMean', sizes = (40, 800) - ) + Input modes: + 1) DataFrame: data + symbol_col + fc_col + pval_col + 2) Series/array: scores, foldchanges, p_values + + Parameters + ---------- + data : pd.DataFrame or None, optional + Input DataFrame containing gene‑level results. If given, symbol_col, fc_col, and pval_col must also be provided. + If None, scores, foldchanges, and p_values must be provided instead. + symbol_col : str or None, optional + Column name in data containing gene symbols. + fc_col : str or None, optional + Column name in data containing log2 fold changes. + pval_col : str or None, optional + Column name in data containing p‑values or adjusted p‑values. + scores : pd.Series or array‑like or None, optional + Index must correspond to gene symbols; used in Series/array mode instead of data. + foldchanges : array‑like or None, optional + Array of log2 fold changes, used in Series/array mode. + p_values : pd.Series or array‑like or None, optional + Array or Series of p‑values (or adjusted p‑values), used in Series/array mode. + cell_type : str or None, optional + Name of the cell type being analyzed, used for CSV and title context. + feature1 : str or None, optional + Name of the first group/condition in the comparison (e.g., reference). + feature2 : str or None, optional + Name of the second group/condition in the comparison (e.g., test). + fc_thr : float, optional + Threshold on log2 fold change (absolute) for significance. Use ±fc_thr. + pv_thr : float, optional + Threshold on -log₁₀(p‑value) for significance. + label_mode : {'all', 'topN'}, optional + How to select labels for display. + 'all': label all points that pass the thresholds. + 'topN': label only the top n_p and n_n most significant points per side. + n_p : int, optional + Number of highest‑significance points on the positive log2FC side to label (used when label_mode == 'topN'). + n_n : int, optional + Number of highest‑significance points on the negative log2FC side to label (used when label_mode == 'topN'). + my_pal : dict or None, optional + Optional color palette as a dict {feature1: color1, feature2: color2}. + If provided and valid, a two‑color scheme is used for the legend instead of the default palette. + hue_order : list of str or None, optional + Order of color categories in the legend; if None, sensible defaults are used. + palette : list of color‑spec or None, optional + Color palette to use for hue categories; if None, a default palette with semi‑transparent colors is used. + figsize : tuple of float, optional + Figure size (width, height) for the plot. + font_size : int, optional + Base font size for titles, axis labels, and tick labels. + marker : str, optional + Marker style for all points (e.g., 'o', 's'). + marker_edge_color : str, optional + Color of the marker edge (often 'w' for white edge). + markersize_legend : int, optional + Size of legend markers for the color legend. + font_weight_legend : str, optional + Font weight for legend text (e.g., 'normal', 'bold'). + size_legend : int, optional + Font size for legend text. + dpi : int, optional + Dots per inch for the saved figure. + output_path : str or None, optional + Directory path where the plot (and optionally CSVs) should be saved. If None, only show the plot. + save_prefix : str, optional + Prefix used when saving the PDF file (e.g., "Volcano"). + save_csv : bool, optional + Whether to save CSV files of significant genes per feature (used when output_path is not None). + """ - ax.axhline(pv_thr, zorder = 0, c = 'k', lw = 2, ls = '--') - ax.axvline(fc_thr, zorder = 0, c = 'k', lw = 2, ls = '--') - ax.axvline(-fc_thr, zorder = 0, c = 'k', lw = 2, ls = '--') + # Build DataFrame (log2FoldChange, nlog10, symbol) + # ------------------------------------------------------------------ + if data is not None: + if any(v is None for v in (symbol_col, fc_col, pval_col)): + raise ValueError("In DataFrame mode, symbol_col, fc_col, and pval_col must be provided.") + df = pd.DataFrame({ + "log2FoldChange": data[fc_col].values, + "nlog10": -np.log10(data[pval_col].values), + "symbol": data[symbol_col].values + }) + else: + if scores is None or foldchanges is None or p_values is None: + raise ValueError("In Series mode, scores, foldchanges, and p_values must all be provided.") + df = pd.DataFrame({ + "log2FoldChange": np.asarray(foldchanges), + "nlog10": -np.log10(np.asarray(p_values)), + "symbol": np.asarray(scores.index) + }) + + df.replace([np.inf, -np.inf], np.nan, inplace=True) + df.dropna(subset=["nlog10"], how="all", inplace=True) + + # Significance masks, CSV export + # ------------------------------------------------------------------ + sig_mask = (np.abs(df["log2FoldChange"]) >= fc_thr) & (df["nlog10"] >= pv_thr) + selected_labels_all = df.loc[sig_mask, "symbol"].values + + group1_mask = (df["log2FoldChange"] <= -fc_thr) & (df["nlog10"] >= pv_thr) + group2_mask = (df["log2FoldChange"] >= fc_thr) & (df["nlog10"] >= pv_thr) + + group1_selected = df.loc[group1_mask, "symbol"].values + group2_selected = df.loc[group2_mask, "symbol"].values + + if output_path is not None and save_csv: + if cell_type is None: + cell_type = "NA" + if feature1 is None: + feature1 = "group1" + if feature2 is None: + feature2 = "group2" + + pd.DataFrame(group1_selected, columns=["symbol"]).to_csv( + Path(output_path) / f"Significant_genes_{cell_type}_{feature1}.csv", index=False + ) + pd.DataFrame(group2_selected, columns=["symbol"]).to_csv( + Path(output_path) / f"Significant_genes_{cell_type}_{feature2}.csv", index=False + ) + + # Color mapping + df, hue_order, palette = map_color( + df=df, + fc_thr=fc_thr, + pv_thr=pv_thr, + selected_labels_all=selected_labels_all, + my_pal=my_pal, + feature1=feature1, + feature2=feature2, + hue_order=None, # or pass a custom one if needed + palette=None + ) + + + # Label selection + # ------------------------------------------------------------------ + if label_mode == "all": + subset_labels = selected_labels_all + elif label_mode == "topN": + filtered_df = df.loc[df["nlog10"] >= pv_thr] + pos_df = filtered_df.loc[filtered_df["log2FoldChange"] >= fc_thr] + pos_df = pos_df.sort_values(by="nlog10", ascending=False).head(n_p) + neg_df = filtered_df.loc[filtered_df["log2FoldChange"] <= -fc_thr] + neg_df = neg_df.sort_values(by="nlog10", ascending=False).head(n_n) + subset_labels = np.concatenate([pos_df["symbol"].values, neg_df["symbol"].values]) + else: + raise ValueError("label_mode must be 'all' or 'topN'.") + + # Plotting + # ------------------------------------------------------------------ + plt.figure(figsize=figsize, frameon=False, dpi=dpi) + plt.style.use("default") + + ax = sns.scatterplot( + data=df, + x="log2FoldChange", + y="nlog10", + hue="color", + hue_order=hue_order, + palette=palette, + style="shape", + style_order=["not", "important"], + markers=["o", "o"], + size="baseMean", + sizes=(40, 800) + ) + + # Lines + ax.axhline(pv_thr, zorder=0, c="k", lw=2, ls="--") + ax.axvline(fc_thr, zorder=0, c="k", lw=2, ls="--") + ax.axvline(-fc_thr, zorder=0, c="k", lw=2, ls="--") + + # Labels texts = [] - filtered_df = df.loc[df['nlog10'] >= pv_thr] - subset_labels_fold_change_pos = filtered_df.loc[filtered_df['log2FoldChange'] >= fc_thr] - subset_labels_fold_change_pos = subset_labels_fold_change_pos.sort_values(by='nlog10', ascending=False) - subset_labels_fold_change_pos = subset_labels_fold_change_pos.head(n_p)['symbol'].values - - subset_labels_fold_change_neg = filtered_df.loc[filtered_df['log2FoldChange'] <= -fc_thr] - subset_labels_fold_change_neg = subset_labels_fold_change_neg.sort_values(by='nlog10', ascending=False) - subset_labels_fold_change_neg = subset_labels_fold_change_neg.head(n_n)['symbol'].values - # Combine the subsets of genes - subset_labels = np.concatenate([subset_labels_fold_change_pos, subset_labels_fold_change_neg]) for i in range(len(df)): if df.iloc[i].symbol in subset_labels: - if df.iloc[i].nlog10 >= pv_thr and (df.iloc[i].log2FoldChange >= fc_thr): - texts.append(plt.text(x = df.iloc[i].log2FoldChange, y = df.iloc[i].nlog10, s = df.iloc[i].symbol, - fontsize = font_size, weight = 'bold', family = 'sans-serif')) - if df.iloc[i].nlog10 >= pv_thr and ( df.iloc[i].log2FoldChange <= -fc_thr): - texts.append(plt.text(x = df.iloc[i].log2FoldChange, y = df.iloc[i].nlog10, s = df.iloc[i].symbol, - fontsize = font_size, weight = 'bold', family = 'sans-serif')) - adjust_text(texts) - # for i in range(len(df)): - # if df.iloc[i].symbol in subset_labels: - # if df.iloc[i].nlog10 >= pv_thr and (df.iloc[i].log2FoldChange >= fc_thr): - # texts.append(plt.text(x = df.iloc[i].log2FoldChange, y = df.iloc[i].nlog10, s = df.iloc[i].symbol, - # fontsize = 16, weight = 'bold', family = 'sans-serif')) - # if df.iloc[i].nlog10 >= pv_thr and ( df.iloc[i].log2FoldChange <= -fc_thr): - # texts.append(plt.text(x = df.iloc[i].log2FoldChange, y = df.iloc[i].nlog10, s = df.iloc[i].symbol, - # fontsize = 16, weight = 'bold', family = 'sans-serif')) - #adjust_text(texts) - - custom_lines = [Line2D([0], [0], marker=marker, color=color, markerfacecolor='#d62a2b', markersize=markersize), - Line2D([0], [0], marker=marker, color=color, markerfacecolor='#1f77b4', markersize=markersize)] - - plt.legend(custom_lines, ['Higher expressions in ' + feature2, 'Higher expressions in ' + feature1],loc = 1, - bbox_to_anchor = (1,1.1), frameon = False, prop = {'weight': font_weight_legend, 'size': size_legend}) - - for axis in ['bottom', 'left']: + if df.iloc[i].nlog10 >= pv_thr and df.iloc[i].log2FoldChange >= fc_thr: + texts.append( + plt.text( + x=df.iloc[i].log2FoldChange, + y=df.iloc[i].nlog10, + s=df.iloc[i].symbol, + fontsize=font_size, + weight="bold", + family="sans-serif", + ) + ) + if df.iloc[i].nlog10 >= pv_thr and df.iloc[i].log2FoldChange <= -fc_thr: + texts.append( + plt.text( + x=df.iloc[i].log2FoldChange, + y=df.iloc[i].nlog10, + s=df.iloc[i].symbol, + fontsize=font_size, + weight="bold", + family="sans-serif", + ) + ) + if texts: + #adjust_text(texts, arrowprops=dict(arrowstyle="->", color='black', lw=1)) + adjust_text(texts) + + # Legend + if feature1 is None: + feature1 = "group1" + if feature2 is None: + feature2 = "group2" + + col_pos = palette[1] if len(palette) > 1 else "#d62a2b" + col_neg = palette[-1] if len(palette) > 1 else "#1f77b4" + + custom_lines = [ + Line2D([0], [0], marker=marker, color=marker_edge_color, + markerfacecolor=col_pos, markersize=markersize_legend), + Line2D([0], [0], marker=marker, color=marker_edge_color, + markerfacecolor=col_neg, markersize=markersize_legend), + ] + plt.legend( + custom_lines, + [f"Higher expressions in {feature2}", f"Higher expressions in {feature1}"], + loc=1, + bbox_to_anchor=(1, 1.1), + frameon=False, + prop={"weight": font_weight_legend, "size": size_legend}, + ) + + # Axis styling + for axis in ["bottom", "left"]: ax.spines[axis].set_linewidth(2) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + ax.tick_params(width=2) + ax.set_ylim(bottom=0) + + plt.title(f"Expression Score\n{feature1} - {feature2}", fontsize=font_size) + plt.xticks(size=font_size, weight="bold") + plt.yticks(size=font_size, weight="bold") + plt.xlabel("$log_{2}$ (Fold Change)", size=font_size) + plt.ylabel("-$log_{10}$ (P-value)", size=font_size) + + # Save + if output_path is not None: + plt.savefig( + Path(output_path) / f"{save_prefix} {feature1}-{feature2} FC.pdf", + dpi=dpi, + bbox_inches="tight", + facecolor="white", + ) - ax.spines['top'].set_visible(False) - ax.spines['right'].set_visible(False) - - ax.tick_params(width = 2) - - plt.title("Expression Score \n "+feature1+" - "+feature2, fontsize = font_size) - plt.xticks(size = font_size, weight = 'bold') - plt.yticks(size = font_size, weight = 'bold') - plt.xlabel("$log_{2}$ (Fold Change)", size = font_size) - plt.ylabel("-$log_{10}$ (P-value)", size = font_size) - -# plt.savefig(filename, dpi = 100, bbox_inches = 'tight', facecolor = 'white') - plt.savefig(output_path + "/volcano_" + str(feature1) + "-" + str(feature2) + "_FC.pdf", - dpi = dpi, bbox_inches = 'tight', facecolor = 'white') plt.show() - - - -def map_color(a, fc_thrr, pv_thrr): - """ - Map colors based on specified thresholds for Fold Change and p-value. - - Parameters: - a : tuple - A tuple containing log2FoldChange, symbol, and negative log10 of p-value. - fc_thrr : float - The threshold for log2FoldChange to determine different color mappings. - pv_thrr : float - The threshold for negative log10 of p-value to determine different color mappings. - - Returns: - str - A string indicating the color mapping based on the provided thresholds and input values. - """ - log2FoldChange, symbol, nlog10 = a - if log2FoldChange >= fc_thrr and nlog10 >= pv_thrr: - return 'very higher' - elif log2FoldChange <= -fc_thrr and nlog10 >= pv_thrr: - return 'very lower' - elif log2FoldChange >= fc_thrr and nlog10 < pv_thrr: - return 'higher' - elif log2FoldChange <= -fc_thrr and nlog10 < pv_thrr: - return 'lower' - elif abs(log2FoldChange) < fc_thrr and nlog10 >= pv_thrr: - return 'mix' - else: - return 'no' def plot_stats_by_pattern(cluster_names: list = None, all_stats_extend: pd.DataFrame = None, diff --git a/pilotpy/plot/pseudobulk_DE_analysis.py b/pilotpy/plot/pseudobulk_DE_analysis.py index 1846af5..118e0eb 100644 --- a/pilotpy/plot/pseudobulk_DE_analysis.py +++ b/pilotpy/plot/pseudobulk_DE_analysis.py @@ -14,6 +14,7 @@ from sklearn.feature_selection import VarianceThreshold from sklearn.decomposition import PCA from matplotlib.lines import Line2D +from pathlib import Path from adjustText import adjust_text from gprofiler import GProfiler @@ -21,46 +22,48 @@ import seaborn as sns import matplotlib.pyplot as plt -import rpy2.robjects as robjects -import rpy2.robjects.numpy2ri -from rpy2.robjects import pandas2ri +from pydeseq2.dds import DeseqDataSet +from pydeseq2.ds import DeseqStats -from rpy2.rinterface_lib.callbacks import logger as rpy2_logger -import logging -rpy2_logger.setLevel(logging.ERROR) -pandas2ri.activate() +from .ploting import gene_annotation_cell_type_subgroup, volcano_plot -def plot_cell_numbers(adata, proportion_df, +def plot_cell_numbers(adata, + proportion_df, cell_type: str = None, cluster_col: str = "Predicted_Labels", celltype_col: str = "cell_types", sample_col: str = "sampleID", my_pal = None): - """ - + """ Parameters ---------- - adata : TYPE - DESCRIPTION. - proportion_df : TYPE - DESCRIPTION. + adata : AnnData + Annotated data object containing single-cell gene expression and metadata. + proportion_df : pandas.DataFrame + A dataframe indexed by sample IDs that contains group or cluster assignments for each sample. cell_type : str, optional - DESCRIPTION. The default is None. + The specific cell type to filter for from the `celltype_col`. + The default is None. cluster_col : str, optional - DESCRIPTION. The default is "Predicted_Labels". + The column name in `proportion_df` representing the group/cluster + labels used for coloring the bars. The default is "Predicted_Labels". celltype_col : str, optional - DESCRIPTION. The default is "cell_types". + The column name in `adata.obs` that contains cell type annotations. + The default is "cell_types". sample_col : str, optional - DESCRIPTION. The default is "sampleID". + The column name in `adata.obs` representing unique sample identifiers. + The default is "sampleID". my_pal : TYPE, optional - DESCRIPTION. The default is None. + A custom color palette mapping groups to colors. If None, it defaults + to a red/blue scheme for 3 groups or 'tab10' otherwise. + The default is None. Returns ------- None. - + The function generates and displays a matplotlib bar plot. """ copy_cells = adata.obs.copy() @@ -92,121 +95,170 @@ def plot_cell_numbers(adata, proportion_df, handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels] plt.legend(handles, labels, fontsize = 24) plt.show() - + + def compute_pseudobulk_DE( cluster_counts: pd.DataFrame = None, cluster_metadata: pd.DataFrame = None, - group1: str = None, - group2: str = None, - cluster_col: str = None): + group1: str = "Tumor1", + group2: str = "Tumor1", + cluster_col: str = None, + n_cpus: int = 8): """ + Computes Differential Expression using PyDESeq2, mimicking the R DESeq2 pipeline with apeglm LFC shrinkage. + Performs Wald test for the contrast stage_group1_vs_group2 group2 as reference level). + Parameters ---------- - aggr_counts : pd.DataFrame, optional - DESCRIPTION. The default is None. - metadata : pd.DataFrame, optional - DESCRIPTION. The default is None. - cell_type : str, optional - DESCRIPTION. The default is None. + aggr_counts : pd.DataFrame + Pseudobulk counts matrix with rows = samples, columns = genes, non-negative integers. The default is None. + metadata : pd.DataFrame + Sample metadata with 'stage' column containing group1/group2 labels. The default is None. group1 : str, optional - DESCRIPTION. The default is None. + Name of experimental group (numerator of fold change). The default is "Tumor1". group2 : str, optional - DESCRIPTION. The default is None. + Name of reference/control group (denominator of fold change). The default is "Tumor2". n_cpus : int, optional - DESCRIPTION. The default is 8. + Number of CPU cores for dispersion estimation and Wald tests. The default is 8. Returns ------- - my_stat_res : TYPE - DESCRIPTION. + dict + Single key 'stage_{group1}_vs_{group2}' mapping to pandas DataFrame + with columns: 'gene', 'log2FoldChange', 'pvalue', 'padj', etc. + Sorted by adjusted p-value (padj). + + Note + ---- + Expects raw integer counts (DESeq2-style normalization applied internally). + Sets explicit categorical reference: group2 < group1 for consistent + contrast direction matching DESeq2 coefficient naming. """ - # consider DE between two group of interest - my_cluster_metadata = cluster_metadata[ (cluster_metadata[cluster_col] == group1 ) | (cluster_metadata[cluster_col] == group2)] - my_cluster_counts = cluster_counts.loc[my_cluster_metadata.index] + # Filter metadata and counts for two groups of interest + mask = cluster_metadata[cluster_col].isin([group1, group2]) + my_cluster_metadata = cluster_metadata[mask].copy() + + # Ensure counts match filtered metadata + my_cluster_counts = cluster_counts.loc[my_cluster_metadata.index].copy() - R = robjects.r - R('library(SingleCellExperiment)') - R('library(DESeq2)') - R('library(apeglm)') - R('library(tidyverse, verbose = FALSE)') - R.assign('cluster_counts', my_cluster_counts) - R.assign('cluster_metadata', my_cluster_metadata) + my_cluster_metadata["stage"] = pd.Categorical( + #my_cluster_metadata["stage"], + my_cluster_metadata[cluster_col], + categories=[group2, group1], # reference first + ordered=True, + ) try: - - R('dds <- DESeqDataSetFromMatrix(round(t(cluster_counts)), colData = cluster_metadata, design = ~ stage)') - R('rld <- rlog(dds, blind = TRUE)') - R('dds <- DESeq(dds)') - R(' \ - mylist <- list(resultsNames(dds)); \ - for(coef in resultsNames(dds)){ \ - if(coef != "Intercept"){ \ - print(coef); \ - res <- results(dds, name = coef, alpha = 0.05); \ - res <- lfcShrink(dds, coef = coef, res = res, type = "apeglm"); \ - res_tbl <- res %>% data.frame() %>% rownames_to_column(var = "gene") %>% as_tibble() %>% arrange(padj); \ - mylist[[coef]] <- res_tbl; \ - } \ - } \ - ') + dds = DeseqDataSet( + counts=my_cluster_counts.astype(int), + metadata=my_cluster_metadata, + design_factors="stage", + refit_cooks=True, + n_cpus=n_cpus + ) + + # Run DESeq2 pipeline + dds.deseq2() + + # Compute results, apply LFC Shrinkage (apeglm) + stat_res = DeseqStats(dds, + contrast=['stage', group2, group1], + n_cpus=n_cpus) - res = R('''mylist''') - return res - except rpy2.rinterface_lib.embedded.RRuntimeError: + # Executes Wald test and applies shrinkage + stat_res.summary() + + res_df = stat_res.results_df + res_df = res_df.reset_index().rename(columns={'index': 'gene'}) + res_df = res_df.sort_values('padj') + + return {f"stage_{group1}_vs_{group2}": res_df} + + except Exception as e: + print(f"Error during DE analysis: {e}") return None - + + def compute_pseudobulk_PCA( cluster_counts: pd.DataFrame = None, cluster_metadata: pd.DataFrame = None): - """ + Computes regularized log-transformed pseudobulk counts for PCA using PyDESeq2. + Replaces DESeq2's rlog() with PyDESeq2's equivalent regularized transformation. + Parameters ---------- - aggr_counts : pd.DataFrame, optional - DESCRIPTION. The default is None. - metadata : pd.DataFrame, optional - DESCRIPTION. The default is None. - cell_type : str, optional - DESCRIPTION. The default is None. - group1 : str, optional - DESCRIPTION. The default is None. - group2 : str, optional - DESCRIPTION. The default is None. - n_cpus : int, optional - DESCRIPTION. The default is 8. + cluster_counts : pd.DataFrame + Pseudobulk counts matrix (rows = samples, columns = genes). The default is None. + cluster_metadata : pd.DataFrame + Sample metadata with 'stage' column. The default is None. Returns ------- - my_stat_res : TYPE - DESCRIPTION. - + pd.DataFrame + Regularized log-transformed counts (genes x samples), suitable for PCA + Column names = sample names, row names = gene names """ - - # consider DE between two group of interest - - R = robjects.r - R('library(SingleCellExperiment)') - R('library(DESeq2)') - R('library(apeglm)') - R('library(tidyverse, verbose = FALSE)') - R.assign('cluster_counts', cluster_counts) - R.assign('cluster_metadata', cluster_metadata) - try: - - R('dds <- DESeqDataSetFromMatrix(round(t(cluster_counts)), colData = cluster_metadata, design = ~ stage)') - R('rld <- rlog(dds, blind = TRUE)') - R('dds <- DESeq(dds)') - rld = R('''as.data.frame(assay(rld))''') - return rld - except rpy2.rinterface_lib.embedded.RRuntimeError: + # Initialize DeseqDataSet (PyDESeq2 expects samples x genes) + dds = DeseqDataSet( + counts=cluster_counts.astype(int), + metadata=cluster_metadata, + design_factors="stage", + refit_cooks=True + ) + dds.deseq2() + + # Get regularized log-transformed counts + dds.vst() + + rld_df = pd.DataFrame( + dds.layers["vst_counts"].T, + index=cluster_counts.columns, # genes as index (from input) + columns=cluster_counts.index # samples as columns (from input) + ) + return rld_df + + except Exception as e: + print(f"Error during rlog computation: {e}") return None + def plotPCA_subgroups(proportions, deseq2_counts, cell_type, my_pal, cluster_col): + + """ + Performs feature selection via variance thresholding and visualizes + samples using Principal Component Analysis (PCA). + + Parameters + ---------- + proportions : pandas.DataFrame + A dataframe containing metadata or cluster assignments for the samples. + Must be indexed by sample IDs that match the index of `deseq2_counts`. + deseq2_counts : pandas.DataFrame + A dataframe of normalized expression counts (or similar features) + where rows are samples and columns are features (e.g., genes). + cell_type : str + The name of the cell type being analyzed, used primarily for the + plot title. + my_pal : dict + A dictionary mapping cluster/group labels to specific hex colors + or matplotlib color names. + cluster_col : str + The column name in the `proportions` dataframe used to color + the samples in the PCA plot. + + Returns + ------- + None + Displays a PCA scatter plot with explained variance ratios on + the axes and a custom legend. + """ + # consider top variances features selector = VarianceThreshold(0.2) new_deseq2_counts = selector.fit_transform(deseq2_counts) @@ -231,261 +283,52 @@ def plotPCA_subgroups(proportions, deseq2_counts, cell_type, my_pal, cluster_col markerfacecolor=my_pal[k], markersize=15)) ax.legend(handles=legend_elements, loc=1) plt.show() - -def map_color_ps(a, low_fc_thrr, high_fc_thrr, pv_thrr): - log2FoldChange, symbol, nlog10 = a - if log2FoldChange >= high_fc_thrr and nlog10 >= pv_thrr: - return 'very higher' - elif log2FoldChange <= -low_fc_thrr and nlog10 >= pv_thrr: - return 'very lower' - else: - return 'no' - -def volcano_plot_ps(data, symbol, foldchange, p_value, - cell_type, - feature1, - feature2, - low_fc_thr = 1, - high_fc_thr = 1, - pv_thr = 1, - figsize = (20,10), - output_path = None, - my_pal = None, - fontsize: int = 14 - ): - """ - - - Parameters - ---------- - data : TYPE - DESCRIPTION. - symbol : TYPE - DESCRIPTION. - foldchange : TYPE - DESCRIPTION. - p_value : TYPE - DESCRIPTION. - cell_type : TYPE - DESCRIPTION. - feature1 : TYPE - DESCRIPTION. - feature2 : TYPE - DESCRIPTION. - low_fc_thr : TYPE, optional - DESCRIPTION. The default is 1. - high_fc_thr : TYPE, optional - DESCRIPTION. The default is 1. - pv_thr : TYPE, optional - DESCRIPTION. The default is 1. - figsize : TYPE, optional - DESCRIPTION. The default is (20,10). - output_path : TYPE, optional - DESCRIPTION. The default is None. - my_pal : TYPE, optional - DESCRIPTION. The default is None. - fontsize : int, optional - DESCRIPTION. The default is 14. - - Returns - ------- - str - DESCRIPTION. - - """ - - df = pd.DataFrame(columns=['log2FoldChange', 'nlog10', 'symbol']) - df['log2FoldChange'] = data[foldchange] - df['nlog10'] = -np.log10(data[p_value].values) - df['symbol'] = data[symbol].values - - color1 = my_pal[feature1] - color2 = my_pal[feature2] - - df.replace([np.inf, -np.inf], np.nan, inplace=True) - df.dropna(subset=["nlog10"], how="all", inplace=True) - - - selected_labels = df.loc[ (df.log2FoldChange <= low_fc_thr) & (df.log2FoldChange >= high_fc_thr) & \ - (df['nlog10'] >= pv_thr)]['symbol'].values - - def map_shape(symbol): - if symbol in selected_labels: - return 'important' - return 'not' - - df['color'] = df[['log2FoldChange', 'symbol', 'nlog10']].apply(map_color_ps, low_fc_thrr = low_fc_thr, - high_fc_thrr = high_fc_thr, - pv_thrr = pv_thr, axis = 1) - df['shape'] = df.symbol.map(map_shape) - df['baseMean'] = df.nlog10*10 - - - plt.figure(figsize = figsize, frameon=False, dpi=100) - plt.style.use('default') - - ax = sns.scatterplot(data = df, x = 'log2FoldChange', y = 'nlog10', - hue = 'color', hue_order = ['no', 'very higher', 'very lower'], - palette = ['lightgrey', color2, color1], - style = 'shape', style_order = ['not', 'important'], - markers = ['o', 'o'], - size = 'baseMean', sizes = (40, 400) - ) - - ax.axhline(pv_thr, zorder = 0, c = 'k', lw = 2, ls = '--') - ax.axvline(high_fc_thr, zorder = 0, c = 'k', lw = 2, ls = '--') - ax.axvline(-low_fc_thr, zorder = 0, c = 'k', lw = 2, ls = '--') - - texts = [] - for i in range(len(df)): - if df.iloc[i].nlog10 >= pv_thr and (df.iloc[i].log2FoldChange >= high_fc_thr): - texts.append(plt.text(x = df.iloc[i].log2FoldChange, y = df.iloc[i].nlog10, s = df.iloc[i].symbol, - fontsize = fontsize, weight = 'bold', family = 'sans-serif')) - if df.iloc[i].nlog10 >= pv_thr and ( df.iloc[i].log2FoldChange <= -low_fc_thr): - texts.append(plt.text(x = df.iloc[i].log2FoldChange, y = df.iloc[i].nlog10, s = df.iloc[i].symbol, - fontsize = fontsize + 2, weight = 'bold', family = 'sans-serif')) - adjust_text(texts) - - custom_lines = [Line2D([0], [0], marker='o', color='w', markerfacecolor=color2, markersize=fontsize), - Line2D([0], [0], marker='o', color='w', markerfacecolor=color1, markersize=fontsize)] - plt.legend(custom_lines, ['Higher expressions in ' + feature2, 'Higher expressions in ' + feature1], loc = 1, - bbox_to_anchor = (1,1.1), frameon = False, prop = {'weight': 'normal', 'size': fontsize}) - for axis in ['bottom', 'left']: - ax.spines[axis].set_linewidth(2) - - ax.spines['top'].set_visible(False) - ax.spines['right'].set_visible(False) - - ax.tick_params(width = 2) - ax.set_ylim(bottom=0) - plt.title("Expression Score \n " + feature1 + " - " + feature2, fontsize = fontsize + 4) - plt.xticks(size = fontsize, weight = 'bold') - plt.yticks(size = fontsize, weight = 'bold') - - plt.xlabel("$log_{2}$ (Fold Change)", size = fontsize + 2) - plt.ylabel("-$log_{10}$ (P-value)", size = fontsize + 2) - - if output_path is not None: - plt.savefig(output_path + "/volcano_" + str(feature1) + "-" + str(feature2) + "_FC.pdf", - dpi = 100, bbox_inches = 'tight', facecolor = 'white') - plt.show() - -def gene_annotation_cell_type_subgroup(data: pd.DataFrame = None, - symbol: str = 'gene', - sig_col: str = 'significant_gene', - cell_type: str = None, - group: str = None, - sources: str = None, - num_gos: int = 10, - fig_h: int = 6, - fig_w: int = 4, - font_size: int = 14, - max_length:int = 50, - path_to_results: str = None, - my_pal = None - ): +def get_sig_genes(data, + symbol: str, + foldchange: str, + p_value: str, + feature1: str, + feature2: str, + low_fc_thr: float = 1, + high_fc_thr: float = 1, + pv_thr: float = 1): """ - Plot to show the most relative GO terms for specifc cell-type of determind patient sub-group + Identifies and labels significantly differentially expressed genes based on + log2 fold-change and p-value thresholds. Parameters ---------- - data : pd.DataFrame - DESCRIPTION. The default is None. - symbol : str, optional - DESCRIPTION. The default is 'gene'. - sig_col : str, optional - DESCRIPTION. The default is 'significant_gene'. - cell_type : str - DESCRIPTION. The default is None. - group : str - DESCRIPTION. The default is None. - sources : str, optional - DESCRIPTION. The default is None. - num_gos : int, optional - DESCRIPTION. The default is 10. - fig_h : int, optional - DESCRIPTION. The default is 6. - fig_w : int, optional - DESCRIPTION. The default is 4. - font_size : int, optional - DESCRIPTION. The default is 14. - max_length : int, optional - DESCRIPTION. The default is 50. - path_to_results : str, optional - DESCRIPTION. The default is None. - my_pal : TYPE, optional - DESCRIPTION. The default is None. + data : AnnData + Annotated data object containing single-cell gene expression and metadata.. + symbol : str + The column name in `data` containing gene symbols or identifiers. + foldchange : str + The column name in `data` containing log2 fold-change values. + p_value : str + The column name in `data` containing p-values (unadjusted or adjusted). + feature1 : str + The label to assign to significantly down-regulated genes (e.g., "Downregulated"). + feature2 : str + The label to assign to significantly up-regulated genes (e.g., "Upregulated"). + low_fc_thr : float, optional + The absolute log2 fold-change threshold for down-regulation. + Genes must be <= -low_fc_thr. The default is 1. + high_fc_thr : float, optional + The absolute log2 fold-change threshold for up-regulation. + Genes must be >= high_fc_thr. The default is 1. + pv_thr : float, optional + The threshold for significance on the -log10(p-value) scale. + The default is 1 (which corresponds to p < 0.1). Returns ------- - None. - + pandas.DataFrame + The original dataframe with an additional 'significant_gene' column containing + the labels provided in `feature1` and `feature2` for genes passing thresholds. """ -# path_to_results = 'Results_PILOT' - - color = my_pal[group] - -# group_genes = pd.read_csv(path_to_results + \ -# "/significant_genes_" + cell_type + "_" + group + ".csv") - - group_genes = data.loc[data[sig_col] == group, symbol].values - gp = GProfiler(return_dataframe = True) - if list(group_genes): - gprofiler_results = gp.profile(organism = 'hsapiens', - query = list(group_genes), - no_evidences = False, - sources = sources) - else: - return "Genes list is empty!" - - if(gprofiler_results.shape[0] == 0): - return "Not enough information!" - elif(gprofiler_results.shape[0] < num_gos): - num_gos = gprofiler_results.shape[0] - - all_gprofiler_results = gprofiler_results.copy() - # display(all_gprofiler_results.head()) - - # print(len(list(group_genes['symbol'].values))) - # selected_gps = gprofiler_results.loc[0:num_gos,['name', 'p_value']] - selected_gps = gprofiler_results.head(num_gos)[['name', 'p_value']] - - selected_gps['nlog10'] = -np.log10(selected_gps['p_value'].values) - - for i in selected_gps.index: - split_name = "\n".join(tw.wrap(selected_gps.loc[i, 'name'], max_length)) - selected_gps.loc[i, 'name'] = split_name - - figsize = (fig_h, fig_w) - - plt.figure(figsize = figsize, dpi = 100) - plt.style.use('default') - sns.scatterplot(data = selected_gps, x = "nlog10", y = "name", s = 300, color = color) - - plt.title('GO enrichment in ' + cell_type + ' associated with ' + group + \ - '\n (number of genes: ' + str(len(list(group_genes))) + ")", fontsize = font_size + 2) - - plt.xticks(size = font_size) - plt.yticks(size = font_size) - - plt.ylabel("GO Terms", size = font_size) - plt.xlabel("-$log_{10}$ (P-value)", size = font_size) - - save_path = path_to_results + '/' - if not os.path.exists(save_path): - os.makedirs(save_path) -# plt.savefig(save_path + group + ".pdf", bbox_inches = 'tight', -# facecolor = 'white', transparent = False) - plt.show() - - all_gprofiler_results.to_csv(save_path + group + ".csv") - -def get_sig_genes(data, symbol, foldchange, p_value, cell_type, - feature1, feature2, - low_fc_thr = 1, high_fc_thr = 1, pv_thr = 1): df = pd.DataFrame(columns=['log2FoldChange', 'nlog10', 'symbol']) df['log2FoldChange'] = data[foldchange] df['nlog10'] = -np.log10(data[p_value].values) @@ -503,6 +346,7 @@ def get_sig_genes(data, symbol, foldchange, p_value, cell_type, return data + def get_pseudobulk_DE(adata: ad.AnnData, proportion_df: pd.DataFrame, cell_type: str, @@ -513,64 +357,84 @@ def get_pseudobulk_DE(adata: ad.AnnData, cluster_col: str = "Predicted_Labels", remove_samples: list = [], my_pal: dict = None, - path_to_results: str = 'Results_PILOT/', - figsize: tuple = (30, 15), + path_to_results: str = 'Results_PILOT', + figsize: tuple = (15, 15), num_gos: int = 10, fig_h: int = 6, fig_w: int = 4, sources: list = ['GO:CC', 'GO:PB', 'GO:MF'], fontsize: int = 14, - load: bool = False + load: bool = False, + label_mode: str = "all", + n_p: int = 10, + n_n: int = 10 ): """ + Pseudobulk differential expression analysis pipeline using PyDESeq2. + Aggregates single-cell counts to sample-level pseudobulk, performs PCA QC, + runs pairwise DE between clusters, volcano plots, and GO enrichment. - Parameters ---------- adata : ad.AnnData - DESCRIPTION. + Single-cell count matrix and metadata. proportion_df : pd.DataFrame - DESCRIPTION. + Cell type proportions per sample (index = sampleID). cell_type : str - DESCRIPTION. + Cell type/cluster for pseudobulk aggregation and DE. fc_thr : list - DESCRIPTION. + Fold-change thresholds for volcano plots (one per cluster pair). pv_thr : float, optional - DESCRIPTION. The default is 0.05. + Adjusted p-value threshold. The default is 0.05. celltype_col : str, optional - DESCRIPTION. The default is "cell_types". + Column name for cell types in adata.obs. The default is "cell_types". sample_col : str, optional - DESCRIPTION. The default is "sampleID". + Column name for sample IDs in adata.obs. The default is "sampleID". cluster_col : str, optional - DESCRIPTION. The default is "Predicted_Labels". + Column name for cluster labels in proportion_df. The default is "Predicted_Labels". remove_samples : list, optional - DESCRIPTION. The default is []. + Sample IDs to exclude from analysis. The default is []. my_pal : dict, optional - DESCRIPTION. The default is None. + Color palette for clusters. The default is None. path_to_results : str, optional - DESCRIPTION. The default is 'Results_PILOT/'. + Base directory for saving results. The default is 'Results_PILOT/'. figsize : tuple, optional - DESCRIPTION. The default is (30, 15). + Volcano plot figure size. The default is (30, 15). num_gos : int, optional - DESCRIPTION. The default is 10. + Number of top GO terms to plot. The default is 10. fig_h : int, optional - DESCRIPTION. The default is 6. + GO plot height. The default is 6. fig_w : int, optional - DESCRIPTION. The default is 4. + GO plot width. The default is 4. sources : list, optional - DESCRIPTION. The default is ['GO:CC', 'GO:PB', 'GO:MF']. + GO term sources. The default is ['GO:CC', 'GO:PB', 'GO:MF']. fontsize : int, optional - DESCRIPTION. The default is 14. + Plot font size. The default is 14. load : bool, optional - DESCRIPTION. The default is False. + Load precomputed results instead of recomputing. The default is False. + label_mode : {'all', 'topN'}, optional + How to select labels for display. + 'all': label all points that pass the thresholds. + 'topN': label only the top n_p and n_n most significant points per side. + n_p : int, optional + Number of highest‑significance points on the positive log2FC side to label (used when label_mode == 'topN'). + n_n : int, optional + Number of highest‑significance points on the negative log2FC side to label (used when label_mode == 'topN'). Returns ------- - None. - + None + Saves PCA, DE results, volcano plots, and GO analyses to disk. + + Notes + ----- + - Uses PyDESeq2's rlog_norm() for PCA-ready normalized counts + - Performs all pairwise DE between unique clusters in proportion_df[cluster_col] + - Assumes raw counts in adata.X/layers (no normalization applied before aggregation) + - Saves results to: Results_PILOT/Diff_Expressions_Results/{cell_type}/pseudobulk/ """ - + # Color palette for clusters n_clusters = np.unique(proportion_df[cluster_col]) if my_pal is None: if len(n_clusters) == 3: @@ -578,89 +442,106 @@ def get_pseudobulk_DE(adata: ad.AnnData, else: my_pal = dict(zip(n_clusters, sns.color_palette("tab10", len(n_clusters)))) - save_path = path_to_results + "/Diff_Expressions_Results/" + str(cell_type) + "/pseudobulk/" + # Output dir + save_path = Path(path_to_results) / "Diff_Expressions_Results" / str(cell_type) / "pseudobulk" log_pv_thr = -np.log10(pv_thr) + # Plot cell frequency QC print("Plot cells frequency for each sample... ") - plot_cell_numbers(adata, proportion_df, cell_type = cell_type, - cluster_col = cluster_col, celltype_col = celltype_col, - sample_col = sample_col, my_pal= my_pal) + plot_cell_numbers(adata, proportion_df, cell_type=cell_type, + cluster_col=cluster_col, celltype_col=celltype_col, + sample_col=sample_col, my_pal=my_pal) + # Aggregate to pseudobulk and compute rlog-normalized counts if load == False: print("Aggregating the counts and metadata to the sample level...") counts_df = adata.to_df() counts_df[[celltype_col, sample_col]] = adata.obs[[celltype_col, sample_col]].values aggr_counts = counts_df.groupby([celltype_col, sample_col]).sum() - - cluster_counts = aggr_counts.loc[cell_type] cluster_metadata = proportion_df.loc[cluster_counts.index.values] cluster_metadata['stage'] = cluster_metadata[cluster_col].values - # remove unwanted samples - if not (remove_samples is None): + # Remove unwanted samples + if remove_samples: for sample in remove_samples: - if sample in cluster_metadata.index: - cluster_metadata = cluster_metadata.drop(index = sample) - if sample in cluster_counts.index: - cluster_counts = cluster_counts.drop(index = sample) + cluster_metadata = cluster_metadata.drop(index=sample, errors='ignore') + cluster_counts = cluster_counts.drop(index=sample, errors='ignore') cluster_metadata = cluster_metadata.loc[cluster_counts.index] cluster_counts = cluster_counts.loc[:, (cluster_counts != 0).any(axis=0)] - print("Use the median of ratios method for count normalization from DESeq2") - print("Use regularized log transform (rlog) of the normalized counts from DESeq2") + print("Computing rlog-normalized counts using PyDESeq2...") rld = compute_pseudobulk_PCA(cluster_counts, cluster_metadata) if rld is not None: - - if not os.path.exists(save_path): - os.makedirs(save_path) - rld.to_csv(save_path + "rld_PCA.csv") + save_path.mkdir(parents=True, exist_ok=True) + rld.to_csv(save_path / "rld_PCA.csv") else: - rld = pd.read_csv(save_path + "rld_PCA.csv", index_col = 0) + rld = pd.read_csv(save_path / "rld_PCA.csv", index_col=0) + # PCA plot QC deseq2_counts = rld.transpose() print("Plot the first two principal components... ") plotPCA_subgroups(proportion_df, deseq2_counts, cell_type, my_pal, cluster_col) + # Pairwise DE analysis print("Performing the DE analysis... ") j = 0 for groups in itertools.combinations(n_clusters, 2): data = None if load == False: + # Use adapted PyDESeq2 function1_adapted_to_function2 res = compute_pseudobulk_DE(cluster_counts, cluster_metadata, - group1 = groups[0], - group2 = groups[1], - cluster_col = cluster_col) + group1=groups[0], group2=groups[1], + cluster_col=cluster_col) + if res is not None: - with (robjects.default_converter + pandas2ri.converter).context(): - data = robjects.conversion.get_conversion().rpy2py(res[1]) + # Extract single DataFrame + data = list(res.values())[0] - data = get_sig_genes(data, 'gene', 'log2FoldChange', 'padj', cell_type, - groups[0], groups[1], fc_thr[j], fc_thr[j], log_pv_thr) + data = get_sig_genes(data, 'gene', 'log2FoldChange', 'padj', + groups[0], groups[1], fc_thr[j], fc_thr[j], log_pv_thr) - data.to_csv(save_path + "/" + str(groups[1]) + "vs" + str(groups[0]) + "_DE.csv") + data.to_csv(save_path / f"{str(groups[1])}_VS_{str(groups[0])}_DE.csv") else: - data = pd.read_csv(save_path + "/" + str(groups[1]) + "vs" + str(groups[0]) + "_DE.csv", index_col = 0) + data = pd.read_csv(save_path / f"{str(groups[1])}_VS_{str(groups[0])}_DE.csv", index_col=0) + # Plot results if data is not None: - print("Plot volcano plot for " + str(groups[1]) + " vs " + str(groups[0])) - volcano_plot_ps(data, 'gene', 'log2FoldChange', 'padj', cell_type, - groups[0], groups[1], fc_thr[j], fc_thr[j], log_pv_thr, figsize = figsize, - output_path = save_path + "/", - my_pal = my_pal, fontsize = fontsize) - - print("Plot GO analysis for " + str(groups[1]) + " vs " + str(groups[0])) - gene_annotation_cell_type_subgroup(data, cell_type = cell_type, group = groups[0], - sources = sources, num_gos = num_gos, - fig_h = fig_h, fig_w = fig_w, font_size = fontsize, - path_to_results = save_path + "/" + str(groups[1]) + "vs" + str(groups[0]) + "/GOs/", - my_pal = my_pal) - gene_annotation_cell_type_subgroup(data, cell_type = cell_type, group = groups[1], - sources = sources, num_gos = num_gos, - fig_h = fig_h, fig_w = fig_w, font_size = fontsize, - path_to_results = save_path + "/" + str(groups[1]) + "vs" + str(groups[0]) + "/GOs/", - my_pal = my_pal) - j += 1 \ No newline at end of file + print(f"Plot volcano plot for {str(groups[1])} vs {str(groups[0])}") + volcano_plot(data=data, + symbol_col='gene', + fc_col='log2FoldChange', + pval_col='padj', + cell_type=cell_type, + feature1=groups[0], + feature2=groups[1], + fc_thr=fc_thr[j], + pv_thr=log_pv_thr, + label_mode=label_mode, + n_p=n_p, + n_n=n_n, + my_pal=my_pal, + figsize=figsize, + font_size=fontsize, + output_path=save_path + ) + + # GO analysis for both groups + print(f"Plot GO analysis for {str(groups[1])} vs {str(groups[0])}") + Path(save_path / f"{str(groups[1])}vs{str(groups[0])}" / "GOs").mkdir(parents=True, exist_ok=True) + for group in [groups[0], groups[1]]: + print("About to run gene annotation...") + print(cell_type) + gene_annotation_cell_type_subgroup(data=data, + cell_type=cell_type, + group=group, + sources=sources, + num_gos=num_gos, + figsize=(fig_h,fig_w), + font_size=fontsize, + path_to_results=Path(save_path) / f"{str(groups[1])}vs{str(groups[0])}" / "GOs", + my_pal=my_pal) + j += 1 diff --git a/pilotpy/tools/Cell_gene_selection.py b/pilotpy/tools/Cell_gene_selection.py index 43943a0..ffde2b0 100644 --- a/pilotpy/tools/Cell_gene_selection.py +++ b/pilotpy/tools/Cell_gene_selection.py @@ -137,7 +137,7 @@ def calculate_sparsity(data): float: The sparsity of the input data as a decimal value between 0 and 1, where 0 indicates no sparsity (dense) and 1 indicates full sparsity (completely empty). """ non_zero = np.count_nonzero(data) - total_val = np.product(data.shape) + total_val = np.prod(data.shape) sparsity = (total_val - non_zero) / total_val return(sparsity) @@ -797,4 +797,4 @@ def save_data(dictionary, column_names,save_path,name,p_val,pro,gprofil=False): - + \ No newline at end of file diff --git a/pilotpy/tools/patients_sub_clustering.py b/pilotpy/tools/patients_sub_clustering.py index 648d3bc..cf03bad 100644 --- a/pilotpy/tools/patients_sub_clustering.py +++ b/pilotpy/tools/patients_sub_clustering.py @@ -1,268 +1,233 @@ -import pandas as pd import numpy as np -import seaborn as sns -from matplotlib import pyplot as plt -from scipy.stats import ttest_ind -from statsmodels.stats.multitest import multipletests -from adjustText import adjust_text -from matplotlib.lines import Line2D -from gprofiler import GProfiler +import pandas as pd +from pathlib import Path +from pydeseq2.dds import DeseqDataSet +from pydeseq2.ds import DeseqStats + from .Trajectory import * from ..plot.ploting import * - -def extract_cells_from_gene_expression_for_clustering(adata,sample_col,col_cell,cell_list,path_results=None,normalization=True,n_top_genes=2000,highly_variable_genes_=False): - - - """ - Extract and save gene expression data for specific cells for clustering analysis. - Parameters: - adata : AnnData object - An Annotated Data (AnnData) object containing gene expression data. - sample_col : str - The column name in the adata.obs DataFrame containing sample IDs. - col_cell : str - The column name in the adata.obs DataFrame containing cell type labels. - cell_list : list - A list of cell types for which gene expression data will be extracted and saved. - path_results : str or None, optional (default=None) - The path to the directory where the extracted data will be saved as CSV files. - If None, the default path 'Results_PILOT/cells/' will be used. - normalization : bool, optional (default=True) - Whether to normalize the gene expression data by total count and apply log1p transformation. - n_top_genes : int, optional (default=2000) - The number of top highly variable genes to select. Only applicable if highly_variable_genes_ is True. - highly_variable_genes_ : bool, optional (default=False) - Whether to select highly variable genes for analysis. +def check_raw(data=None): + ''' + Checks if input array contains raw gene counts or normalised counts. - Returns: - Gene exp. dataframe - """ - for cell in cell_list: - adata_new = adata[adata.obs[col_cell].isin([cell]),:] - if normalization: - sc.pp.normalize_total(adata_new, target_sum=1e4) - sc.pp.log1p(adata_new) - - if highly_variable_genes_: - - sc.pp.highly_variable_genes(adata_new, n_top_genes=n_top_genes) - # Access the list of highly variable genes - highly_variable_genes = adata_new.var['highly_variable'] - df=adata_new[:,highly_variable_genes].X - df=pd.DataFrame(df.toarray()) - highly_variable_gene_names = adata_new.var_names[np.array(adata_new.var['highly_variable'])] - df.columns=list(highly_variable_gene_names) - else: - - df=adata_new[:,adata_new.var_names].X - df=pd.DataFrame(df.toarray()) - df.columns=adata_new.var_names - - - df['sampleID']=list(adata_new.obs[sample_col]) - - if path_results==None: - if not os.path.exists('Results_PILOT/cells/'): - os.makedirs('Results_PILOT/cells/') - path_results='Results_PILOT/cells/' - else: - path_results=path_results + Parameters + ---------- + data : np.ndarray or None, optional + Array containing gene counts, with shape n_cells x n_genes. - - df.to_csv(path_results+cell+'.csv') - return df -def compute_diff_expressions(adata,cell_type: str = None, + + Returns + ------- + is_int : bool + If True, all values in array are integers. + is_non_negative : bool + If True, no negative values were found in the array. + ''' + is_int = np.all(np.equal(np.floor(data), data)) + is_non_negative = np.all(data >= 0) + return is_int, is_non_negative + + +def filter_genes(raw_counts, gene_names, threshold): + ''' + Filters genes based on their expression frequency across cells. + + Removes genes that are not detected in a minimum fraction of the total + cell population. This helps reduce noise and improves the power of + downstream differential expression analysis. + + Parameters + ---------- + raw_counts : np.ndarray + A 2D array of gene expression counts with shape n_cells x n_genes. + gene_names : pd.Index or np.ndarray + An array or Index containing the names of the genes corresponding + to the columns of raw_counts. + threshold : float + The minimum fraction of cells (between 0 and 1) in which a gene + must be detected (count > 0) to be retained. + + Returns + ------- + raw_counts : np.ndarray + The filtered count matrix containing only the genes that passed + the threshold. + gene_names : pd.Index or np.ndarray + The names of the retained genes, matching the new column + dimensions of raw_counts. + ''' + min_cells = int(threshold * raw_counts.shape[0]) + detected = np.count_nonzero(raw_counts > 0, axis=0) + keep_genes = detected >= min_cells + + print("Total genes found: ", gene_names.shape) + print("Total cell count: ", raw_counts.shape[0]) + print(f"Genes must be found in at least {min_cells} cells.") # Should be 10 + print(f"detected min/max/mean: {detected.min()}/{detected.max()}/{detected.mean():.1f}") + print(f"keep_genes sum: {keep_genes.sum()}/{len(keep_genes)}") + + raw_counts = raw_counts[:, keep_genes] + gene_names = gene_names[keep_genes] + return raw_counts, gene_names + + +def compute_diff_expressions(adata, + cell_type: str = None, proportions: pd.DataFrame = None, - selected_genes: list = None, + selected_genes: list = None, font_size:int=18, group1: str = 'Tumor 1', group2: str = 'Tumor 2', - label_name: str = 'Predicted_Labels', - fc_thr: float = 0.5, - pval_thr: float = 0.01, - sample_col:str='sampleID', - col_cell:str ='cell_types', - path=None, - normalization=False, - n_top_genes=2000, - highly_variable_genes_=True, - number_n=5, - number_p=5, - marker='o', - color='w', - markersize=8, - font_weight_legend='normal', + fc_thr=0.5, + pval_thr=0.05, + exp_thr=0.1, + sample_col='sampleID', + col_cell='cell_types', + shrinkage=False, + number_n=5, + number_p=5, + marker='o', + color='w', + markersize=8, + font_weight_legend='normal', size_legend=12, - figsize=(15,15),dpi=100 - ): - - """ - Using limma R package, lmFit fits a linear model using weighted least squares for each gene. - Comparisons between groups (log fold-changes) are obtained as contrasts of these fitted linear models. - Empirical Bayes smoothing of standard errors (shrinks standard errors - that are much larger or smaller than those from other genes towards the average standard error). + figsize=(15,15), + dpi=100, + **kwargs): + ''' + Computes pseudobulk differential expression analysis for a specific cell type + using PyDESeq2 and generates a volcano plot. Parameters ---------- adata : AnnData - Annotated data matrix. + Annotated data object containing single-cell gene expression and metadata. cell_type : str, optional - Specify cell type name to check its differential expression genes. The default is None. + The specific cell type (found in col_cell) to perform DE analysis on. proportions : pd.DataFrame, optional - Cell types proportions in each sample. The default is None. + Metadata table containing sample IDs and condition labels (Predicted_Labels). selected_genes : list, optional - Specify gene names to be considered for checking their differentiation. + Subset of gene names to restrict the analysis to. If None, all filtered genes are used. font_size : int, optional - Font size for plot labels and legends. The default is 18. + Base font size for the plot text and labels. Default is 18. group1 : str, optional - Name of the first patient sub-group for comparison. The default is 'Tumor 1'. + The name of the first experimental group (e.g., 'Tumor 1'). group2 : str, optional - Name of the second patient sub-group for comparison. The default is 'Tumor 2'. - label_name : str, optional - Name of the column containing the labels of patient sub-groups. The default is 'Predicted_Labels'. + The name of the second experimental group (e.g., 'Tumor 2'). fc_thr : float, optional - Specify the fold change threshold. The default is 0.5. + Log2 Fold Change threshold for significance in the volcano plot. Default is 0.5. pval_thr : float, optional - Specify the adjusted p-value threshold. The default is 0.01. + Adjusted p-value threshold for significance. Default is 0.05. + exp_thr : float, optional + Expression threshold; filters genes expressed in fewer than this fraction of cells. Default is 0.1. sample_col : str, optional - Name of the column containing sample IDs. The default is 'sampleID'. + The column in adata.obs containing sample identifiers. Default is 'sampleID'. col_cell : str, optional - Name of the column containing cell type annotations. The default is 'cell_types'. - path : str, optional - Path to save the results. The default is None. - normalization : bool, optional - Perform gene expression normalization. The default is False. - n_top_genes : int, optional - Number of top variable genes to consider. The default is 2000. - highly_variable_genes_ : bool, optional - Determine highly variable genes. The default is True. + The column in adata.obs containing cell type annotations. Default is 'cell_types'. + shrinkage : bool, optional + If True, applies apeGLM log2 fold change shrinkage. Default is False. number_n : int, optional - The number of labels that the user wants to show over the plot for negative thresholds. The default is 5. + Number of top down-regulated genes to label in the volcano plot. Default is 5. number_p : int, optional - The number of labels that the user wants to show over the plot for positive thresholds. The default is 5. + Number of top up-regulated genes to label in the volcano plot. Default is 5. marker : str, optional - Marker style for the labels in the volcano plot. The default is 'o'. + Marker style for the scatter plot points. Default is 'o'. color : str, optional - Marker color for the labels in the volcano plot. The default is 'w'. + Edge color for the markers in the plot. Default is 'w'. markersize : int, optional - Marker size for the labels in the volcano plot. The default is 8. + Size of the markers used in the legend. Default is 8. font_weight_legend : str, optional - Font weight for legend labels. The default is 'normal'. + Font weight for the legend text. Default is 'normal'. size_legend : int, optional - Font size for legend labels. The default is 12. - figsize: tuple, optional - Figure size. The default is (15,15). + Font size for the legend text. Default is 12. + figsize : tuple, optional + Width and height of the resulting figure in inches. Default is (15, 15). dpi : int, optional - Dots per inch for the saved plot image. Default is 100. + Resolution of the saved figure in dots per inch. Default is 100. + **kwargs : dict + Additional keyword arguments passed to internal functions. Returns ------- None - - Generates and displays a volcano plot of fold changes between two interested patient sub-groups. - Saves a statistical table of each gene. - Saves significantly differentiated genes in each group. - """ - - path_to_result='Results_PILOT' - - if os.path.exists(path_to_result + "/cells/" + cell_type + ".csv"): - - cells = pd.read_csv(path_to_result + "/cells/" + cell_type + ".csv", index_col = 0) - - elif cell_type not in adata.uns: - cells=extract_cells_from_gene_expression_for_clustering(adata,sample_col=sample_col,col_cell=col_cell,cell_list=[cell_type],path_results=path,normalization=normalization,n_top_genes=n_top_genes,highly_variable_genes_=highly_variable_genes_) - #cells = pd.read_csv(path_to_result + "/cells/" + cell_type + ".csv", index_col = 0) - - elif cell_type in adata.uns : - cells=adata.uns[cell_type] - - - - import rpy2.robjects as robjects - import rpy2.robjects.numpy2ri - from rpy2.robjects import pandas2ri - from rpy2.robjects.packages import importr - pandas2ri.activate() - - # prepare data for R - proportions.index = proportions['sampIeD'] - - if selected_genes is None: - selected_genes = cells.iloc[:,1:-1].columns - data = cells[selected_genes] - pred_labels = pd.DataFrame() - pls = proportions.loc[cells['sampleID']] - pred_labels['Predicted_Labels'] = pls[label_name] - pred_labels['sampleID'] = pls['sampIeD'] - - # load R packages and data - R=robjects.r - R('library(limma)') - R.assign('data',data) - R.assign('pred_labels', pred_labels) - R.assign('selected_groups', [group1, group2]) - R('selected_pred_labels <- pred_labels[which(pred_labels$Predicted_Labels %in% selected_groups),]') - R('subresult <- data[row.names(selected_pred_labels),]') - - # delete for memory - del data - del pred_labels - - # run limma - print('run limma lmFit') - R('fit <- limma::lmFit(t(subresult), design = unclass(as.factor(selected_pred_labels$Predicted_Labels)))') - print('run limma eBayes') - R('fit <- limma::eBayes(fit)') - R('res <- limma::topTable(fit, n = 2000)') - R('res <- res[colnames(data), ]') - - # get results - res = R('''res''') - - if not os.path.exists(path_to_result+'/Diff_Expressions_Results/'+cell_type): - os.makedirs(path_to_result+'/Diff_Expressions_Results/'+cell_type) - path_to_result=path_to_result+'/Diff_Expressions_Results/'+cell_type+'/' - res.to_csv(path_to_result + "/diff_expressions_stats_" + cell_type + ".csv") - - pv_thr = -np.log10(pval_thr) - volcano_plot(cells[selected_genes].transpose(), res['logFC'], res['adj.P.Val'], - cell_type, group1, group2, fc_thr, pv_thr, - figsize = figsize, output_path = path_to_result,n_p=number_p,n_n=number_n,font_size=font_size, marker=marker, - color=color, - markersize=markersize, - font_weight_legend=font_weight_legend, - size_legend=size_legend,dpi=dpi) - - - - -def install_r_packages(): - """ - Install R packages using rpy2. - - This function installs the "limma" R package using the "BiocManager" package manager. - - Parameters: - None - - Returns: - None - """ - # Install R packages using rpy2 - import rpy2.robjects as robjects - - robjects.r(''' - if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") - ''') - - robjects.r(''' - BiocManager::install("limma") - ''') - - - + The function saves a CSV of differential expression statistics and a + PDF volcano plot to the 'Results_PILOT' directory. + ''' + + result_path = Path('Results_PILOT') + + + # Get gene counts for cell type + cell_mask = adata.obs[col_cell] == cell_type + raw_counts = adata.raw[cell_mask].X if adata.raw is not None else adata[cell_mask].X + gene_names = adata.raw.var_names if adata.raw is not None else adata.var_names + sample_ids = adata.obs.loc[cell_mask, sample_col] + + # Check for raw counts, filter for genes expressed in at least 10% of cells + count_matrix = raw_counts.toarray() if hasattr(raw_counts, 'toarray') else raw_counts + is_int, is_non_negative = check_raw(count_matrix) + if not is_int or not is_non_negative: + raise TypeError("Gene counts seem to be normalised, requires raw gene counts instead.") + + + count_matrix, gene_names = filter_genes(count_matrix, gene_names, threshold=exp_thr) + + pseudobulk = (pd.DataFrame(count_matrix.T, index=gene_names, columns=sample_ids) + .groupby(level=0, axis=1).sum().T) + + # Free up RAM + del count_matrix, raw_counts + + relevant_samples = proportions[proportions['Predicted_Labels'].isin([group1, group2])]['sampleID'] + pseudobulk = pseudobulk.loc[pseudobulk.index.intersection(relevant_samples)].fillna(0).astype(int) + metadata = proportions.loc[pseudobulk.index, 'Predicted_Labels'].to_frame() + + # Filter for selected genes + if selected_genes is not None: + selected_genes = list(set(selected_genes) & set(pseudobulk.columns)) + pseudobulk = pseudobulk[selected_genes] + + # Perform dispersion and log fold-change (LFC) estimation + dds = DeseqDataSet(counts=pseudobulk, metadata=metadata, design_factors='Predicted_Labels') + dds.deseq2() + + # Statistical Analysis: p-value computation using Wald test + stat = DeseqStats(dds, contrast=['Predicted_Labels', group2, group1]) # Normal vs Tumor 1 + stat.summary() + # LFC shrinkage with apeGLM + if shrinkage: + stat.lfc_shrink(coeff=f'Predicted_Labels[T.{group2}]') + + # Save to csv + result_path = result_path / 'Diff_Expressions_Results' / cell_type + result_path.mkdir(parents=True, exist_ok=True) + stat.results_df.to_csv(result_path / "Diff_expressions_stats.csv") + + pv_thr = -np.log10(pval_thr) + + # Volcano Plot for viualisation + volcano_plot( + scores=pseudobulk.T, # Series/array mode: index = gene symbols + foldchanges=stat.results_df['log2FoldChange'], + p_values=stat.results_df['padj'], + cell_type=cell_type, + feature1=group1, + feature2=group2, + fc_thr=fc_thr, + pv_thr=pv_thr, + label_mode="topN", + n_p=number_p, + n_n=number_n, + figsize=figsize, + output_path=result_path, + font_size=font_size, + marker=marker, + marker_edge_color=color, + markersize_legend=markersize, + font_weight_legend=font_weight_legend, + size_legend=size_legend, + dpi=dpi + ) \ No newline at end of file diff --git a/setup.py b/setup.py index ddbb72e..2f44dcb 100644 --- a/setup.py +++ b/setup.py @@ -2,34 +2,35 @@ setup( name='pilotpy', - version='2.0.6', + version='2.0.7', author='Mehdi Joodaki', author_email='judakimehdi@gmail.com', url='https://github.com/CostaLab/PILOT', - python_requires='>=3.11.5,<3.12', + python_requires='>=3.14.3', install_requires=[ - "cycler>=0.11.0,<0.12.0", - "joypy>=0.2.6,<0.3.0", - "leidenalg>=0.10.1,<0.11.0", - "numpy>=1.24.4,<1.25.0", - "matplotlib==3.8.0", - "pandas>=2.0.3,<2.1.0", - "plotly>=5.22.0,<5.23.0", - "plotnine>=0.12.3,<0.13.0", - "pot>=0.9.1,<0.10.0", - "pydiffmap>=0.2.0.1,<0.3.0", - "scanpy>=1.9.5,<1.10.0", - "scikit-learn>=1.3.0,<1.4.0", - "scikit-network>=0.31.0,<0.32.0", - "scipy>=1.11.2,<1.12.0", - "seaborn>=0.12.2,<0.13.0", - "shap>=0.42.1,<0.43.0", - "statsmodels>=0.14.0,<0.15.0", - "elpigraph-python>=0.3.1,<0.4.0", - "adjusttext>=0.8,<0.9", - "gprofiler-official>=1.0.0,<1.1.0", - "gseapy>=1.1.7", - "anndata>=0.11.4,<0.12.0", + "cycler", + "joypy", + "leidenalg", + "numpy", + "matplotlib", + "pandas", + "plotly", + "plotnine", + "pot", + "pydeseq2", + "pydiffmap", + "scanpy", + "scikit-learn", + "scikit-network", + "scipy", + "seaborn", + "shap", + "statsmodels", + "elpigraph-python", + "adjusttext", + "gprofiler-official", + "gseapy", + "anndata", ], packages=find_packages() -) +) \ No newline at end of file