import warnings
warnings.filterwarnings("ignore")


import scanpy as sc
import bbknn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import os
import gc
import diffxpy.api as dxpy
import plotly.express as px
import re


# Execute the shared helper script in this namespace. Helpers called later in this
# notebook (dependentFeatures, marker_score, rename_clusters, pseudobulk_*) are not
# defined here, so presumably they come from this script -- verify.
%run ../Scripts/pythonScripts.py


# Load the AnnData object from disk.
adata = sc.read_h5ad('../Data/scrna_data/rawDataScanpy.h5ad')


# Dimensions of the data matrix as (n_obs, n_vars).
adata.shape

(62751, 33694)


# Optional shortcut: uncomment to continue with a random subset of 5000 cells.
#sc.pp.subsample(adata, n_obs=5000, random_state=12345, copy=False)


# Compute quality-control metrics and store them on adata itself (inplace=True).
sc.pp.calculate_qc_metrics(adata, inplace=True)


# Show the object summary, including the annotation columns now present.
adata

AnnData object with n_obs × n_vars = 62751 × 33694
    obs: 'batch', 'super_batch', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'n_counts'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'


# First rows of the per-cell annotation table.
adata.obs.head()


adata.obs['batch'] #sample label - the data contains 15 separate samples

index
AAACCTGAGCCGGTAA-1-0     Sohni1_und
AAACCTGAGCGATTCT-1-0     Sohni1_und
AAACCTGAGCGTTTAC-1-0     Sohni1_und
AAACCTGAGGACAGAA-1-0     Sohni1_und
AAACCTGAGTCATGCT-1-0     Sohni1_und
                            ...    
TTTGTCACACAGACTT-1-14      Her8_Spc
TTTGTCACAGAGTGTG-1-14      Her8_Spc
TTTGTCAGTTCGGCAC-1-14      Her8_Spc
TTTGTCATCAAACCAC-1-14      Her8_Spc
TTTGTCATCTTCAACT-1-14      Her8_Spc
Name: batch, Length: 62751, dtype: category
Categories (15, object): ['Sohni1_und', 'Sohni2_und', 'Sohni1_I', 'Sohni2_I', ..., 'Her5', 'Her6', 'Her7_Spt', 'Her8_Spc']


# First rows of the per-gene annotation table.
adata.var.head()


adata.var['n_cells_by_counts'] #nr of cells showing transcripts of a gene

index
RP11-34P13.3     113
FAM138A            0
OR4F5              1
RP11-34P13.7     635
RP11-34P13.8      12
                ... 
AC233755.2        13
AC233755.1         3
AC240274.1      9434
AC213203.1        15
FAM231B            0
Name: n_cells_by_counts, Length: 33694, dtype: int64


# Boolean mask over genes: True for mitochondrially-encoded genes. These gene
# symbols carry the 'MT-' prefix, so test the prefix only; a substring test would
# also flag any symbol that merely contains 'MT-' somewhere in its name.
MT = [name.startswith('MT-') for name in adata.var_names]
# Fraction of each cell's transcripts that belongs to the masked genes.
# `.A1` flattens the matrix returned by the row sums into a 1-D array.
perc_mito = np.sum( adata[:,MT].X, 1 ).A1 / np.sum( adata.X, 1 ).A1
adata.obs['perc_mito'] = perc_mito.copy()


# Transcripts vs detected genes for all cells, coloured by mitochondrial fraction.
sc.pl.scatter(
    adata,
    x='total_counts',
    y='n_genes_by_counts',
    color='perc_mito',
    title='Transcript vs detected genes coloured by mitochondrial content',
)


# Same plot, zoomed on cells with fewer than 3000 detected genes.
sc.pl.scatter(
    adata[adata.obs['n_genes_by_counts']<3000],
    x='total_counts',
    y='n_genes_by_counts',
    color='perc_mito',
    title='Transcript vs detected genes coloured by mitochondrial content\nfor <3000 genes',
)


# Histogram of transcripts per cell, zoomed on cells below 20000 transcripts.
fig = px.histogram(
    adata[adata.obs['total_counts']<20000].obs,
    x='total_counts',
    nbins=100,
    title='distribution of total transcripts per cell for <20000 transcripts',
)
fig.show()


# One histogram per QC metric over all cells, shown in the listed order.
for qc_column, qc_title in (
    ('total_counts', 'distribution of total transcripts per cell'),
    ('n_genes_by_counts', 'distribution of detected genes per cell'),
    ('perc_mito', 'distribution of mitochondrial content per cell'),
):
    fig = px.histogram(adata.obs, x=qc_column, nbins=100, title=qc_title)
    fig.show()


# Draw the 20 top genes on a pre-made axis so that the plot can carry a title.
fig, ax = plt.subplots(1,1)
ax.set_title('Top genes in terms of percentage of transcripts explained in each cell')
# NOTE: `fig` is rebound here to whatever highest_expr_genes returns.
fig = sc.pl.highest_expr_genes(adata, n_top=20, ax=ax)


# Per-cell share of transcripts coming from the MALAT1 gene.
malat1_per_cell = np.sum(adata[:, 'MALAT1'].X, 1).A1
transcripts_per_cell = np.sum(adata.X, 1).A1
perc_malat = malat1_per_cell / transcripts_per_cell
adata.obs['perc_MALAT1'] = perc_malat.copy()


fig = px.histogram(
    adata.obs,
    x='perc_MALAT1',
    nbins=100,
    title='Distribution of the amount of MALAT1 transcripts in each cell',
)
fig.show()


# MALAT1 share against the total number of transcripts of each cell.
px.scatter(
    data_frame=adata.obs,
    x='total_counts',
    y='perc_MALAT1',
    title='Relationship between amount of MALAT1 gene and transcripts per cell',
)


# Cell-level thresholds, applied one criterion per call in this order.
for cell_threshold in ({'max_genes': 8000}, {'min_genes': 800}, {'max_counts': 40000}):
    sc.pp.filter_cells(adata, **cell_threshold)


# Keep cells below both the mitochondrial and the MALAT1 fraction cut-offs.
low_mito = adata.obs['perc_mito'] < 0.2
low_malat1 = adata.obs['perc_MALAT1'] < 0.1
adata = adata[low_mito & low_malat1].copy()


# Drop genes detected in fewer than 10 cells.
sc.pp.filter_genes(adata, min_cells=10)


cells_left, genes_left = adata.shape
print('There are now', cells_left, 'cells and', genes_left, 'genes after filtering')

There are now 49243 cells and 29830 genes after filtering


# Doublet detection with scrublet; the filtering cell below reads the
# 'doublet_score' column of adata.obs, which is presumably added by this call.
sc.external.pp.scrublet(adata, #dataset
                        expected_doublet_rate=0.06, #the chosen doublet rate as an estimate to start from
                        random_state=12345)

Automatically set threshold at doublet score = 0.31
Detected doublet rate = 1.8%
Estimated detectable doublet fraction = 58.7%
Overall doublet rate:
	Expected   = 6.0%
	Estimated  = 3.1%


fig = px.histogram(
    adata.obs,
    x='doublet_score',
    title='Distribution of doublet scores per cell',
)
fig.show()


# Retain only cells whose doublet score is below 0.1.
low_doublet_score = adata.obs['doublet_score'] < .1
adata = adata[low_doublet_score].copy()


# PCA on the filtered data, then inspect the technical covariates total_counts
# and perc_mito for one sample ('Guo1') with the external helper dependentFeatures.
sc.preprocessing.pca(adata, svd_solver='arpack', random_state=12345)
dependentFeatures(adata=adata[adata.obs['batch']=='Guo1'], obs_subset=['total_counts','perc_mito'])


# Available sample names (values accepted for the 'batch' selection below).
print( list(adata.obs['batch'].cat.categories) )


# Available per-cell annotation columns (candidates for obs_subset below).
print( list( adata.obs.columns ) )


# Exercise template: the empty string is a placeholder to be replaced with one of
# the sample names printed above. As written, the comparison with '' selects no
# cells unless a batch is literally named ''.
sc.preprocessing.pca(adata, svd_solver='arpack', random_state=12345)
dependentFeatures(adata=adata[adata.obs['batch'] == ''], #sample name
                  obs_subset=['total_counts','perc_mito']) #technical features


# Per-gene variance across cells (computed on the densified matrix), on a log1p scale.
per_gene_var = np.var(adata.X.todense(), axis=0)
variance = np.ravel(np.log1p(per_gene_var))
px.histogram(variance, title='Log-variance of each gene')


px.histogram(
    variance,
    range_x=(0,1),
    nbins=1000,
    title='Log-variance of each gene (zoom up to 1)',
)


n_above_threshold = int(np.count_nonzero(variance > 0.05))
print(f'{n_above_threshold} genes have a variance above 0.05')

14959 genes have a variance above 0.05


# Keep a copy of the filtered count matrix before adata.X is transformed.
adata.layers['raw_counts'] = adata.X.copy()
# Library-size normalization. No target sum is given, so each cell is rescaled to
# the median total count over cells (this is not TPM). normalize_total replaces the
# deprecated normalize_per_cell and matches the differential-expression cells
# further down, which already call normalize_total.
sc.pp.normalize_total(adata)
# matrix logarithmization
sc.pp.log1p(adata)
# flag the 15000 most variable genes
sc.pp.highly_variable_genes(adata, n_top_genes=15000)
# standardize each gene
sc.pp.scale(adata)
# Keep the scaled matrix too; later cells restore adata.X from this layer.
adata.layers['scaled_counts'] = adata.X.copy()


# PCA restricted to the highly variable genes, followed by the same technical
# covariate check on sample 'Guo1' as before normalization.
sc.pp.pca(adata, svd_solver='arpack', random_state=123, use_highly_variable=True)
guo1_cells = adata[adata.obs['batch']=='Guo1']
dependentFeatures(adata=guo1_cells, obs_subset=['total_counts','perc_mito'])


# Variance ratio of the principal components.
sc.pl.pca_variance_ratio(adata)


plt.rcParams['figure.figsize'] = (6,6) #reduce figure size
sc.pl.pca(adata, color=['batch','total_counts','SYCP1'])


# Neighbourhood graph via bbknn on the first 15 principal components.
bbknn.bbknn(adata, n_pcs=15)


# Three-dimensional UMAP embedding.
sc.tl.umap(adata, random_state=123, n_components=3)


# The three pairwise 2-D views of the embedding, coloured by TNP2.
sc.pl.umap(adata, color=['TNP2'], components=['1,2','1,3','2,3'])


# Interactive 3-D scatter coloured by the transcripts per cell.
X = adata.obsm['X_umap']
umap_xyz = dict(x=X[:,0], y=X[:,1], z=X[:,2])
fig = px.scatter_3d(adata.obsm, color=adata.obs['total_counts'], opacity=.5, **umap_xyz)
fig.show()


# Interactive 3-D scatter coloured by TNP2.
X = adata.obsm['X_umap']
umap_xyz = dict(x=X[:,0], y=X[:,1], z=X[:,2])
fig = px.scatter_3d(adata.obsm, color=np.ravel(adata[:,'TNP2'].X), **umap_xyz)
fig.show()


# Static 3-D projection of the same embedding.
sc.pl.umap(adata, color=['TNP2'], projection='3d', components=['1,2,3'] )


# Exercise parameters: fill in your name and pick the tSNE settings to try.
YOUR_NAME = 'WRITE YOUR NAME HERE'
PERPLEXITY = 5
DISTANCE_METRIC = 'euclidean'


# Random subset of 5000 cells, returned as a new object (copy=True).
adata_sub = sc.pp.subsample(adata, random_state=54321, n_obs=5000, copy=True)


sc.tools.tsne(adata_sub, random_state=54321, perplexity=PERPLEXITY, metric=DISTANCE_METRIC)


# Plot title fixed to read 'Metric' (was 'Metrix'), matching the saved plot below.
sc.plotting.tsne(adata_sub, color=['batch'], legend_loc=None, frameon=False,
                 title=f'{YOUR_NAME}\nPerplexity={PERPLEXITY} Metric={DISTANCE_METRIC}')


# Same plot, not shown but written to file; the file name encodes the settings.
sc.plotting.tsne(adata_sub, color=['batch'], legend_loc=None, frameon=False,
                 title=f'{YOUR_NAME}\nPerplexity={PERPLEXITY} Metric={DISTANCE_METRIC}', show=False,
                 save=f'{YOUR_NAME}Perplexity={PERPLEXITY}Metric={DISTANCE_METRIC}.png')


# Marker genes per cell type, grouped by stage; insertion order is kept.
markers = {
    ### SPERMATOCYTOGENESIS
    'SpermatogoniaA': ['ID4','HMGA1'],
    'SpermatogoniaB': ['MKI67','DMRT1','STRA8'],
    'SpermatocytesI': ['MEIOB','PRSS50','SYCP1','TEX101'],
    'SpermatocytesII': ['PIWIL1','ACRV1','SPATA16','CLGN'],
    ### SPERMIOGENESIS
    'Round.Spt': ['SPATA9','SPAM1'],  # round spermatids
    'Elong.Spt': ['PRM1','PRM2'],  # elongated spermatids
    ### SOMATIC CELLS
    'Sertoli': ['CTSL', 'VIM'],
    'Macroph': ['CD163','TYROBP'],
    'Leydig': ['CFD'],
    'Endothelial': ['CD34'],
    'Myoid': ['ACTA2'],
    'Smooth_Muscle': ['RGS5'],
}


# marker_score is an external helper; its first return value is used below as
# the list of things to colour the UMAP by.
markers_scores, adata = marker_score(markers, adata)


sc.pl.umap(adata, color=markers_scores, components=['1,2'], ncols=2, vmax=5, s=30, cmap='Blues')


# Leiden clustering at several resolutions; each result goes to its own obs column.
leiden_resolutions = {
    'leiden_R1': 1,
    'leiden_R.5': 0.5,
    'leiden_R.25': 0.25,
    'leiden_R.1': 0.1,
}
for obs_key, resolution in leiden_resolutions.items():
    sc.tl.leiden(adata, resolution=resolution, random_state=12345, key_added=obs_key)


sc.pl.umap(adata, color=['leiden_R1','leiden_R.5','leiden_R.25', 'leiden_R.1'], legend_loc='on data', ncols=2)


#Perform the test on logarithmized data
# Start from a COPY of the stored counts. normalize_total and log1p work on
# adata.X in place, so assigning the layer object itself would make X and
# layers['raw_counts'] the same matrix and overwrite the stored raw counts with
# normalized, log-transformed values.
adata.X = adata.layers['raw_counts'].copy() #raw data
sc.pp.normalize_total(adata) #library-size normalization
sc.pp.log1p(adata) #logarithm
sc.tl.rank_genes_groups(adata, groupby='leiden_R.25', n_genes=50) #Top 50 diff.expressed genes in each cluster
adata.X = adata.layers['scaled_counts'] #Set again the scaled data as standard data matrix

WARNING: adata.X seems to be already log-transformed.
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var'


result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
X = pd.DataFrame(
    {group + '_' + key[:1].upper(): result[key][group]
    for group in groups for key in ['names', 'pvals_adj','logfoldchanges']})
X.head() #print only first five lines


X[ ['5_N'] ]


!mkdir -p results_scrna


X.to_csv('./results_scrna/DiffExpression_NoAnnotation.csv', sep=',', index=None)


# Cell-type label for each leiden_R.25 cluster, listed by cluster number (0..12).
cluster_labels = [
    'SpermatogoniaA.1',    # 0
    'SpermatogoniaA.2',    # 1
    'Peritubular_Myoid',   # 2
    'Endothelial',         # 3
    'Leydig',              # 4
    'Elong_Spermatids',    # 5
    'SpermatocitesII',     # 6
    'SpermatogoniaB',      # 7
    'Round_Spermatids.1',  # 8
    'SpermatocitesI',      # 9
    'Round_Spermatids.2',  # 10
    'Macroph',             # 11
    'Smooth_Muscle',       # 12
]
# Keys are the cluster numbers as strings, in ascending order.
new_names = {str(cluster_id): label for cluster_id, label in enumerate(cluster_labels)}


# rename_clusters is an external helper applied to the leiden_R.25 assignment.
adata.obs['spermatogenesis_types'] = rename_clusters(new_names, adata.obs['leiden_R.25'])


# Large UMAP with the cell-type labels drawn on the data.
plt.rcParams['figure.figsize'] = (12,12)
label_style = dict(
    legend_loc='on data',
    legend_fontsize=16,
    frameon=False,
    size=60,
    add_outline=True,
    ncols=1,
)
sc.pl.umap(adata, color=['spermatogenesis_types'], **label_style)


# Names of the annotated cell types.
print( adata.obs['spermatogenesis_types'].cat.categories.tolist() )


# Exercise placeholder: set this to one of the cell-type names printed above.
# With the empty string, the export at the end of this block asks for a column
# named '_N', which the table does not contain.
CHOSEN_CLUSTER = ''


#Perform the test on logarithmized data
# Use a COPY of the stored counts: normalize_total and log1p modify adata.X in
# place, so assigning the layer object itself would overwrite
# layers['raw_counts'] with normalized, log-transformed values.
adata.X = adata.layers['raw_counts'].copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.tl.rank_genes_groups(adata, groupby='spermatogenesis_types', n_genes=50)
#Use again the scaled data as standard
adata.X = adata.layers['scaled_counts']


# One table with three columns per cell type:
# <type>_N (names), <type>_P (pvals_adj) and <type>_L (logfoldchanges).
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
X = pd.DataFrame(
    {group + '_' + key[:1].upper(): result[key][group]
    for group in groups for key in ['names', 'pvals_adj','logfoldchanges']})
X.head()
#export only the ranked gene names of the chosen cell type, without header or index
X.to_csv(f'./results_scrna/{CHOSEN_CLUSTER}.txt', columns=[f'{CHOSEN_CLUSTER}_N'], header=None, index=None)


# Work on a copy that excludes the listed somatic cell types.
somatic_types = ['Endothelial','Macroph','Peritubular_Myoid','Smooth_Muscle','Leydig']
not_somatic = ~adata.obs['spermatogenesis_types'].isin(somatic_types)
subdata = adata[not_somatic].copy()


# Root cell for the pseudotime: the second cell (position 1) labelled 'SpermatogoniaA'.
spermatogonia_a_positions = np.flatnonzero(subdata.obs['spermatogenesis_types'] == 'SpermatogoniaA')
subdata.uns['iroot'] = spermatogonia_a_positions[1]


# Diffusion pseudotime using 2 diffusion components.
sc.tl.dpt(subdata, n_dcs=2)

WARNING: Trying to run `tl.dpt` without prior call of `tl.diffmap`. Falling back to `tl.diffmap` with default parameters.


# 2-D UMAP of the subset coloured by pseudotime.
plt.rcParams['figure.figsize'] = (8,8)
pseudotime_style = dict(
    legend_loc='right margin',
    legend_fontsize=16,
    frameon=False,
    size=60,
    add_outline=True,
    ncols=1,
    cmap='Blues',
)
sc.pl.umap(subdata, color=['dpt_pseudotime'], **pseudotime_style)


# Interactive 3-D view; hovering shows the cell type of each point.
plt.rcParams['figure.figsize'] = (8,8)
X = subdata.obsm['X_umap']
fig = px.scatter_3d(
    subdata.obsm,
    x=X[:,0],
    y=X[:,1],
    z=X[:,2],
    color=subdata.obs['dpt_pseudotime'],
    hover_name=subdata.obs['spermatogenesis_types'],
)
fig.show()


# Pseudotime distribution per cell type, in the listed order.
differentiation_order = ['SpermatogoniaA','SpermatogoniaB','SpermatocitesI','SpermatocitesII','Round_Spermatids','Elong_Spermatids']
sc.pl.violin(
    subdata,
    keys='dpt_pseudotime',
    groupby='spermatogenesis_types',
    rotation=90,
    order=differentiation_order,
)


# Give every cell of the full dataset a pseudotime column. -1.0 marks cells that
# were not part of the trajectory subset; the column is float from the start so
# the real pseudotimes are stored without a dtype change.
adata.obs['dpt_pseudotime'] = -1.0
# One .loc call (row labels + column name) writes into adata.obs itself. The
# chained form obs[col][rows] = ... may write to a temporary object instead.
adata.obs.loc[subdata.obs_names, 'dpt_pseudotime'] = subdata.obs['dpt_pseudotime']


#remove subdata as it is no longer used
del subdata
gc.collect()

144124


# Load the second dataset that will be annotated from `adata`.
crypto = sc.read_h5ad('../Data/scrna_data/crypto_azoospermia.h5ad')


#use only genes present in both datasets
var_names = adata.var_names.intersection(crypto.var_names)
# NOTE: both objects are subset by slicing without .copy() from here on.
adata = adata[:, var_names]
crypto = crypto[:, var_names]


# Map the two listed obs columns from the reference `adata` onto `crypto`.
sc.tl.ingest(crypto, adata, obs=['spermatogenesis_types','dpt_pseudotime'])


crypto.uns['spermatogenesis_types_colors'] = adata.uns['spermatogenesis_types_colors']  # fix colors


# Re-store the transferred pseudotime as a plain array
# (presumably ingest leaves it as a non-numeric/categorical column -- verify).
crypto.obs['dpt_pseudotime'] = np.array(crypto.obs['dpt_pseudotime'])


# UMAP of the ingested dataset coloured by the two transferred annotations.
sc.pl.umap(crypto, color=['spermatogenesis_types', 'dpt_pseudotime'], wspace=0.5,
          title=['atlas-based clustering of azoospermic data', 'atlas-based pseudotimes of azoospermic data'])


# Stack both datasets; the new 'condition' column records where each cell came from.
merged = adata.concatenate(crypto, batch_key='condition', batch_categories=['Healthy','Crypto'])


# Reuse the reference palette for the cell-type categories.
merged.uns['spermatogenesis_types_colors'] = adata.uns['spermatogenesis_types_colors']


sc.pl.umap(merged, color=['condition','spermatogenesis_types','dpt_pseudotime'], wspace=0.3, s=30)


# Split violins (one half per condition) of pseudotime per cell type, using only
# cells whose pseudotime is not negative.
has_pseudotime = merged.obs['dpt_pseudotime']>=0
violin_table = merged[has_pseudotime].obs[ ['condition', 'dpt_pseudotime', 'spermatogenesis_types'] ]
violin_order = ['SpermatogoniaA','SpermatogoniaB','SpermatocitesI', 'SpermatocitesII','Round_Spermatids','Elong_Spermatids']
fig, ax1 = plt.subplots(1, 1, figsize=(12,8), gridspec_kw={'wspace':0.5})
ax = sns.violinplot(
    data=violin_table,
    x="spermatogenesis_types",
    y="dpt_pseudotime",
    hue="condition",
    order=violin_order,
    scale="width",
    palette="Set2",
    split=True,
    ax=ax1,
)


# The separate objects are no longer needed once merged.
del adata, crypto
gc.collect()

28125


# Checkpoint the merged object to disk.
merged.write('merged.h5ad')


merged = sc.read('merged.h5ad') #read the data


# Keep-mask over genes (True = keep the gene). Drops mitochondrially-encoded
# genes (prefix 'MT-') and ribosomal-protein genes (prefixes 'RPS' / 'RPL').
# Prefix tests are used on purpose: the previous pattern '^RP[A-Za-z]' also
# discarded unrelated genes whose symbol merely starts with 'RP' plus a letter
# (e.g. RPA1, RPE65, RPTOR), and the substring test for 'MT-' could match
# anywhere inside a symbol.
MT_RP = [not i.startswith(('MT-', 'RPS', 'RPL')) for i in merged.var_names]
merged = merged[:, MT_RP].copy()


# Use the densified 'raw_counts' layer as the data matrix.
merged.X = merged.layers['raw_counts'].todense().copy()


# Number of cells per annotated cell type among the 'Healthy' cells.
merged[merged.obs['condition']=='Healthy'].obs['spermatogenesis_types'].value_counts() #healthy sample

SpermatogoniaA       18446
Peritubular_Myoid     4863
Round_Spermatids      4123
Endothelial           3755
Leydig                3357
Elong_Spermatids      3134
SpermatocitesII       3111
SpermatogoniaB        2629
SpermatocitesI        2247
Macroph                933
Smooth_Muscle          408
Name: spermatogenesis_types, dtype: int64


# Number of cells per annotated cell type among the 'Crypto' cells.
merged[merged.obs['condition']=='Crypto'].obs['spermatogenesis_types'].value_counts()

SpermatogoniaA       652
Round_Spermatids     256
Endothelial          251
Peritubular_Myoid    229
Leydig               214
SpermatocitesII      176
SpermatogoniaB       168
Elong_Spermatids     102
SpermatocitesI        61
Smooth_Muscle         37
Macroph                1
Name: spermatogenesis_types, dtype: int64


# Normalize each cell first and take the logarithm afterwards, the same order
# used for `adata` earlier in this notebook. The previous order (log1p, then
# per-cell normalization) rescaled already-logged values.
sc.pp.normalize_per_cell(merged)
sc.pp.log1p(merged)
# Flag the 5000 most variable genes on the normalized, logged matrix.
sc.pp.highly_variable_genes(merged, n_top_genes=5000)


# Expression matrix restricted to the highly variable genes.
matrix = merged[:,merged.var['highly_variable']].X.copy()


# Build the pseudobulk matrix with the external helper pseudobulk_matrix, keyed
# by sample ('batch'), condition and cell type. The printed log below shows it
# walking each batch / condition / cell-type combination and skipping the ones
# with 0 or 1 cells. `clusters` and `conditions` are later stored as the obs
# annotations of the pseudobulk samples.
matrix_bulk, clusters, conditions = pseudobulk_matrix(adata=merged, 
                                          batch_key='batch', 
                                          condition_key='condition', 
                                          cluster_key='spermatogenesis_types')

Guo1
----Healthy
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Guo2
----Healthy
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Guo3
----Healthy
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
Only 1 cells: skipping
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Her1_Spg
----Healthy
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Her2_Spg
----Healthy
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 1 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Her3_Spg
----Healthy
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Her4
----Healthy
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Her5
----Healthy
--------Elong_Spermatids
--------Endothelial
--------Leydig
Only 0 cells: skipping
--------Macroph
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
Only 1 cells: skipping
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Her6
----Healthy
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
Only 1 cells: skipping
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Her7_Spt
----Healthy
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Her8_Spc
----Healthy
--------Elong_Spermatids
--------Endothelial
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
SAM_1
----Healthy
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
----Crypto
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
SAM_2
----Healthy
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
----Crypto
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
--------Round_Spermatids
Only 1 cells: skipping
--------Smooth_Muscle
Only 1 cells: skipping
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
SAM_3
----Healthy
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
----Crypto
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
Only 1 cells: skipping
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
Sohni1_I
----Healthy
--------Elong_Spermatids
Only 1 cells: skipping
--------Endothelial
--------Leydig
--------Macroph
--------Peritubular_Myoid
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
--------SpermatocitesI
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Sohni1_und
----Healthy
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Sohni2_I
----Healthy
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
--------Peritubular_Myoid
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping
Sohni2_und
----Healthy
--------Elong_Spermatids
--------Endothelial
--------Leydig
--------Macroph
--------Peritubular_Myoid
--------Round_Spermatids
--------Smooth_Muscle
--------SpermatocitesI
--------SpermatocitesII
--------SpermatogoniaA
--------SpermatogoniaB
----Crypto
--------Elong_Spermatids
Only 0 cells: skipping
--------Endothelial
Only 0 cells: skipping
--------Leydig
Only 0 cells: skipping
--------Macroph
Only 0 cells: skipping
--------Peritubular_Myoid
Only 0 cells: skipping
--------Round_Spermatids
Only 0 cells: skipping
--------Smooth_Muscle
Only 0 cells: skipping
--------SpermatocitesI
Only 0 cells: skipping
--------SpermatocitesII
Only 0 cells: skipping
--------SpermatogoniaA
Only 0 cells: skipping
--------SpermatogoniaB
Only 0 cells: skipping


matrix_bulk.head()


# Swap rows and columns before wrapping the table in an AnnData object.
matrix_bulk = matrix_bulk.transpose()
matrix_bulk.head()


import anndata as ad


# AnnData container for the pseudobulk table.
pbulk = ad.AnnData(matrix_bulk)


pbulk

AnnData object with n_obs × n_vars = 1560 × 12173


# Drop genes that are not detected in at least one observation of pbulk.
sc.pp.filter_genes(pbulk, min_cells=1)


# Attach the condition and cell-type labels to the pseudobulk observations.
# Assumes `conditions` and `clusters` (defined earlier in the notebook) hold
# one label per row of pbulk, in the same order -- TODO confirm.
pbulk.obs['condition'] = conditions
pbulk.obs['spermatogenesis_types'] = clusters


# Per-observation normalisation followed by a log(1 + x) transform.
# NOTE(review): scanpy documents `normalize_per_cell` as deprecated in favour
# of `normalize_total`; the two differ in side effects, so verify downstream
# helpers before migrating.
sc.pp.normalize_per_cell(pbulk)
sc.pp.log1p(pbulk)


# Principal component analysis of the normalised pseudobulk profiles.
sc.pp.pca(pbulk)


# PCA scatter plots, one coloured by condition and one by cell type.
sc.pl.pca(pbulk, color=['condition','spermatogenesis_types'], s=50)


# Rank every gene of pbulk for each condition against the 'Healthy' reference.
# The statistical test is pinned explicitly: scanpy changed its implicit
# default (from 't-test_overestim_var' to 't-test') and warns when `method`
# is omitted, so naming it keeps the analysis reproducible across versions.
# n_genes is set to the number of variables so that no gene is truncated from
# the ranking.
sc.tl.rank_genes_groups(
    pbulk,
    groupby="condition",
    reference='Healthy',
    method='t-test',
    n_genes=pbulk.n_vars,
)

WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var'


# Collect the ranking results into a table via a project helper.
# NOTE(review): pseudobulk_extract_DEG is not defined in this notebook --
# presumably it comes from the script loaded with %run at the top; `merged`
# is defined earlier. Confirm what the helper reads from `adata`.
DEG_matrix = pseudobulk_extract_DEG(pbulk=pbulk, adata=merged)

---Extracting results
WARNING: adata.X seems to be already log-transformed.
---done


# Preview the differential-expression table.
DEG_matrix.head()


# Volcano plot of the differential-expression results via a project helper.
# NOTE(review): the thresholds are presumably cut-offs on the log fold change
# and on the log-transformed p-value -- confirm against the helper.
pseudobulk_volcano(DEG_matrix, logfold_threshold=2, logpval_threshold=3, plot_size=(800,800))


# Save the table without the row index. `index` is documented as a bool, so
# pass False explicitly instead of relying on None being falsy.
# The file is tab-separated despite the .csv extension; the path is kept
# unchanged so existing readers of this file keep working.
DEG_matrix.to_csv('./pseudobulk_markers.csv', sep='\t', index=False)

	batch	super_batch	n_genes_by_counts	log1p_n_genes_by_counts	total_counts	log1p_total_counts	pct_counts_in_top_50_genes	pct_counts_in_top_100_genes	pct_counts_in_top_200_genes	pct_counts_in_top_500_genes	n_counts
index
AAACCTGAGCCGGTAA-1-0	Sohni1_und	SohniUnd	61	4.127134	511.0	6.238325	97.847358	100.000000	100.000000	100.000000	511.0
AAACCTGAGCGATTCT-1-0	Sohni1_und	SohniUnd	2127	7.662938	5938.0	8.689296	34.725497	45.368811	56.113169	69.434153	5938.0
AAACCTGAGCGTTTAC-1-0	Sohni1_und	SohniUnd	3768	8.234565	8952.0	9.099744	16.979446	24.005809	33.031725	48.514298	8952.0
AAACCTGAGGACAGAA-1-0	Sohni1_und	SohniUnd	1588	7.370860	4329.0	8.373322	35.458535	48.972049	60.198660	74.867175	4329.0
AAACCTGAGTCATGCT-1-0	Sohni1_und	SohniUnd	618	6.428105	962.0	6.870053	35.654886	46.049896	56.548857	87.733888	962.0

	n_cells_by_counts	mean_counts	log1p_mean_counts	pct_dropout_by_counts	total_counts	log1p_total_counts
index
RP11-34P13.3	113	0.001865	0.001863	99.819923	117.0	4.770685
FAM138A	0	0.000000	0.000000	100.000000	0.0	0.000000
OR4F5	1	0.000016	0.000016	99.998406	1.0	0.693147
RP11-34P13.7	635	0.010805	0.010747	98.988064	678.0	6.520621
RP11-34P13.8	12	0.000191	0.000191	99.980877	12.0	2.564949

	0_N	0_L	1_N	1_L	2_N	2_L	3_N	...	9_L	10_N	10_L	11_N	11_L	12_N	12_P	12_L
0	DNAJB6	2.282377	ZNF428	3.872384	PTGDS	7.960241	B2M	...	5.178188	HMGB4	6.110504	TMSB4X	5.329148	MALAT1	0.000000e+00	3.621542
1	MT-CO2	2.241071	HMGA1	3.608303	IGFBP7	6.262925	TMSB4X	...	4.837116	NUPR2	4.908494	B2M	5.067806	MYL9	2.264533e-293	6.096835
2	LYPLA1	2.567145	RPS12	2.602228	ACTA2	7.921296	HLA-B	...	3.523373	GTSF1L	5.256815	TMSB10	4.364655	TMSB4X	0.000000e+00	3.841542
3	SOX4	3.075519	UTF1	4.280721	APOE	6.474001	HLA-E	...	4.938094	TMCO2	5.390223	TYROBP	10.148253	CALD1	3.657298e-294	4.684049
4	ELAVL2	2.913342	RAC3	4.152008	TMSB4X	4.756633	SPARCL1	...	4.429729	H1FNT	4.618910	CD74	7.801248	MYL6	7.142692e-283	4.176695

	5_N
0	PRM2
1	SPTY2D1-AS1
2	LELP1
3	SMCP
4	CRISP2
5	GAPDHS
6	NUPR2
7	OAZ3
8	MLF1
9	GSG1
10	C10orf62
11	TPPP2
12	PRM1
13	SPATA42
14	ODF2
15	IQCF3
16	PROCA1
17	UBL4B
18	SH3RF2
19	C20orf141
20	CARHSP1
21	CEP170
22	DCUN1D1
23	TCP11
24	GPX4
25	TEX40
26	CCDC91
27	GLUL
28	SMKR1
29	TRIM36
30	C2orf57
31	FAM229B
32	TEX38
33	SPEM1
34	CTD-2568A17.1
35	CXCL16
36	CLPB
37	TSPAN16
38	CCSER2
39	ACTL7A
40	DNAJB8
41	ACTRT2
42	PCP2
43	AQP5
44	TSACC
45	TSSK6
46	ALKBH7
47	RP11-195F19.5
48	SPATA3
49	LINC00467

	Healthy_Guo1_Elong_Spermatids_2	Healthy_Guo1_Elong_Spermatids_4	Healthy_Guo1_Elong_Spermatids_5	Healthy_Guo1_Elong_Spermatids_6	Healthy_Guo1_Elong_Spermatids_7	...	Healthy_Sohni2_und_SpermatogoniaB_0	Healthy_Sohni2_und_SpermatogoniaB_1	Healthy_Sohni2_und_SpermatogoniaB_2	Healthy_Sohni2_und_SpermatogoniaB_3	Healthy_Sohni2_und_SpermatogoniaB_4	Healthy_Sohni2_und_SpermatogoniaB_5	Healthy_Sohni2_und_SpermatogoniaB_6	Healthy_Sohni2_und_SpermatogoniaB_7	Healthy_Sohni2_und_SpermatogoniaB_8	Healthy_Sohni2_und_SpermatogoniaB_9
FAM41C	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.336377	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000
SAMD11	0.000000	0.000000	0.000000	0.000000	0.000000	...	1.265152	1.022664	0.721867	0.000000	0.00000	1.344409	0.496319	1.071796	0.000000	0.624600
NOC2L	0.000000	0.691036	0.711886	0.000000	0.672862	...	9.601676	8.129350	7.046049	8.480846	6.74749	6.766028	5.168615	8.167663	10.312463	7.504243
KLHL17	0.000000	0.000000	0.000000	0.000000	0.818655	...	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	1.147983	0.000000	0.000000
ISG15	0.725659	0.000000	0.000000	0.500157	1.896606	...	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000

Biological background¶

UMI-based single cell data from microdroplets¶

The raw data in practice¶

Alignment and expression matrix¶

Data analysis¶

Preprocessing¶

Quality Filtering¶

Doublets removal¶

Data Normalization¶

Effect of normalization on technical features¶

Optional task¶

Dimensionality reduction¶

PCA¶

UMAP projection¶

Exercise: tSNE projection¶

Clusters Identification¶

Print markers' scores¶

Leiden clustering algorithm¶

Differential Gene expression¶

Cluster assignment¶

Exercise: Gene Enrichment Analysis¶

Data dynamics¶

Comparisons across different datasets¶

Reference-based annotation¶

Cross-dataset differential expression¶

Wrapping up 🎉 🎉 🎉¶

	Crypto_NAMES	Crypto_PVALS	Crypto_PVALS_ADJ	Crypto_LOGFOLDCHANGES	Healthy_PCT	Crypto_PCT	Crypto_FOLDCHANGES	Crypto_LOGPVALS_ADJ	Crypto_LOGPVALS
0	EEF1G	6.363668e-264	2.582164e-260	8.071482	3.659107	97.345133	269.003601	50.0	50.0
1	MRPS24	1.586329e-244	4.827596e-241	6.896244	2.184827	80.763857	119.117706	50.0	50.0
2	GABARAP	2.589673e-269	1.576204e-265	5.500985	14.155640	97.438286	45.285748	50.0	50.0
3	BSCL2	2.402479e-161	2.658670e-158	4.969571	8.456367	85.700978	31.332123	50.0	50.0
4	RNASEK	1.200985e-166	1.624398e-163	4.696043	3.708037	72.100605	25.920876	50.0	50.0

Biological background¶

UMI-based single cell data from microdroplets¶

The raw data in practice¶

Alignment and expression matrix¶

Data analysis¶

Preprocessing¶

Quality Filtering¶

Doublets removal¶

Data Normalization¶

Effect of normalization on technical features¶

Optional task ¶

Dimensionality reduction¶

PCA¶

UMAP projection¶

Exercise: tSNE projection ¶

Clusters Identification¶

Print markers' scores¶

Leiden clustering algorithm¶

Differential Gene expression¶

Cluster assignment¶

Exercise: Gene Enrichment Analysis ¶

Data dynamics¶

Comparisons across different datasets¶

Reference-based annotation¶

Cross-dataset differential expression¶

Wrapping up 🎉 🎉 🎉¶

Optional task¶

Exercise: tSNE projection¶

Exercise: Gene Enrichment Analysis¶