'''The main class of DigitalCellSorter. The class includes tools for:
1. **Pre-processing** of single cell RNA sequencing data
2. **Quality control**
3. **Batch effects correction**
4. **Cells anomaly score evaluation**
5. **Dimensionality reduction**
6. **Clustering**
7. **Annotation of cell types**
8. **Visualization**
9. **Post-processing**
'''
import os
import platform
import copy
import multiprocessing
import warnings
multiprocessing.set_start_method('spawn', force=True)
import numpy as np
import pandas as pd
import scipy.stats
import scipy.signal
import scipy.linalg
import scipy.cluster.hierarchy
import scipy.spatial.distance
from scipy.interpolate import UnivariateSpline
import sklearn.metrics
import sklearn.cluster
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralCoclustering
from sklearn.ensemble import IsolationForest
from . import GeneNameConverter, geneLists
from .Combat import combat
from .VisualizationFunctions import VisualizationFunctions
from .GenericFunctions import read, write
from .VisualizationFunctions import cm
def readMTXdata(dataDir, origin, fileMatrix = None, fileBarcodes = None, fileGenes = None, headerRows = None, sampleName = None, stripGeneVersions = True, saveData = True, dropGeneDuplicates = True, dropCellDuplicates = True):

    '''Read MTX format into pandas DataFrame compatible with DCS input format

    Parameters:
        dataDir: str
            Path to gene expression counts data

        origin: str
            Name of the software where the data was generated. Supported options are:
            'kb-python' for kallisto-bustools
            'cellranger' for cellRanger

        fileMatrix: str, Default None
            Name of the matrix file

        fileBarcodes: str, Default None
            Name of the cell barcodes file

        fileGenes: str, Default None
            Name of the genes file

        headerRows: list, Default None
            List of rows in matrix file to skip

        sampleName: str, Default None
            Name of the data sample to include in the batch level

        stripGeneVersions: boolean, Default True
            Remove ensembl gene version. E.g. "ENSG00000236246.1" --> "ENSG00000236246"

        saveData: boolean, Default True
            Whether to save data in hdf format. If True then the data is saved to
            a compressed hdf at the same location as matrix data

        dropGeneDuplicates: boolean, Default True
            Whether to remove gene duplicates (keep first)

        dropCellDuplicates: boolean, Default True
            Whether to remove barcode duplicates (keep first)

    Returns:
        pandas.DataFrame
            Table that has genes in rows and cells in columns

    Raises:
        ValueError
            If 'origin' is not one of the supported options

    Usage:
        df = readMTXdata(dataDir='filtered_feature_bc_matrix/', origin='cellranger')

        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.prepare(df)

        DCS.convert('ensembl', 'hugo')

        print(DCS.df_expr)
    '''

    if origin == 'kb-python':
        # kb-python emits a cells x genes matrix: triplet column 0 holds cell
        # numbers, column 1 holds gene numbers
        if headerRows is None:
            headerRows = [0, 1, 2]

        cells_level = 0

        if fileMatrix is None:
            fileMatrix = 'cells_x_genes.mtx'

        if fileBarcodes is None:
            fileBarcodes = 'cells_x_genes.barcodes.txt'

        if fileGenes is None:
            fileGenes = 'cells_x_genes.genes.txt'

    elif origin == 'cellranger':
        # cellranger emits a genes x cells matrix: triplet column 0 holds gene
        # numbers, column 1 holds cell numbers
        if headerRows is None:
            headerRows = [0, 1, 2]

        cells_level = 1

        if fileMatrix is None:
            fileMatrix = 'matrix.mtx.gz'

        if fileBarcodes is None:
            fileBarcodes = 'barcodes.tsv.gz'

        if fileGenes is None:
            fileGenes = 'features.tsv.gz'

    else:
        # Fail early with a clear message; previously an unsupported origin
        # fell through and raised a confusing NameError further down
        raise ValueError("Unknown origin '%s'. Supported options are 'kb-python' and 'cellranger'" % origin)

    # Read the sparse triplets (row, column, count) and pivot the cells level
    # to columns, so that genes end up in rows and cells in columns
    df_expr = pd.read_csv(os.path.join(dataDir, fileMatrix), delimiter=' ', index_col=[0,1], header=None, skiprows=headerRows)[2].astype(int)

    df_expr = df_expr.unstack(cells_level, fill_value=0)

    # MTX indices are 1-based; gene names are looked up positionally
    df_expr.index = pd.read_csv(os.path.join(dataDir, fileGenes), delimiter='\t', index_col=None, header=None)[0].loc[df_expr.index - 1].values
    df_expr.columns = pd.read_csv(os.path.join(dataDir, fileBarcodes), delimiter='\t', index_col=0, header=None).index

    if stripGeneVersions:
        # "ENSG00000236246.1" --> "ENSG00000236246"
        df_expr.index = df_expr.index.str.split('.', expand=True).get_level_values(0)

    if dropGeneDuplicates:
        df_expr = df_expr.loc[~df_expr.index.duplicated(keep='first')]

    if dropCellDuplicates:
        df_expr = df_expr.T.loc[~df_expr.T.index.duplicated(keep='first')].T

    df_expr.index.name = 'gene'
    df_expr.columns.name = 'cell'

    if sampleName is not None:
        # Add a 'batch' level on top of the cell barcodes
        df_expr = pd.concat([df_expr], axis=1, keys=[sampleName], names=['batch'], sort=False)

    if saveData:
        df_expr.to_hdf(os.path.join(dataDir, 'data.h5'), key='df', mode='a', complevel=4, complib='zlib')

    return df_expr
[docs]class DigitalCellSorter(VisualizationFunctions):
'''Class of Digital Cell Sorter with methods for processing single cell
RNA-seq data. Includes analyses and visualization tools.
Parameters:
df_expr: pandas.DataFrame, Default None
Gene expression in a form of a table, where genes are rows, and
cells/batches are columns
dataName: str, Default 'dataName'
Name used in output files
geneNamesType: str, Default 'alias'
Input gene name convention
geneListFileName: str, Default None
Name of the marker genes file
mitochondrialGenes: list, Default None
List of mitochondrial genes to use in quality control
sigmaOverMeanSigma: float, Default 0.1
Threshold to consider a gene constant
nClusters: int, Default 10
Number of clusters
nFineClusters: int, Default 3
Number of fine clusters to determine with Spectral Co-clustering routine.
This option is ignored if doFineClustering is False.
doFineClustering: boolean, Default True
Whether to do fine clustering or not
minSizeForFineClustering: int, Default 50
Minimum number of cells required to do fine clustering of a cluster.
This option is ignored if doFineClustering is False.
clusteringFunction: function, Default AgglomerativeClustering
Clustering function to use.
Other options: KMeans, {k_neighbors:40}, etc.
Note: the function should have .fit method and same input and output.
For Network-based clustering pass a dictionary
{'k_neighbors':40, metric:'euclidean', 'clusterExpression':True},
this way the best number of clusters will be determined automatically
nComponentsPCA: int, Default 200
Number of pca components
nSamples_pDCS: int, Default 3000
Number of random samples in distribution for pDCS annotation method
nSamples_Hopfield: int, Default 500
Number of repetitions for Hopfield annotation method
saveDir: str, Default os.path.join('')
Directory for output files
makeMarkerSubplots: boolean, Default False
Whether to make subplots on markers
makePlots: boolean, Default True
Whether to make all major plots
availableCPUsCount: int, Default min(12, os.cpu_count())
Number of CPUs used in pDCS method
zScoreCutoff: float, Default 0.3
Z-Score cutoff when setting expression of a cluster as significant
thresholdForUnknown: float, Default 0.3
Threshold when assigning label "Unknown".
This option is used only with a combination of 2 or more annotation methods
thresholdForUnknown_pDCS: float, Default 0.1
Threshold when assigning label "Unknown" in pDCS method
thresholdForUnknown_ratio: float, Default 0.1
Threshold when assigning label "Unknown" in ratio method
thresholdForUnknown_Hopfield: float, Default 0.1
Threshold when assigning label "Unknown" in Hopfield method
annotationMethod: str, Default 'ratio-pDCS-Hopfield'
Method to use for annotation of cell types to clusters. Options are:
'pDCS': main DCS voting scheme with null testing
'ratio': simple voting score
'Hopfield': Hopfield Network classifier
'pDCS-ratio': 'pDCS' adjusted with 'ratio'
'pDCS-Hopfield': 'pDCS' adjusted with 'Hopfield'
'ratio-Hopfield': 'ratio' adjusted with 'Hopfield'
'pDCS-ratio-Hopfield': 'pDCS' adjusted with 'ratio' and 'Hopfield'
subclusteringName: str, Default None
Parameter used for certain labels on plots
doQualityControl: boolean, Default True
Whether to remove low quality cells
doBatchCorrection: boolean, Default False
Whether to correct data for batches
minimumNumberOfMarkersPerCelltype: int, Default 10
Minimum number of markers per cell type to keep that cell type in annotation options
nameForUnknown: str, Default 'Unassigned'
Name to use for clusters where label assignment yielded uncertain results
nameForLowQC: str, Default 'Failed QC'
Name to use for cell that do not pass quality control
layout: str, Default 'TSNE'
Projection layout used in visualization. Options are:
'TSNE': t-SNE layout
L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms.
Journal of Machine Learning Research 15(Oct):3221-3245, 2014.
'PCA': use two largest principal components
'UMAP': use uniform manifold approximation,
McInnes, L., Healy, J., UMAP: Uniform Manifold Approximation and Projection for
Dimension Reduction, ArXiv e-prints 1802.03426, 2018
'PHATE': use potential of heat diffusion for affinity-based transition embedding,
Moon, K.R., van Dijk, D., Wang, Z. et al. Visualizing structure and
transitions in high-dimensional biological data. Nat Biotechnol 37, 1482–1492 (2019).
Usage:
DCS = DigitalCellSorter.DigitalCellSorter()
df_data = DCS.Clean(df_data)
'''
    def __init__(self, df_expr = None, dataName = 'dataName', species = 'Human', geneNamesType = 'alias', geneListFileName = None,
                mitochondrialGenes = None, sigmaOverMeanSigma = 0.01, nClusters = 10, nFineClusters = 3, doFineClustering = True,
                splitFineClusters = False, subSplitSize = 100, medianScaleFactor = 10000,
                minSizeForFineClustering = 50, clusteringFunction = AgglomerativeClustering, nComponentsPCA = 200, nSamples_pDCS = 3 * 10 ** 3,
                nSamples_Hopfield = 200, saveDir = os.path.join(''), makeMarkerSubplots = False, availableCPUsCount = min(6, os.cpu_count()),
                zScoreCutoff = 0.3, subclusteringName = None, doQualityControl = True, doBatchCorrection = False, makePlots = True,
                useUnderlyingNetwork = True, minimumNumberOfMarkersPerCelltype = 10, nameForUnknown = 'Unassigned', nameForLowQC = 'Failed QC',
                matplotlibMode = 'Agg', countDepthCutoffQC = 0.5, numberOfGenesCutoffQC = 0.5, mitochondrialGenesCutoffQC = 1.5,
                excludedFromQC = None, countDepthPrecutQC = 500, numberOfGenesPrecutQC = 250, precutQC = False, minSubclusterSize = 25,
                thresholdForUnknown_pDCS = 0., thresholdForUnknown_ratio = 0., thresholdForUnknown_Hopfield = 0., thresholdForUnknown = 0.2,
                layout = 'TSNE', safePlotting = True, HopfieldTemperature = 0.1, annotationMethod = 'ratio-pDCS-Hopfield',
                useNegativeMarkers = True, removeLowQualityScores = True, updateConversionDictFile = True, verbose = 1,
                random_state = None):

        '''Initialization function that is automatically called when an instance of Digital Cell Sorter is created'''

        self.verbose = verbose

        # Gene name converter; its conversion dictionary is stored next to this module
        self.gnc = GeneNameConverter.GeneNameConverter(dictDir=os.path.join(os.path.dirname(__file__)),
                                                       species=species, updateConversionDictFile=updateConversionDictFile, verbose=verbose)

        self.species = species

        # NOTE: 'dataName' must be assigned before 'saveDir', because the
        # 'saveDir' property setter derives 'fileHDFpath' from 'self.dataName'
        self.dataName = dataName
        self.saveDir = saveDir
        self.safePlotting = safePlotting
        self.random_state = random_state

        # Load expression data right away if it was provided
        if not df_expr is None:
            self.prepare(df_expr)
        else:
            self._df_expr = None

        # Marker gene list; the defaults must be set before 'geneListFileName',
        # because its property setter falls back to them
        self.defaultGeneListFileName = 'CIBERSORT_LM22'
        self.defaultGeneListsDir = os.path.join(os.path.dirname(__file__), 'geneLists')
        self.geneListFileName = geneListFileName
        self.mitochondrialGenes = mitochondrialGenes

        # Annotation input options
        self.useNegativeMarkers = useNegativeMarkers
        self.removeLowQualityScores = removeLowQualityScores
        self.geneNamesType = geneNamesType

        # Quality control cutoffs and pre-cut settings
        self.countDepthCutoffQC = countDepthCutoffQC
        self.numberOfGenesCutoffQC = numberOfGenesCutoffQC
        self.mitochondrialGenesCutoffQC = mitochondrialGenesCutoffQC
        self.countDepthPrecutQC = countDepthPrecutQC
        self.numberOfGenesPrecutQC = numberOfGenesPrecutQC
        self.precutQC = precutQC
        self.excludedFromQC = excludedFromQC

        # Normalization, clustering and annotation parameters
        self.medianScaleFactor = medianScaleFactor
        self.minSubclusterSize = minSubclusterSize
        self.sigmaOverMeanSigma = sigmaOverMeanSigma
        self.nClusters = nClusters
        self.doFineClustering = doFineClustering
        self.splitFineClusters = splitFineClusters
        self.subSplitSize = subSplitSize
        self.nFineClusters = nFineClusters
        self.minSizeForFineClustering = minSizeForFineClustering
        self.nComponentsPCA = nComponentsPCA
        self.zScoreCutoff = zScoreCutoff
        self.minimumNumberOfMarkersPerCelltype = minimumNumberOfMarkersPerCelltype
        self.useUnderlyingNetwork = useUnderlyingNetwork
        self.annotationMethod = annotationMethod
        self.nameForUnknown = nameForUnknown
        self.nameForLowQC = nameForLowQC
        self.layout = layout

        # Thresholds for assigning the "Unknown" label, per annotation method
        self.thresholdForUnknown_pDCS = thresholdForUnknown_pDCS
        self.thresholdForUnknown_ratio = thresholdForUnknown_ratio
        self.thresholdForUnknown_Hopfield = thresholdForUnknown_Hopfield
        self.thresholdForUnknown = thresholdForUnknown

        self.HopfieldTemperature = HopfieldTemperature
        self.nSamples_pDCS = nSamples_pDCS
        self.nSamples_Hopfield = nSamples_Hopfield
        self.availableCPUsCount = availableCPUsCount
        self.clusteringFunction = clusteringFunction
        self.subclusteringName = subclusteringName
        self.doQualityControl = doQualityControl

        # Toggles controlling which processing steps and plots are produced
        self.toggleRemoveLowQualityCells = doQualityControl
        self.toggleDoBatchCorrection = doBatchCorrection
        self.toggleMakeHistogramNullDistributionPlot = False
        self.toggleMakeVotingResultsMatrixPlot = makePlots
        self.toggleMakeMarkerExpressionPlot = makePlots
        self.toggleMakeProjectionPlotsQC = makePlots
        self.toggleMakeProjectionPlotClusters = makePlots
        self.toggleMakeProjectionPlotAnnotatedClusters = makePlots
        self.toggleMakeProjectionPlotBatches = makePlots
        self.toggleMakeStackedBarplot = makePlots
        self.toggleAnomalyScoresProjectionPlot = False
        self.toggleGetMarkerSubplots = makeMarkerSubplots

        # Initialize the plotting machinery of the base class
        super().__init__(saveDir=saveDir, dataName=dataName, safePlotting=safePlotting, matplotlibMode=matplotlibMode, verbose=verbose)

        return None
    @property
    def saveDir(self):

        '''Directory for output files'''

        return self._saveDir
@saveDir.setter
def saveDir(self, value):
self._saveDir = value
if self.saveDir != os.path.join('') and not os.path.exists(self.saveDir):
os.makedirs(self.saveDir)
self._fileHDFpath = os.path.join(self.saveDir, self.dataName + '_processed.h5')
return
    @property
    def fileHDFpath(self):

        '''Path of the HDF file with processed data (derived from 'saveDir' and 'dataName')'''

        return self._fileHDFpath
@fileHDFpath.setter
def fileHDFpath(self, value):
warnings.warn("Direct change of HDF file path is not allowed. " +
"Modify 'saveDir' in order to change 'fileHDFpath'", UserWarning)
return
@property
def df_expr(self):
if self._df_expr is None:
if self.verbose >= 1:
print('Expression data is not assigned')
return self._df_expr
@df_expr.setter
def df_expr(self, value):
if value is None:
self._df_expr = value
else:
warnings.warn("Direct change of 'df_expr' is not allowed. " +
"Use function 'prepare' in order to set 'df_expr' " +
"or, alternatively, initialize class instance with 'df_expr'. " +
"'df_expr' can be removed by assigning None to it", UserWarning)
return
    @property
    def geneListFileName(self):

        '''Path of the marker genes list file (resolved by the setter)'''

        return self._geneListFileName
@geneListFileName.setter
def geneListFileName(self, value):
if value is None:
self._geneListFileName = os.path.join(self.defaultGeneListsDir, self.defaultGeneListFileName + '.xlsx')
else:
if os.path.isfile(value):
self._geneListFileName = value
else:
if os.path.isfile(os.path.join(self.defaultGeneListsDir, value + '.xlsx')):
self._geneListFileName = os.path.join(self.defaultGeneListsDir, value + '.xlsx')
else:
if self.verbose >= 1:
print('Gene list file not found. Uning default list: %s'%(self.defaultGeneListFileName), flush=True)
self._geneListFileName = os.path.join(self.defaultGeneListsDir, self.defaultGeneListFileName + '.xlsx')
return
# Primary functions of class #######################################################
    def prepare(self, obj):

        '''Prepare pandas.DataFrame for input to function process()

        If input is pd.DataFrame validate the input whether it has correct structure.
        The validated table (genes in rows; ('batch', 'cell') in columns) is
        stored in 'self._df_expr'; on any validation failure nothing is stored.

        Parameters:
            obj: str, pandas.DataFrame, pandas.Series
                Expression data in a form of pandas.DataFrame, pandas.Series, or name and path to a csv file with data

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.prepare('data.csv')
        '''

        if type(obj) is pd.Series:
            # Condensed input: a Series indexed by at least 'cell' and 'gene'
            # (and optionally 'batch')
            if self.verbose >= 1:
                print('Received data in a form of pandas.Series', flush=True)
                print('Validating it and converting it to pandas.DataFrame', flush=True)

            if not 'cell' in obj.index.names:
                if self.verbose >= 1:
                    print('Column "cell" not found. Returning None', flush=True)

                return None

            if not 'gene' in obj.index.names:
                if self.verbose >= 1:
                    print('Column "gene" not found. Returning None', flush=True)

                return None

            if not 'batch' in obj.index.names:
                # NOTE(review): this print is not guarded by 'self.verbose',
                # unlike the analogous messages in the other branches
                print('Column "batch" not found. Assuming one batch in the data.', flush=True)

                batch = np.array(['batch0'] * len(obj.index.get_level_values('cell')))
            else:
                batch = obj.index.get_level_values('batch')

            # Rebuild the index in canonical ('batch', 'cell', 'gene') order,
            # then pivot genes into rows
            obj.index = pd.MultiIndex.from_arrays([batch, obj.index.get_level_values('cell'), obj.index.get_level_values('gene')], names=['batch', 'cell', 'gene'])

            obj = obj.unstack(level='gene').T

            self._df_expr = obj

            if self.verbose >= 2:
                print('Done', flush=True)

            return None

        elif type(obj) is pd.DataFrame:
            # Matrix input: genes in rows, cells (optionally batched) in columns
            if self.verbose >= 1:
                print('Received data in a form of pandas.DataFrame', flush=True)
                print('Validating pandas.DataFrame', flush=True)

            try:
                obj.index.name = 'gene'
            except Exception as exception:
                if self.verbose >= 1:
                    print(exception)
                    print('DataFrame index format is not understood. Returning None', flush=True)

                return None

            if len(obj.columns.names) > 1:
                # Multi-level columns must contain a 'cell' level
                if not 'cell' in obj.columns.names:
                    if self.verbose >= 1:
                        print('Columns level "cell" not found. Returning None', flush=True)

                    return None
            else:
                obj.columns.names = ['cell']

            if not 'batch' in obj.columns.names:
                if self.verbose >= 2:
                    print('Columns level "batch" not found. Assuming one batch in the data.', flush=True)

                batch = np.array(['batch0'] * len(obj.columns.get_level_values('cell')))
            else:
                batch = obj.columns.get_level_values('batch')

            # Rebuild columns in canonical ('batch', 'cell') order
            obj.columns = pd.MultiIndex.from_arrays([batch, obj.columns.get_level_values('cell')], names=['batch', 'cell'])

            self._df_expr = obj

            if self.verbose >= 2:
                print('Done', flush=True)

            return None

        elif type(obj) is str:
            # Path to a csv file; peek at the first rows to detect the format
            columns = pd.read_csv(obj, header=0, index_col=None, nrows=5).columns.values.tolist()

            if ('cell' in columns) and ('gene' in columns) and ('expr' in columns):
                # Condensed csv: one row per (cell, gene) observation
                if self.verbose >= 1:
                    print('Received data in a form of condensed matrix. Reading data', flush=True)

                df_expr = pd.read_csv(obj, header=0, index_col=None)

                if self.verbose >= 2:
                    print('Converting it to pandas.DataFrame')

                if not 'cell' in df_expr.columns:
                    if self.verbose >= 1:
                        print('The column with "cell" identifiers is not found. Returning None', flush=True)

                    return None

                if not 'gene' in df_expr.columns:
                    if self.verbose >= 1:
                        print('The column with "gene" identifiers is not found. Returning None', flush=True)

                    return None

                if not 'expr' in df_expr.columns:
                    if self.verbose >= 1:
                        print('The column with expression values is not found. Returning None', flush=True)

                    return None

                if not 'batch' in df_expr.columns:
                    if self.verbose >= 2:
                        print('The column with "batch" identifiers is not found. Assuming one batch in the data', flush=True)

                    df_expr['batch'] = np.array(['batch0'] * len(df_expr))

                # Pivot to genes in rows, ('batch', 'cell') in columns
                self._df_expr = df_expr.set_index(['batch', 'cell', 'gene'])['expr'].unstack(level='gene').T

                if self.verbose >= 2:
                    print('Done', flush=True)

                return None

            else:
                # Matrix csv: genes in rows; header rows labelled 'cell' and
                # optionally 'batch' identify the columns
                if self.verbose >= 1:
                    print('Received data in a form of matrix. Reading data', flush=True)

                df_expr = pd.read_csv(obj, header=None, index_col=0)

                if self.verbose >= 2:
                    print('Converting it to pandas.DataFrame', flush=True)

                if not 'cell' in df_expr.index:
                    if self.verbose >= 1:
                        print('The row with "cell" identifiers is not found. Returning None', flush=True)

                    return None

                if not 'batch' in df_expr.index:
                    if self.verbose >= 2:
                        print('The row with "batch" identifiers is not found. Assuming one batch in the data', flush=True)

                    df_expr.loc['batch'] = np.array(['batch0'] * df_expr.shape[1])

                # Move the 'batch' and 'cell' rows into the columns index
                df_expr = df_expr.T.set_index(['batch', 'cell']).T

                df_expr.index.name = 'gene'

                self._df_expr = df_expr

                if self.verbose >= 2:
                    print('Done', flush=True)

                return None

        else:
            if self.verbose >= 1:
                print('Unknown input data format. Returning None', flush=True)

        return None
    def convert(self, nameFrom = None, nameTo = None, **kwargs):

        '''Convert the gene index to another naming convention; any names that
        become duplicated by the conversion are merged

        Parameters:
            nameFrom: str, Default None
                Gene name type to convert from. If None, 'self.geneNamesType' is used

            nameTo: str, Default None
                Gene name type to convert to. If None, 'hugo' is used

            Any parameters that function 'mergeIndexDuplicates' can accept

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.convert()
        '''

        if nameFrom is None:
            nameFrom = self.geneNamesType

        if nameTo is None:
            nameTo = 'hugo'

        if nameTo != nameFrom:
            # Conversion happens in place on the index of the expression table
            self._df_expr.index = self.gnc.Convert(list(self._df_expr.index), nameFrom, nameTo, returnUnknownString=False)

            # Converted names may collide; merge duplicated index entries
            self._df_expr = self.mergeIndexDuplicates(self.df_expr, verbose=self.verbose, **kwargs)

        return None
[docs] def clean(self):
'''Clean pandas.DataFrame: validate index, remove index duplicates,
replace missing with zeros, remove all-zero rows and columns
Parameters:
None
Returns:
None
Usage:
DCS = DigitalCellSorter.DigitalCellSorter()
DCS.clean()
'''
# Replace any NaN(s) with zeros.
self._df_expr.fillna(0.0, inplace=True)
if self.verbose >= 1:
print('Replaced missing values with zeros. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True)
# Check is any names in the index are duplicated, remove duplicates
self._df_expr = self.mergeIndexDuplicates(self._df_expr)
# Keep only cells with at least one expressed gene.
self._df_expr = self._df_expr.T[self._df_expr.sum(axis=0) > 0].T
if self.verbose >= 1:
print('Removed all-zero cells. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True)
# Keep only genes expressed in at least one cell.
self._df_expr = self._df_expr[self._df_expr.sum(axis=1) > 0]
if self.verbose >= 1:
print('Removed all-zero genes. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True)
return None
[docs] def normalize(self, median = None):
'''Normalize pandas.DataFrame: rescale all cells, log-transform data,
remove constant genes, sort index
Parameters:
median: float, Default None
Scale factor, if not provided will be computed as median across all cells in data
Returns:
None
Usage:
DCS = DigitalCellSorter.DigitalCellSorter()
DCS.normalize()
'''
# Scale all cells.
median = np.median(np.sum(self._df_expr,axis=0)).astype(float) if median is None else median
if self.verbose >= 1:
print('Rescaling all cells by "sum of values = %s".' % (median), flush=True)
self._df_expr = self._df_expr.apply(lambda q: q * median / np.sum(q),axis=0)
if self.verbose >= 1:
print('Log-transforming data.', flush=True)
# Replace zeros with minimum value.
MIN = np.min(self._df_expr.values[self._df_expr.values > 0.])
if MIN <= 0.:
raise ValueError
self._df_expr = self._df_expr.replace(0., MIN)
# Take log2 of expression.
self._df_expr = np.log2(self._df_expr)
self._df_expr -= np.min(self._df_expr.values)
# Keep only genes expressed in at least one cell.
self._df_expr = self._df_expr[self._df_expr.sum(axis=1) > 0]
if self.verbose >= 1:
print('Removed all-zero genes. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True)
if not self.sigmaOverMeanSigma is None:
# Keep only those genes with large enough standard deviation.
self._df_expr = self._df_expr[np.std(self._df_expr, axis=1) / np.mean(np.std(self._df_expr.values)) > self.sigmaOverMeanSigma]
if self.verbose >= 1:
print('Removed constant genes. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True)
# Sort rows by gene name
self._df_expr = self._df_expr.sort_index()
return None
    def project(self, PCAonly = False, do_fast_tsne = True):

        '''Project the expression table to lower dimensions

        Performs PCA on the 2000 genes with the largest variance-to-mean ratio
        and records the result to the HDF file (keys 'df_xpca' and 'df_pcs').
        Unless PCAonly is True, a 2D layout ('TSNE', 'UMAP', 'PHATE' or 'PCA',
        per 'self.layout') is also computed and recorded. If an optional layout
        package cannot be imported, 'self.layout' falls back to 'PCA'.

        Parameters:
            PCAonly: boolean, Default False
                Perform Principal component analysis only

            do_fast_tsne: boolean, Default True
                Do FI-tSNE instead of "exact" tSNE.
                This option is ignored if layout is not 'TSNE'

        Returns:
            tuple
                (X_pca, PCs) if PCAonly is True, otherwise (X_pca, PCs, projection)

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            xPCA, PCs, tSNE = DCS.project()
        '''

        if self.verbose >= 1:
            print('Preparing xpca, PCs, 2D projection of df_expr', flush=True)

        if self.verbose >= 2:
            print('Performing PC projection from %s to %s features...' % (self._df_expr.shape[0], self.nComponentsPCA), flush=True)

        _PCA = PCA(n_components=self.nComponentsPCA, random_state=self.random_state)

        # Keep the 2000 genes with the largest variance-to-mean ratio
        idx = np.argsort(np.var(self._df_expr.values.T, axis=0) / np.mean(self._df_expr.values.T, axis=0))[-2000:]

        X_pca = _PCA.fit_transform(self._df_expr.values.T[:, idx]).T

        if self.verbose >= 2:
            print('Explained variance:', np.round(np.sum(_PCA.explained_variance_ratio_) * 100., 2), "%", flush=True)
            print('Recording xpca, PCs, 2D projection of df_expr', flush=True)

        # Record PCA results keyed by ('batch', 'cell')
        columns=self._df_expr.columns
        columns = pd.MultiIndex.from_arrays([columns.get_level_values('batch'), columns.get_level_values('cell')], names=['batch', 'cell'])

        pd.DataFrame(data=X_pca, columns=columns).to_hdf(self.fileHDFpath, key='df_xpca', mode='a', complevel=4, complib='zlib')
        pd.DataFrame(data=_PCA.components_).to_hdf(self.fileHDFpath, key='df_pcs', mode='a', complevel=4, complib='zlib')

        if not PCAonly:
            if self.layout == 'TSNE':
                # For small datasets use the "exact" sklearn tSNE instead of FI-tSNE
                if X_pca.T.shape[0] < 2000:
                    do_fast_tsne = False

                if self.verbose >= 1:
                    print('Performing tSNE projection from %s to %s features...' % (self.nComponentsPCA,2), flush=True)

                if do_fast_tsne:
                    if platform.system() == "Windows":
                        from .FastTSNE import fast_tsne

                        if self.verbose >= 3:
                            print('Windows detected. Using FastTSNE submodule wrapper', flush=True)

                        # NOTE(review): the seed is fixed to 42 here, independent
                        # of 'self.random_state' — confirm this is intended
                        X_projection2 = fast_tsne(X_pca.T, perplexity = 30, seed=42).T
                    else:
                        try:
                            import fitsne
                        except Exception as exception:
                            if self.verbose >= 1:
                                print(exception)
                                print('FI-tSNE package could not be imported. Install the package following \
                                instructions of DigitalCellSorter installation on GitHub. Defaulting to PCA layout')

                            # Fall back; the PCA branch below picks this up
                            self.layout = 'PCA'

                        if self.layout == 'TSNE':
                            X_projection2 = fitsne.FItSNE(np.array(X_pca.T, order='C')).T
                else:
                    X_projection2 = TSNE(n_components=2).fit_transform(X_pca.T).T

            elif self.layout == 'UMAP':
                if self.verbose >= 1:
                    print('Performing UMAP projection from %s to %s features...' % (self.nComponentsPCA, 2), flush=True)

                try:
                    import umap
                except Exception as exception:
                    if self.verbose >= 1:
                        print(exception)
                        print('UMAP package could not be imported. Install the package following \
                        instructions of DigitalCellSorter installation on GitHub. Defaulting to PCA layout')

                    self.layout = 'PCA'

                if self.layout == 'UMAP':
                    X_projection2 = umap.UMAP(random_state=self.random_state).fit_transform(X_pca.T).T

            elif self.layout == 'PHATE':
                if self.verbose >= 1:
                    print('Performing PHATE projection from %s to %s features...' % (self.nComponentsPCA, 2), flush=True)

                try:
                    import phate
                except Exception as exception:
                    if self.verbose >= 1:
                        print(exception)
                        print('PHATE package could not be imported. Install the package following instructions of DigitalCellSorter installation on GitHub. Defaulting to PCA layout')

                    self.layout = 'PCA'

                if self.layout == 'PHATE':
                    X_projection2 = phate.PHATE().fit_transform(X_pca.T).T

            if self.layout == 'PCA':
                # Either requested directly or reached as fallback above
                if self.verbose >= 1:
                    print('Using PC1 and PC2 for layout', flush=True)

                X_projection2 = X_pca[[0,1],:]

            if self.verbose >= 2:
                print('Recording 2D projection of df_expr', flush=True)

            # Recorded twice; presumably 'df_projection' gets updated later in
            # the pipeline while 'df_projection_pre_QC' is preserved — confirm
            pd.DataFrame(data=X_projection2, columns=columns).to_hdf(self.fileHDFpath, key='df_projection_pre_QC', mode='a', complevel=4, complib='zlib')
            pd.DataFrame(data=X_projection2, columns=columns).to_hdf(self.fileHDFpath, key='df_projection', mode='a', complevel=4, complib='zlib')

            return X_pca, _PCA.components_, X_projection2

        return X_pca, _PCA.components_
def cluster(self):
    '''Cluster PCA-reduced data into a desired number of clusters.

    When ``self.clusteringFunction`` is a dict, a k-nearest-neighbors graph is
    built with pynndescent and partitioned with Louvain community detection;
    otherwise ``self.clusteringFunction`` is treated as an sklearn-like class
    and fitted with ``n_clusters=self.nClusters``. Optionally refines the
    result with fine (sub-)clustering and records it to the HDF store.

    Parameters:
        None

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.cluster()
    '''

    if self.verbose >= 1:
        print('Calculating clustering of PCA data', flush=True)

    df_xpca = pd.read_hdf(self.fileHDFpath, key='df_xpca')

    if type(self.clusteringFunction) is dict:
        try:
            import pynndescent
            import networkx as nx
            import community

        except Exception as exception:
            if self.verbose >= 1:
                print(exception)
                print('Install packages pynndescent, networkx and python-louvain following \
instructions of DigitalCellSorter installation on GitHub. Defaulting to agglomerative clustering method')

            self.clusteringFunction = AgglomerativeClustering

    if type(self.clusteringFunction) is dict:
        import pynndescent
        import networkx as nx
        import community

        # BUGFIX: configuration values must be looked up with string keys.
        # The original code indexed the dict with undefined bare names
        # (k_neighbors, metric, clusterExpression), which raised NameError
        # inside the try blocks, so user-supplied settings were never honored
        # and the defaults were always used.
        try:
            k_neighbors = self.clusteringFunction['k_neighbors']

        except Exception as exception:
            if self.verbose >= 2:
                print(exception)

            k_neighbors = 40

        try:
            metric = self.clusteringFunction['metric']

        except Exception as exception:
            if self.verbose >= 2:
                print(exception)

            metric = 'euclidean'

        try:
            clusterExpression = self.clusteringFunction['clusterExpression']

        except Exception as exception:
            if self.verbose >= 2:
                print(exception)

            clusterExpression = False

        data = self._df_expr.values.T if clusterExpression else df_xpca.values.T

        if self.verbose >= 2:
            print('Searching for %s nearest neighbors' % (k_neighbors), flush=True)

        knn = pynndescent.NNDescent(data, metric=metric, n_neighbors=k_neighbors).query(data, k=k_neighbors)

        if self.verbose >= 2:
            print('k(=%s) nearest neighbors found. Constructing a NetworkX graph' % (k_neighbors), flush=True)

        # Adjacency matrix weighted by neighbor distances
        A = np.zeros((len(knn[0]), len(knn[0])))
        for i in range(len(knn[0])):
            A[i, knn[0][i]] = knn[1][i]

        G = nx.from_numpy_array(A)

        if self.verbose >= 2:
            print('Clustering the graph', flush=True)

        cellClusterIndex = pd.Series(community.best_partition(G)).sort_index().values
    else:
        data = df_xpca.values

        cellClusterIndex = self.clusteringFunction(n_clusters=self.nClusters).fit(data.T).labels_.astype(float).astype(str)

        if self.verbose >= 2:
            print(np.unique(cellClusterIndex, return_counts=True))

    if self.doFineClustering:
        fineCellClusterIndex = cellClusterIndex.copy()

        clusters = np.unique(cellClusterIndex)

        for cluster in clusters:
            cellsOfCluster = np.where(cellClusterIndex == cluster)[0]

            subData = data[:, cellsOfCluster]
            # Drop all-zero rows before co-clustering
            subData = subData[subData.sum(axis=1) > 0.]

            if len(cellsOfCluster) >= self.minSizeForFineClustering:
                try:
                    model = SpectralCoclustering(n_clusters=self.nFineClusters, random_state=self.random_state)
                    model.fit(subData.T)

                    tempCellClusterIndex = np.zeros((subData.T.shape[0],))
                    tempCellClusterIndex[:] = np.nan
                    for i, subCluster in enumerate(model.biclusters_[0]):
                        tempCellClusterIndex[np.where(subCluster)[0]] = i

                except Exception as exception:
                    if self.verbose >= 1:
                        print(exception)
                        print('Exception', subData.shape, cellsOfCluster.shape)

                    tempCellClusterIndex = np.zeros((subData.T.shape[0],))
            else:
                if self.verbose >= 2:
                    print('Small cluster', len(cellsOfCluster))

                tempCellClusterIndex = np.zeros((subData.T.shape[0],))

            # Reject the fine split if any sub-cluster is too small
            subSizes = np.unique(tempCellClusterIndex, return_counts=True)[1]
            if (subSizes < self.minSubclusterSize).any():
                if self.verbose >= 2:
                    print('Fine clusters too small (less than %s)' % (self.minSubclusterSize), subSizes)

                tempCellClusterIndex = np.zeros((subData.T.shape[0],))

            fineCellClusterIndex[cellsOfCluster] = np.array([str(cluster)[:-1] + label for label in tempCellClusterIndex.astype(int).astype(str)])

            if self.splitFineClusters:
                for fineCluster in np.unique(fineCellClusterIndex[cellsOfCluster]):
                    cellsOfFineCluster = cellsOfCluster[fineCellClusterIndex[cellsOfCluster] == fineCluster]

                    subSubData = data[:, cellsOfFineCluster]

                    M = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(subSubData.T, metric='euclidean'))

                    subLabels = np.zeros(M.shape[0]).astype(int)
                    subSubSize = self.subSplitSize

                    # Partition cells into chunks of about subSplitSize each;
                    # the last chunk absorbs the remainder
                    subClustersSizes = []
                    tempNum = 0
                    while tempNum < len(subLabels):
                        if tempNum + 2 * subSubSize <= len(subLabels):
                            subClustersSizes.append(subSubSize)
                            tempNum += subSubSize
                        else:
                            subClustersSizes.append(len(subLabels) - tempNum)
                            tempNum += len(subLabels) - tempNum

                    for ic, subClusterSize in enumerate(subClustersSizes):
                        # Pick the cell farthest (on average) from all others,
                        # assign its nearest neighbors to the same sub-split
                        chosenCell = np.nanmean(M, axis=0).argmax()
                        wh = M[chosenCell].argsort()[:subClusterSize - 1]
                        M[chosenCell, wh] = np.nan
                        subLabels[wh] = [ic] * len(wh)

                    prevLabels = fineCellClusterIndex[cellsOfFineCluster]
                    addLabels = ['.' + label for label in subLabels.astype(str)]
                    fineCellClusterIndex[cellsOfFineCluster] = np.array([prevLabel + addLabel for prevLabel, addLabel in zip(prevLabels, addLabels)])

        cellClusterIndex = fineCellClusterIndex

    # Replace any previous 'cluster' level before attaching the new one
    if 'cluster' in self._df_expr.columns.names:
        self._df_expr = self._df_expr.droplevel('cluster', axis=1)

    df_clusters = pd.DataFrame(data=cellClusterIndex, index=self._df_expr.columns)
    df_clusters.columns = ['cluster']
    df_clusters.to_hdf(self.fileHDFpath, key='df_clusters', mode='a', complevel=4, complib='zlib')

    self._df_expr = pd.concat([df_clusters, self._df_expr.T], sort=False, axis=1).reset_index().set_index(['batch', 'cell', 'cluster']).T

    return
def annotate(self, mapNonexpressedCelltypes = True):
    '''Produce cluster voting results, annotate cell types,
    and update marker expression with cell type labels

    Parameters:
        mapNonexpressedCelltypes: boolean, Default True
            If True then cell types coloring will be consistent across all datasets,
            regardless what cell types are annotated in all datasets
            for a given input marker list file.

    Returns:
        dictionary
            Voting results, a dictionary in form of:
            {cluster label: assigned cell type}

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        results = DCS.annotate()
    '''

    self.loadExpressionData()

    if self._df_expr is None:
        return

    self.prepareMarkers(expressedGenes=self._df_expr.index if mapNonexpressedCelltypes else None)

    df_marker_cell_type = pd.read_hdf(self.fileHDFpath, key='df_marker_cell_type')

    # Restrict expression to genes that appear in the marker list
    df_markers_expr = self._df_expr.loc[self._df_expr.index.intersection(df_marker_cell_type.index)]

    if self.verbose >= 1:
        print('Selected genes common with the marker list. Data shape:', df_markers_expr.shape, flush=True)

    methodsToUse = self.annotationMethod.split('-')

    # Run every requested scheme, in the fixed order pDCS, ratio, Hopfield.
    # This generic loop replaces the original exhaustive if/elif over all
    # method combinations and produces identical results in every case.
    schemes = [('pDCS', self.annotateWith_pDCS_Scheme),
               ('ratio', self.annotateWith_ratio_Scheme),
               ('Hopfield', self.annotateWith_Hopfield_Scheme)]

    resultsPerMethod = []
    for methodName, scheme in schemes:
        if methodName in methodsToUse:
            if self.verbose >= 1:
                print('\nCalculating results by %s method' % (methodName), flush = True)

            resultsPerMethod.append(list(scheme(df_markers_expr.copy(), df_marker_cell_type.T.copy())))

    if len(resultsPerMethod) == 0:
        if self.verbose >= 1:
            print('Annotation method %s not supported' % (self.annotationMethod), flush=True)

        return

    annotationResults = resultsPerMethod[0]

    if len(resultsPerMethod) > 1:
        # Combine the scores (element 2 of each result) of the individual
        # methods as their geometric mean, then zero-out entries below the
        # "unknown" confidence threshold. A single method is used as-is,
        # without thresholding, matching the original behavior.
        for otherResults in resultsPerMethod[1:]:
            annotationResults[2] *= otherResults[2]

        annotationResults[2] = np.power(annotationResults[2], 1. / len(resultsPerMethod))
        annotationResults[2][annotationResults[2] < self.thresholdForUnknown] = 0.

    annotationResults = self.recordAnnotationResults(*tuple(annotationResults))

    if self.verbose >= 1:
        print(annotationResults, flush=True)
        print('Updating the expression data with annotation results', flush=True)

    # Attach the per-cluster cell type label as a new column level
    df_markers_expr = df_markers_expr.T.reset_index().T
    df_markers_expr.loc['label'] = np.array([annotationResults[i] for i in df_markers_expr.loc['cluster']])
    df_markers_expr = df_markers_expr.T.set_index(['batch', 'cell', 'cluster', 'label']).T.apply(pd.to_numeric)
    df_markers_expr.to_hdf(self.fileHDFpath, key='df_markers_expr', mode='a', complevel=4, complib='zlib')

    return annotationResults
def process(self, dataIsNormalized = False, cleanData = True):
    '''Process data before using any annotation of visualization functions

    Runs the full pipeline: conversion/cleaning, QC measures, normalization,
    batch correction, projection, QC filtering, clustering, and recording.
    The step order is significant: QC cutoffs and clustering operate on the
    normalized, projected data.

    Parameters:
        dataIsNormalized: boolean, Default False
            Whether DCS.df_expr is normalized or not

        cleanData: boolean, Default True
            Whether to clean the data after conversion
            (only consulted when dataIsNormalized is False)

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.process()
    '''

    # Convert index to hugo names, clean data
    if not dataIsNormalized:
        self.convert()

        if cleanData:
            self.clean()

    # Calculate QC measures
    if self.doQualityControl:
        self.calculateQCmeasures()

    # Normalize and then correct for batch effects
    if not dataIsNormalized:
        self.normalize(median=self.medianScaleFactor)

    if self.toggleDoBatchCorrection:
        self.batchEffectCorrection()

    # Calculate PCA and 2D projection
    self.project()

    # Remove low quality cells
    if self.doQualityControl:
        self.qualityControl()

    # Cluster data, append cluster index to expression DataFrame
    self.cluster()

    # Compress and record DataFrame to dense matrix
    self.recordExpressionData()

    return
def visualize(self):
    '''Aggregate of visualization tools of this class.

    Makes every plot whose corresponding toggle attribute is enabled.

    Parameters:
        None

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.process()

        DCS.visualize()
    '''

    def announce(message):

        # Progress messages are only printed at verbosity level 2 and above
        if self.verbose >= 2:
            print(message, flush=True)

    # Voting results matrix
    if self.toggleMakeVotingResultsMatrixPlot:
        announce('Making voting results matrix plot')
        self.makeAnnotationResultsMatrixPlot()

    # Null distributions
    if self.toggleMakeHistogramNullDistributionPlot:
        announce('Making null distributions plot')
        self.makeHistogramNullDistributionPlot()

    # Mean marker expression
    if self.toggleMakeMarkerExpressionPlot:
        announce('Making marker expression plot')
        self.makeMarkerExpressionPlot()

    # 2D projection plots
    if self.toggleMakeProjectionPlotsQC and self.doQualityControl:
        self.makeProjectionPlotsQualityControl()

    if self.toggleMakeProjectionPlotClusters:
        self.makeProjectionPlotByClusters()

    if self.toggleMakeProjectionPlotBatches:
        self.makeProjectionPlotByBatches()

    if self.toggleMakeProjectionPlotAnnotatedClusters:
        self.makeProjectionPlotAnnotated()

    if self.toggleAnomalyScoresProjectionPlot:
        self.makeAnomalyScoresPlot()

    # Stacked barplot of cell type fractions
    if self.toggleMakeStackedBarplot:
        self.makeStackedBarplot(self.subclusteringName)

    # Projection plots of relative expression, one per marker
    if self.toggleGetMarkerSubplots:
        self.makeMarkerSubplots()

    return None
# Plotting user functions of class #################################################
def makeProjectionPlotAnnotated(self, **kwargs):
    '''Produce projection plot colored by cell types

    Parameters:
        Any parameters that function 'makeProjectionPlot' can accept

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.process()

        DCS.makeProjectionPlotAnnotated()
    '''

    import ast

    if self.verbose >= 3:
        print('Making projection plot by clusters with annotated labels')

    df_markers_expr = pd.read_hdf(self.fileHDFpath, key='df_markers_expr')

    df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')
    # Keep only the (batch, cell) columns present in the annotated expression table
    df_projection = df_projection[df_markers_expr.groupby(level=['batch', 'cell'], sort=False, axis=1).count().columns]

    # The colormap file is tab-separated: "<cell type>\t<color literal>" per line.
    # Use ast.literal_eval instead of eval: color values are plain Python
    # literals, and literal_eval cannot execute arbitrary code from the file.
    with open(os.path.join(self.saveDir, 'ColormapForCellTypes.txt'), 'r') as temp_file:
        colormap = {item.strip().split('\t')[0]: ast.literal_eval(item.strip().split('\t')[1]) for item in temp_file.readlines()}

    kwargs.setdefault('suffix', 'by_clusters_annotated')
    kwargs.setdefault('colormap', colormap)
    kwargs.setdefault('legend', False)

    fig = self.makeProjectionPlot(df_projection.values, df_markers_expr.columns.get_level_values('label'), **kwargs)

    return fig
def makeProjectionPlotByBatches(self, **kwargs):
    '''Produce projection plot colored by batches

    Parameters:
        Any parameters that function 'makeProjectionPlot' can accept

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.process()

        DCS.makeProjectionPlotByBatches()
    '''

    if self.verbose >= 3:
        print('Making projection plot by batches')

    df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')

    # Color each cell by the batch it belongs to
    batchLabels = df_projection.columns.get_level_values('batch').values

    for option, value in [('legend', True), ('labels', False), ('suffix', 'by_patients')]:
        kwargs.setdefault(option, value)

    return self.makeProjectionPlot(df_projection.values, batchLabels, **kwargs)
def makeProjectionPlotByClusters(self, **kwargs):
    '''Produce projection plot colored by clusters

    Parameters:
        Any parameters that function 'makeProjectionPlot' can accept

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.process()

        DCS.makeProjectionPlotByClusters()
    '''

    if self.verbose >= 3:
        print('Making projection plot by clusters')

    df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')
    df_clusters = pd.read_hdf(self.fileHDFpath, key='df_clusters').reindex(df_projection.columns)

    kwargs.setdefault('suffix', 'by_clusters')

    # Prefix each cluster label with 'Cluster #'; NaN entries (row != row)
    # are kept unchanged
    clusterLabels = []
    for row in df_clusters.values:
        clusterLabels.append('Cluster #%s' % (row[0]) if row == row else row)

    return self.makeProjectionPlot(df_projection.values, np.array(clusterLabels), **kwargs)
def makeProjectionPlotsQualityControl(self, **kwargs):
    '''Produce Quality Control projection plots

    Parameters:
        Any parameters that function 'makeProjectionPlot' can accept

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.process()

        DCS.makeProjectionPlotsQualityControl()
    '''

    def quantize(values, size = 100):

        # Snap values onto `size` evenly spaced bins to limit distinct colors
        bins = np.linspace(np.min(values), np.max(values), num=size)

        return bins[np.digitize(values, bins) - 1]

    if self.verbose >= 3:
        print('Making projection plots of QC', flush=True)

    df_QC = pd.read_hdf(self.fileHDFpath, key='QC')

    # When low-quality cells were removed, plot on the pre-QC projection so
    # that every cell (kept or dropped) is visible
    if self.toggleRemoveLowQualityCells:
        df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection_pre_QC')
    else:
        df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')

    goodQualityCells = pd.read_hdf(self.fileHDFpath, key='df_projection').columns

    kwargs.setdefault('legend', False)
    kwargs.setdefault('labels', False)
    kwargs.setdefault('colorbar', True)

    # One plot per QC measure, colored by the binned measure values
    for measure in ['number_of_genes', 'count_depth', 'fraction_of_mitochondrialGenes']:
        kwargs.update(suffix='by_%s' % (measure))
        self.makeProjectionPlot(df_projection.values, quantize(df_QC[measure].values), **kwargs)

    # Final plot: boolean coloring of cells that passed QC
    kwargs.update(suffix='by_is_quality_cell')
    kwargs.update(colorbar=False)
    self.makeProjectionPlot(df_projection.values, df_projection.columns.isin(goodQualityCells.get_level_values('cell'), level='cell'), **kwargs)

    return
def makeMarkerSubplots(self, **kwargs):
    '''Produce subplots on each marker and its expression on all clusters

    Parameters:
        Any parameters that function 'internalMakeMarkerSubplots' can accept

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.process()

        DCS.makeMarkerSubplots()
    '''

    df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')
    df_markers_expr = pd.read_hdf(self.fileHDFpath, key='df_markers_expr')

    # Align projection columns with the (batch, cell) pairs of the marker table
    cellsIndex = pd.MultiIndex.from_arrays([df_markers_expr.columns.get_level_values('batch'),
                                            df_markers_expr.columns.get_level_values('cell')])
    df_projection = df_projection[cellsIndex]

    # Map each hugo gene name to its aliases for figure annotation
    aliases = self.gnc.Convert(list(df_markers_expr.index), 'hugo', 'alias', returnUnknownString=False)
    hugo_cd_dict = dict(zip(df_markers_expr.index.values.tolist(), aliases))

    for option, value in [('analyzeBy', 'label'), ('NoFrameOnFigures', True),
                          ('HideClusterLabels', False), ('outlineClusters', True)]:
        kwargs.setdefault(option, value)

    self.internalMakeMarkerSubplots(df_markers_expr, df_projection.values, hugo_cd_dict, **kwargs)

    return None
def makeAnomalyScoresPlot(self, cells = 'All', suffix = '', noPlot = False, **kwargs):
    '''Make anomaly scores plot

    Parameters:
        cells: pandas.MultiIndex, Default 'All'
            Index of cells of interest

        suffix: str, Default ''
            Text appended to the name of the saved scores spreadsheet

        noPlot: boolean, Default False
            If True, only write the scores spreadsheet and skip the figure

        Any parameters that function 'makeProjectionPlot' can accept

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.process()

        cells = DCS.getCells(celltype='T cell')

        DCS.makeAnomalyScoresPlot(cells)
    '''

    if self._df_expr is None:
        self.loadExpressionData()

    if type(cells) is str:
        if cells != 'All':
            if self.verbose >= 1:
                print('"cells" value "%s" is unknown' % (cells))

            raise ValueError

        # 'All' expands to every (batch, cell) pair of the loaded expression
        cells = pd.MultiIndex.from_arrays([self._df_expr.columns.get_level_values('batch'),
                                           self._df_expr.columns.get_level_values('cell')], names=['batch', 'cell'])

    df_sel = self.getExprOfCells(cells)

    # Drop any extra column levels, keeping only (batch, cell)
    df_sel.columns = pd.MultiIndex.from_arrays([df_sel.columns.get_level_values('batch'),
                                                df_sel.columns.get_level_values('cell')], names=['batch', 'cell'])

    # Score the selected cells against themselves and persist the ranking
    scores = pd.DataFrame(index=df_sel.columns, data=self.getAnomalyScores(df_sel, df_sel))
    scores.columns = ['score']
    scores.sort_values(by='score', ascending=False).to_excel(os.path.join(self.saveDir, 'anomalyScores%s.xlsx' % (suffix)), merge_cells=False)

    if noPlot:
        return

    df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')
    orderedScores = scores.reindex(df_projection.columns).values.T[0]

    for option, value in [('suffix', 'by_anomaly_score'), ('legend', False),
                          ('labels', False), ('colorbar', True)]:
        kwargs.setdefault(option, value)

    return self.makeProjectionPlot(df_projection.values, orderedScores, **kwargs)
def makeIndividualGeneTtestPlot(self, gene, analyzeBy = 'label', **kwargs):
    '''Produce individual gene t-test plot of the two-tailed p-value.

    Parameters:
        gene: str
            Name of gene of interest

        analyzeBy: str, Default 'label'
            What level of labels to include.
            Other possible options are 'cluster' and 'celltype'

        Any parameters that function 'makeTtestPlot' can accept

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.makeIndividualGeneTtestPlot('SDC1')
    '''

    df_markers_expr = self.getExprOfGene(gene, analyzeBy=analyzeBy)

    if df_markers_expr is None:
        return None

    groups = np.unique(df_markers_expr.columns.get_level_values(analyzeBy).values)

    ttestStatistic = pd.DataFrame(index=groups, columns=groups)
    ttestpValue = pd.DataFrame(index=groups, columns=groups)

    # Pairwise two-sample t-tests over the non-zero expression values of each
    # group; group A's values are extracted once per outer iteration
    for groupA in groups:
        valuesA = df_markers_expr.xs(key=groupA, level=analyzeBy, axis=1).values.squeeze()
        valuesA = valuesA[np.where(valuesA != 0)]

        for groupB in groups:
            valuesB = df_markers_expr.xs(key=groupB, level=analyzeBy, axis=1).values.squeeze()
            valuesB = valuesB[np.where(valuesB != 0)]

            ttestStatistic.loc[groupA, groupB], ttestpValue.loc[groupA, groupB] = scipy.stats.ttest_ind(valuesA, valuesB)

    # Figure label: gene name with its known aliases, one per line
    aliases = self.gnc.Convert([gene], 'hugo', 'alias', returnUnknownString=False)[0]
    aliases = [aliases] if type(aliases) is str else aliases
    kwargs.setdefault('label', '%s\n(%s)' % (gene, ('\n').join(list(aliases))))

    return self.makeTtestPlot(ttestStatistic, ttestpValue, **kwargs)
def makeIndividualGeneExpressionPlot(self, genes, **kwargs):
    '''Produce individual gene expression plot on a 2D layout

    Parameters:
        genes: str, or list-like
            Name(s) of gene(s) of interest. E.g. 'CD4, CD33', 'PECAM1', ['CD4', 'CD33']

        Any parameters that function 'internalMakeMarkerSubplots' can accept

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.makeIndividualGeneExpressionPlot('CD4')
    '''

    for option, value in [('analyzeBy', 'cluster'), ('NoFrameOnFigures', True),
                          ('HideClusterLabels', False), ('outlineClusters', True)]:
        kwargs.setdefault(option, value)

    df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')

    # Accept a comma-separated string as well as a list of gene names
    if type(genes) is str:
        genes = genes.replace(' ', '').split(',')

    for gene in genes:
        df_markers_expr = self.getExprOfGene(gene, analyzeBy=kwargs['analyzeBy'])

        if df_markers_expr is None:
            continue

        hugo_cd_dict = {gene: self.gnc.Convert([gene], 'hugo', 'alias', returnUnknownString=False)[0]}

        cells = pd.MultiIndex.from_arrays([df_markers_expr.columns.get_level_values('batch'),
                                           df_markers_expr.columns.get_level_values('cell')],
                                          names=['batch', 'cell'])

        self.internalMakeMarkerSubplots(df_markers_expr,
                                        df_projection.reindex(cells, axis=1).values,
                                        hugo_cd_dict, **kwargs)

    return None
def makeHopfieldLandscapePlot(self, meshSamplingRate = 1000, plot3D = True, reuseData = False, **kwargs):
    '''Make and plot Hopfield landscape

    Parameters:
        meshSamplingRate: int, Default 1000
            Defines quality of sampling around attractor states

        plot3D: boolean, Default True
            Whether to plot a 3D (True) or 2D (False) figure

        reuseData: boolean, Default False
            Whether to attempt using precalculated data.

        Any parameters that function 'HopfieldLandscapePlot' or 'HopfieldLandscapePlot3D' can accept

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.makeHopfieldLandscapePlot()
    '''

    # NOTE(review): 'Hopfiled' is misspelled, but the directory name is kept
    # for compatibility with previously saved data
    dataPath = os.path.join(self.saveDir, 'Hopfiled_mode_1')

    # Fall back to recomputation when no cached data directory exists
    if reuseData and (not os.path.exists(dataPath)):
        reuseData = False

    if not reuseData:
        # Use the marker file's attractor states to propagate the Hopfield network
        df_attrs = self.readMarkerFile()
        self.propagateHopfield(xi=df_attrs.values, typesNames=df_attrs.columns.values, path=dataPath, meshSamplingRate=meshSamplingRate, mode=1)

    if plot3D:
        fig = self.HopfieldLandscapePlot3D(trPath=dataPath, **kwargs)
    else:
        fig = self.HopfieldLandscapePlot(trPath=dataPath, colorbarva=0.1, colorbarha=0.82, fontsize=17, **kwargs)

    return fig
# Extraction user functions of class ###############################################
def getAnomalyScores(self, trainingSet, testingSet, printResults = False):
    '''Function to get anomaly score of cells based on some reference set

    Parameters:
        trainingSet: pandas.DataFrame
            With cells to train isolation forest on

        testingSet: pandas.DataFrame
            With cells to score

        printResults: boolean, Default False
            Whether to print results

    Returns:
        1d numpy.array
            Anomaly score(s) of tested cell(s)

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        cutoff = DCS.getAnomalyScores(df_expr.iloc[:, 5:], df_expr.iloc[:, :5])
    '''

    if self.verbose >= 1:
        print('Calculating anomaly scores of the selected cells', flush=True)

    # Subsample size is capped at 256 and at the number of training cells
    maxSamples = np.min([trainingSet.shape[1] - 1, 256])
    forest = IsolationForest(n_jobs=self.availableCPUsCount, max_samples=maxSamples)

    if self.verbose >= 2:
        print('Training isolation forest')

    # Cells are columns in the DataFrame but samples (rows) for the forest
    forest.fit(trainingSet.values.T)

    if type(testingSet) is pd.Series:
        testingSet = testingSet.to_frame()

    if self.verbose >= 2:
        print('Scoring samples')

    # Negate so that larger values mean more anomalous cells
    scores = -forest.score_samples(testingSet.values.T)

    if printResults:
        for cellName, cellScore in zip(testingSet.columns, scores):
            print('Cell:', cellName, 'Score:', cellScore)

    return scores
def getHugoName(self, gene, printAliases = False):
    '''Get gene hugo name(s).

    Parameters:
        gene: str
            'hugo' or 'alias' name of a gene

        printAliases: boolean, Default False
            NOTE(review): not read anywhere in this implementation —
            confirm before relying on it or removing it

    Returns:
        str
            Hugo name if found, otherwise input name

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.getHugoName('CD138')
    '''

    # Delegate to the gene-name converter: alias -> hugo
    return self.gnc.Convert(gene, 'alias', 'hugo', returnUnknownString=False)
def getExprOfGene(self, gene, analyzeBy = 'cluster'):
    '''Get expression of a gene.
    Run this function only after function process()

    Parameters:
        gene: str
            Name of the gene of interest

        analyzeBy: str, Default 'cluster'
            What level of labels to include.
            Other possible options are 'label' and 'celltype'

    Returns:
        pandas.DataFrame
            With expression of the cells of interest

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.process()

        DCS.getExprOfGene('SDC1')
    '''

    fileName = self.fileHDFpath

    if not os.path.isfile(fileName):
        if self.verbose >= 1:
            print('Processed data file not found')
        return

    # The gene name is the last level of the df_expr row index
    try:
        df_markers_expr = pd.read_hdf(fileName, key='df_expr').xs(key=gene, axis=0, level=-1).T
    except Exception as exception:
        if self.verbose >= 2:
            print(exception)
            print('Gene %s not in index' % (gene))
        return

    df_markers_expr.index = [gene]

    # Reduce the columns to plain (batch, cell) pairs
    df_markers_expr.columns = pd.MultiIndex.from_arrays([df_markers_expr.columns.get_level_values('batch'), df_markers_expr.columns.get_level_values('cell')])

    # Restrict to the labelled cells; cells missing from df_expr get zero expression
    labelled = pd.read_hdf(self.fileHDFpath, key='df_markers_expr').columns
    columns = pd.MultiIndex.from_arrays([labelled.get_level_values('batch'), labelled.get_level_values('cell')])
    df_markers_expr = df_markers_expr.reindex(columns, axis=1).fillna(0.)

    if analyzeBy == 'celltype':
        # Cell type is the 'label' value with the trailing ' #<cluster>' suffix stripped
        analyzeBy = 'label'
        columns = labelled.to_series().reset_index().set_index(['batch', 'cell'])[analyzeBy].loc[df_markers_expr.columns].reset_index().set_index(['batch', 'cell', analyzeBy]).index

        df_markers_expr.columns = pd.MultiIndex.from_arrays([columns.get_level_values('batch'),
                                                             columns.get_level_values('cell'),
                                                             columns.get_level_values(analyzeBy).str.split(' #', expand=True).get_level_values(0)], names=['batch', 'cell', 'celltype'])
    else:
        # Attach the requested level ('cluster' or 'label') as the third column level
        df_markers_expr.columns = labelled.to_series().reset_index().set_index(['batch', 'cell'])[analyzeBy].loc[df_markers_expr.columns].reset_index().set_index(['batch', 'cell', analyzeBy]).index

    return df_markers_expr
def getExprOfCells(self, cells):
    '''Get expression of a set of cells.
    Run this function only after function process()

    Parameters:
        cells: pandas.MultiIndex
            2-level Index of cells of interest, must include levels 'batch' and 'cell'

    Returns:
        pandas.DataFrame
            With expression of the cells of interest

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.process()

        DCS.getExprOfCells(cells)
    '''

    if self._df_expr is None:
        self.loadExpressionData()

    # (batch, cell) pairs of all currently loaded cells
    index = pd.MultiIndex.from_arrays([self._df_expr.columns.get_level_values('batch'),
                                       self._df_expr.columns.get_level_values('cell') ], names=['batch', 'cell'])

    # MultiIndex.reindex returns (new_index, indexer); the indexer is None
    # when 'cells' already matches the full index, i.e. all cells requested
    if cells.reindex(index)[1] is None:
        if self.verbose >= 2:
            print('Selected expression of all cells')

        # NOTE(review): returns self.df_expr (presumably a property wrapping
        # _df_expr) rather than _df_expr directly — confirm they are equivalent
        return self.df_expr

    # Keep only columns whose (batch, cell) pair was found in 'cells'
    # (indexer entries >= 0 mark matches; -1 marks misses)
    return self._df_expr[self._df_expr.columns[cells.reindex(index)[1] > -1]]
def getCells(self, celltype = None, clusterIndex = None, clusterName = None):
    '''Get cell annotations in a form of pandas.Series

    Parameters:
        celltype: str, Default None
            Cell type to extract

        clusterIndex: int, Default None
            Index of the cluster to extract

        clusterName: str, Default None
            Name of the cluster to extract, e.g. 'Cluster #1'

    Returns:
        pandas.MultiIndex
            Index of labelled cells

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.process()

        labels = DCS.getCells()
    '''

    def select(se, condition, description):

        # Shared selection-and-report logic for all three query modes
        if not condition.any():
            if self.verbose >= 1:
                print('No cells found')

            return None

        selectedCells = se[condition].index

        if self.verbose >= 2:
            print('Selected %s cells %s' % (len(selectedCells), description))

        return selectedCells

    df = pd.read_hdf(self.fileHDFpath, key='df_markers_expr').columns.to_frame()

    if not clusterIndex is None:
        se = df.set_index(['batch', 'cell'])['cluster']

        return select(se, se == clusterIndex, 'from cluster: %s' % (clusterIndex))

    if not clusterName is None:
        se = df.set_index(['batch', 'cell'])['cluster']

        # The fragment after '#' identifies the cluster.
        # NOTE(review): eval of the name fragment — safe only for trusted input
        return select(se, se == eval(clusterName.split('#')[1]), 'from cluster: %s' % (clusterName))

    se = df.set_index(['batch', 'cell'])['label']

    if celltype is None and clusterIndex is None:
        if self.verbose >= 1:
            print('Available cell types:', np.unique(se.str.split(' #', expand=True)[0].values))
            print('Note: To get a specific cell type call this function with a specified cell type')

        return se

    return select(se, se.str.find(celltype) != -1, 'of type: %s' % (celltype))
def getIndexOfGoodQualityCells(self, QCplotsSubDir = 'QC_plots', **kwargs):
    '''Get index of cells that satisfy the QC criteria

    Parameters:
        QCplotsSubDir: str, Default 'QC_plots'
            Subdirectory of saveDir where QC histogram plots are written

        Any parameters that function 'makeQualityControlHistogramPlot' can accept

    Returns:
        pandas.Index
            Index of cells

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        index = DCS.getIndexOfGoodQualityCells()
    '''

    df_QC = pd.read_hdf(self.fileHDFpath, key='QC')

    # Cells explicitly excluded from QC bypass the cutoffs and are re-added
    # at the end
    if not self.excludedFromQC is None:
        df_QC = df_QC.loc[df_QC.index.difference(self.excludedFromQC)]

    plotsDir = os.path.join(self.saveDir, QCplotsSubDir, '')

    kwargs.setdefault('MakeHistogramPlot', True)

    # Calculate cutoffs
    cutoff_count_depth = self.getQualityControlCutoff(df_QC['count_depth'], self.countDepthCutoffQC, plotPathAndName=os.path.join(plotsDir, '%s_count_depth' % (self.dataName)), precut=self.countDepthPrecutQC, **kwargs)
    cutoff_number_of_genes = self.getQualityControlCutoff(df_QC['number_of_genes'], self.numberOfGenesCutoffQC, plotPathAndName=os.path.join(plotsDir, '%s_number_of_genes' % (self.dataName)), precut=self.numberOfGenesPrecutQC, **kwargs)
    cutoff_fraction_of_mitochondrialGenes = self.getQualityControlCutoff(df_QC['fraction_of_mitochondrialGenes'], self.mitochondrialGenesCutoffQC, plotPathAndName=os.path.join(plotsDir, '%s_fraction_of_mitochondrialGenes' % (self.dataName)), mito=True, **kwargs)

    # A good-quality cell exceeds the count depth and gene-count cutoffs and
    # stays below the mitochondrial fraction cutoff.
    # (The original code's 'isGoodQuality' column assignment was dead: it was
    # never persisted, and indexing its Series index with the mask is the
    # same as indexing df_QC.index directly.)
    mask = (df_QC['count_depth'] > cutoff_count_depth) & \
           (df_QC['number_of_genes'] > cutoff_number_of_genes) & \
           (df_QC['fraction_of_mitochondrialGenes'] < cutoff_fraction_of_mitochondrialGenes)

    index = df_QC.index[mask]

    if not self.excludedFromQC is None:
        index = pd.concat([pd.Series(index=index), pd.Series(index=self.excludedFromQC)]).index

    return index.sort_values()
def getQualityControlCutoff(self, se, cutoff, precut = 1., mito = False, MakeHistogramPlot = True, **kwargs):
    '''Function to calculate QC quality cutoff

    Parameters:
        se: pandas.Series
            With data to analyze

        cutoff: float
            Parameter for calculating the quality control cutoff

        precut: float, Default 1.
            Pre-filter threshold used when self.precutQC is enabled
            (non-mito measures only)

        mito: boolean, Default False
            Whether the analysis of mitochondrial genes fraction

        plotPathAndName: str, Default None
            Text to include in the figure title and file name

        MakeHistogramPlot: boolean, Default True
            Whether to make a histogram plot

        Any parameters that function 'makeQualityControlHistogramPlot' can accept

    Returns:
        float
            Cutoff value

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        cutoff = DCS.getQualityControlCutoff(se)
    '''

    # Drop NaN entries before computing any statistics
    subset = se[~np.isnan(se.values)].values

    if mito:
        median = np.round(np.median(subset),4)
        std = np.round(np.std(subset),4)

        # Build a smoothed density estimate of the mitochondrial fraction on [0, 0.3]:
        # histogram -> spline interpolation -> Savitzky-Golay smoothing
        hist_of_subset = scipy.stats.rv_histogram(np.histogram(subset, bins=100, range=(0, 0.3)))
        hist_data = hist_of_subset._hpdf / 100
        hist_bins = hist_of_subset._hbins

        xs = np.linspace(hist_bins[0], hist_bins[-1], 1000)
        spline_data = np.vstack((xs, UnivariateSpline(hist_bins, hist_data[:-1], k=5, s=0)(xs))).T
        smoothed = scipy.signal.savgol_filter(spline_data.T[1], 101, 3)

        # Take the line through the smoothed density at x1 (just above the median)
        # and x2 (just above median + cutoff*std), and use its x-intercept
        # (x1 - y1*(x2-x1)/(y2-y1)) as the candidate cutoff
        x1 = spline_data.T[0][np.where(spline_data.T[0] > median)[0][0]]
        y1 = smoothed[np.where(spline_data.T[0] > median)[0][0]]

        wh = np.where(spline_data.T[0] > (median + cutoff * std))[0][0]
        x2 = spline_data.T[0][wh]
        y2 = smoothed[wh]

        # Clamp the candidate into [median + 1*std, median + 3*std]
        cutoff = min(median + 3. * std, x1 - y1 * (x2 - x1) / (y2 - y1))
        cutoff = max(median + 1. * std, cutoff)
    else:
        if self.precutQC:
            # Scale the median of values above 'precut' by 'cutoff'
            cutoff = int(cutoff * np.median(subset[subset > precut]))
            # NOTE(review): min() caps the cutoff at 2*precut rather than
            # flooring it — confirm this is the intended direction
            cutoff = min(cutoff, 2 * precut)
        else:
            cutoff = int(cutoff * np.median(subset))

    kwargs.setdefault('mito', mito)

    if MakeHistogramPlot:
        self.makeQualityControlHistogramPlot(subset, cutoff, **kwargs)

    return cutoff
def getCountsDataframe(self, se1, se2, tagForMissing = 'N/A'):

    '''Cross-tabulate (count overlaps of) two pandas.Series.

    Parameters:
        se1: pandas.Series
            Series with the first set of items

        se2: pandas.Series
            Series with the second set of items

        tagForMissing: str, Default 'N/A'
            Label to assign to non-overlapping items

    Returns:
        pandas.DataFrame
            Contains counts

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        df = DCS.getCountsDataframe(se1, se2)
    '''

    aligned = self.alignSeries(se1, se2, tagForMissing)

    # Count occurrences of each 'se1' label within each 'se2' group
    counts = dict()
    for label, sub in aligned.reset_index(drop=True).set_index('se2').groupby('se2'):
        counts[label] = {k: len(v) for k, v in sub.groupby(by='se1').groups.items()}

    df = pd.DataFrame.from_dict(counts).fillna(0.0).astype(int)

    def pushTagLast(frame):
        # Reorder rows so that the 'missing' tag, if present, comes last
        if tagForMissing not in frame.index:
            return frame
        keep = frame.iloc[np.where(frame.index != tagForMissing)[0]]
        tail = frame.iloc[np.where(frame.index == tagForMissing)[0]]
        return pd.concat([keep, tail], axis=0, sort=False)

    # Apply to rows and, via transposition, to columns as well
    return pushTagLast(pushTagLast(df.T).T)
def getNewMarkerGenes(self, cluster = None, top = 100, zScoreCutoff = None, removeUnknown = False, **kwargs):

    '''Extract new marker genes based on the cluster annotations

    Parameters:
        cluster: int, Default None
            Cluster #, if provided genes of only this cluster will be returned

        top: int, Default 100
            Upper bound for number of new markers per cell type

        zScoreCutoff: float, Default None
            Lower bound for a marker z-score to be significant.
            If None, the instance attribute 'zScoreCutoff' is used

        removeUnknown: boolean, Default False
            Whether to remove type "Unknown"

        Any parameters that function 'makePlotOfNewMarkers' can accept

    Returns:
        None (or, when 'cluster' is given, a dictionary with keys
        'genes' and 'zscore' for that cluster)

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.getNewMarkerGenes()
    '''

    if zScoreCutoff is None:
        zScoreCutoff = self.zScoreCutoff

    if self._df_expr is None:
        self.loadExpressionData()

    if self._df_expr is None:
        return

    # Per-cluster centroids of the full expression matrix
    df_gene_cluster_centroids = self._df_expr.groupby(level=['cluster'], sort=True, axis=1).mean()

    # Merge clusters of the same predicted cell type, weighting each
    # cluster centroid by the cluster size
    df_gene_cluster_centroids_merged = pd.DataFrame()

    df_votingResults = pd.read_excel(os.path.join(self.saveDir, self.dataName + '_annotation.xlsx'), sheet_name='z-scores', index_col='cluster')[['# cells in cluster', 'Predicted cell type']]

    groups = pd.concat([df_votingResults['# cells in cluster'], df_votingResults['Predicted cell type'].str.split(' #', expand=True)[0]], sort=False, axis=1).reset_index().set_index(0).groupby(by=0)

    for group in groups:
        se_cluster_size = group[1].reset_index().set_index('cluster')['# cells in cluster']

        se_type = (df_gene_cluster_centroids[se_cluster_size.index] * se_cluster_size.values).sum(axis=1) / se_cluster_size.values.sum()
        se_type.name = group[0]

        df_gene_cluster_centroids_merged = pd.concat([df_gene_cluster_centroids_merged, se_type], sort=False, axis=1)

    df_gene_cluster_centroids_merged = df_gene_cluster_centroids_merged.apply(self.zScoreOfSeries, axis=1)

    if not cluster is None:
        # Single-cluster query: return the top genes of that cluster
        clusterGenes = df_gene_cluster_centroids[cluster].sort_values(ascending=False)
        clusterGenes = clusterGenes[clusterGenes >= zScoreCutoff].iloc[:top]

        return {'genes': clusterGenes.index.values.tolist(), 'zscore': clusterGenes.values.tolist()}

    if removeUnknown and self.nameForUnknown in df_gene_cluster_centroids_merged.columns:
        df_gene_cluster_centroids_merged.drop(columns=[self.nameForUnknown], inplace=True)

    df_new_marker_genes = df_gene_cluster_centroids_merged.T

    df_new_marker_genes[df_new_marker_genes < 0.] = 0.

    if self.verbose >= 2:
        print('New marker cell type shape:', df_new_marker_genes.shape)

    # Keep only genes, then only cell types, passing the z-score cutoff
    df_new_marker_genes = df_new_marker_genes.T.loc[df_new_marker_genes.max(axis=0) >= zScoreCutoff].T

    if self.verbose >= 2:
        print('New marker cell type shape:', df_new_marker_genes.shape)

    df_new_marker_genes = df_new_marker_genes.loc[df_new_marker_genes.max(axis=1) >= zScoreCutoff]

    if self.verbose >= 2:
        print('New marker cell type shape:', df_new_marker_genes.shape)

    df_new_marker_list = pd.DataFrame()

    for i, celltype in enumerate(df_new_marker_genes.index.values):
        # 'scores' avoids shadowing the builtin 'all'
        scores = df_new_marker_genes.loc[celltype]
        sel = scores[scores > 1.0].sort_values()[:top]

        if self.verbose >= 3:
            print('Candidates of %s:' % (celltype), len(sel))

        df_new_marker_list = pd.concat([df_new_marker_list, sel], sort=False, axis=1)

    df_new_marker_list = df_new_marker_list.fillna(0.).sort_index()
    df_new_marker_list[df_new_marker_list > 0.] = 1.0

    if self.nameForUnknown in df_new_marker_list.columns:
        df_new_marker_list.drop(columns=[self.nameForUnknown], inplace=True)

    df_new_marker_list.index.name = 'Marker'

    fileName = os.path.join(self.saveDir, 'new_markers.xlsx')

    if self.verbose >= 3:
        print('Recording results to:', fileName)

    writer = pd.ExcelWriter(fileName)
    df_new_marker_list.to_excel(writer, 'MarkerCellType')

    df_temp = pd.Series(data=df_new_marker_list.columns, index=df_new_marker_list.columns)
    df_temp.name = 'CellTypeGrouped'
    df_temp.index.name = 'CellType'
    df_temp.to_excel(writer, 'CellTypesGrouped')

    # ExcelWriter.save() was removed in pandas 2.0; close() saves the
    # workbook and exists in all pandas versions
    writer.close()

    df_marker_cell_type = pd.read_excel(os.path.join(self.saveDir, self.dataName + '_annotation.xlsx'),
                                        sheet_name='Marker cell type weight matrix', index_col=0).T

    df_new_marker_genes = df_new_marker_genes[df_new_marker_list.index]

    self.makePlotOfNewMarkers(df_marker_cell_type, df_new_marker_genes, **kwargs)

    return None
# Annotation functions of class ####################################################
@classmethod
def calculateV(cls, args):

    '''Calculate the voting scores (celltypes by clusters)

    Parameters:
        args: tuple
            Tuple of sub-arguments

            df_M: pandas.DataFrame
                Marker cell type DataFrame

            df_X: pandas.DataFrame
                Markers expression DataFrame

            cluster_index: 1d numpy.array
                Clustering index

            cutoff: float
                Significance cutoff, i.e. a threshold for a given marker to be significant

            giveSignificant: boolean
                Whether to return the significance matrix along with the scores

            removeLowQCscores: boolean
                Whether to remove low quality scores,
                i.e. those with less than 10% of markers that are supporting

    Returns:
        pandas.Series (and significance pandas.DataFrame when requested)
            Contains voting scores per celltype per cluster

    Usage:
        Function is used internally.

        df = calculateV((df_M, df_X, cluster_index, 0.3, False, True))
    '''

    df_M, df_X, cluster_index, cutoff, giveSignificant, removeLowQCscores = args

    # Per-cluster centroids of marker expression
    df_Y = pd.DataFrame(data=df_X.values, index=df_X.index, columns=cluster_index).T.groupby(level=0, sort=True).mean().T

    # Binarize: a marker is significant in a cluster when its z-score
    # across clusters exceeds the cutoff
    means = np.mean(df_Y.values, axis=1)[:, None]
    stds = np.std(df_Y.values, axis=1)[:, None]
    df_Z = 1. * (((df_Y - means) / stds) > cutoff)

    df_V = pd.Series(df_M.dot(df_Z).stack().fillna(0.))

    # Remove scores with small number of supporting markers
    if removeLowQCscores:
        hits = (1. * (df_M > 0.)).dot(1. * (df_Z > 0.))
        required = (0.1 * (df_M > 0.).sum(axis=1)[:, None]).astype(int)
        df_V[pd.Series((hits < required).stack().fillna(0.))] = 0.

    if giveSignificant:
        return df_V, df_Z

    return df_V
def annotateWith_pDCS_Scheme(self, df_markers_expr, df_marker_cell_type):

    '''Produce cluster annotation results (pDCS scheme): voting scores of
    the real clustering are compared against a null distribution built from
    random cluster configurations and converted to z-scores.

    Parameters:
        df_markers_expr: pandas.DataFrame
            Data with marker genes by cells expression

        df_marker_cell_type: pandas.DataFrame
            Data with marker genes by cell types

    Returns:
        tuple
            (df_marker_cell_type, df_markers_expr, df_L, df_V,
            dict_expressed_markers, df_null_distributions)

    Usage:
        Function should be called internally only
    '''

    # Split the weight matrix into positive and negative parts and normalize
    # each column by its count of positive (respectively negative) markers
    df_positive = df_marker_cell_type.copy()
    df_negative = df_marker_cell_type.copy()
    df_positive[df_marker_cell_type < .0] = 0.
    df_negative[df_marker_cell_type > .0] = 0.
    df_positive /= (df_positive > 0.).sum(axis=0).fillna(1.).replace(0., 1.).replace(0, 1.)
    df_negative /= (df_negative < 0.).sum(axis=0).fillna(1.).replace(0., 1.).replace(0, 1.)
    df_marker_cell_type = df_positive + df_negative

    if self.verbose >= 2:
        print(df_marker_cell_type.shape)

    # Align df_markers_expr and df_marker_cell_type
    df_markers_expr.sort_index(inplace=True, axis=0)
    df_marker_cell_type.sort_index(inplace=True, axis=1)

    # Calculate score (Vkc) for the best clustering
    temp = self.calculateV((df_marker_cell_type,
                            df_markers_expr,
                            df_markers_expr.columns.get_level_values('cluster').values,
                            self.zScoreCutoff, True, self.removeLowQualityScores))

    df_V = temp[0].unstack()
    df_V.index.name = None
    df_Z_best = temp[1]

    # Markers significantly expressed in each cluster
    dict_expressed_markers = {cluster:df_Z_best.index[df_Z_best[cluster] > 0].values for cluster in df_Z_best}

    # Fixed seed makes the null distribution reproducible
    np.random.seed(0)

    # Re-encode cluster labels (column level 2) as integers 0..n-1 for sampling
    df_markers_expr_copy = df_markers_expr.copy()
    uclusters = np.unique(df_markers_expr_copy.columns.get_level_values(2).astype(str))
    uindex = pd.Series(df_markers_expr_copy.columns.get_level_values(2).astype(str)).replace(dict(zip(uclusters, range(len(uclusters))))).values
    df_markers_expr_copy.columns = pd.MultiIndex.from_arrays([df_markers_expr_copy.columns.get_level_values(0),
                                                              df_markers_expr_copy.columns.get_level_values(1),
                                                              uindex], names=df_markers_expr_copy.columns.names)

    # Round the number of random samples to a whole multiple of the batch size
    batch_size = self.availableCPUsCount * 80

    if self.nSamples_pDCS < batch_size:
        self.nSamples_pDCS = batch_size
        if self.verbose >= 2:
            print('Distribution size too small. Increasing it to minimum batch size: %s' % (self.nSamples_pDCS))

    N_batches = int(self.nSamples_pDCS / batch_size)

    if self.nSamples_pDCS != batch_size * N_batches:
        self.nSamples_pDCS = batch_size * N_batches
        if self.verbose >= 2:
            print('Adjusting distribution size: %s' % (self.nSamples_pDCS))

    # Generate random cluster index
    randomClusterIndex = np.vstack([np.random.choice(df_markers_expr_copy.columns.get_level_values('cluster'), size=df_markers_expr_copy.shape[1], replace=False) for i in range(self.nSamples_pDCS)]).astype(int)

    # Generate random cluster configurations and calculate scores (Pkc) of those
    if self.verbose >= 2:
        print('Generating null distribution')

    def process_batch(batch_range):
        # Score one batch of random configurations in a multiprocessing pool
        if self.verbose >= 2:
            print('\t', batch_range)

        tuples = [(df_marker_cell_type, df_markers_expr_copy, randomClusterIndex[i], self.zScoreCutoff, False, False) for i in batch_range]

        pool = multiprocessing.Pool(processes = self.availableCPUsCount)
        temp_random_df_V = pd.concat(pool.map(self.calculateV, tuples), sort=False, axis=1)
        pool.close()
        pool.join()

        return temp_random_df_V

    random_df_V = pd.concat([process_batch(range(i * batch_size,(i + 1) * batch_size)) for i in range(N_batches)], sort=False, axis=1)

    # Map the integer cluster labels back to the original labels
    random_df_V.index = pd.MultiIndex.from_arrays([random_df_V.index.get_level_values(0),
                                                   pd.Series(random_df_V.index.get_level_values(1)).replace(dict(zip(range(len(uclusters)), uclusters))).values],
                                                  names=[random_df_V.index.names[0], 'cluster'])

    # Zero scores are excluded from the null distribution
    random_df_V = random_df_V.replace(0., np.nan)
    min_value = np.nanmin(random_df_V.values)
    max_value = np.nanmax(random_df_V.values)

    if self.verbose >= 2:
        print('Min:', min_value, '\t', 'Max:', max_value)

    Nbins = 300

    # Calculate null distribution histograms data for plots
    df_null_distributions = random_df_V.apply(lambda s: scipy.stats.rv_histogram(np.histogram(s, bins=Nbins, range=(min_value,max_value)))._hpdf / Nbins, axis=1).apply(pd.Series).T
    df_null_distributions.columns.names = ['CellType', 'Cluster']
    df_null_distributions.index = [min_value] + (scipy.stats.rv_histogram(np.histogram(random_df_V.iloc[0], bins=Nbins, range=(min_value,max_value)))._hbins).tolist()

    # Calculate z-score (Lkc) for the best clustering
    if self.verbose >= 2:
        print('Processing voting results')

    # Zero-variance entries get sigma = inf so their z-score is 0
    sigma = pd.Series(data=np.nanstd(random_df_V.values, axis=1), index=random_df_V.index).replace(0., np.inf).unstack()
    mean = pd.Series(data=np.nanmean(random_df_V.values, axis=1), index=random_df_V.index).unstack()
    df_L = (df_V - mean) / sigma

    # Scores below the threshold, or with negative raw score, go to zero
    df_L[df_L < self.thresholdForUnknown_pDCS] = 0.
    df_L[df_V < 0.] = 0.

    return (df_marker_cell_type, df_markers_expr, df_L, df_V, dict_expressed_markers, df_null_distributions)
def annotateWith_ratio_Scheme(self, df_markers_expr, df_marker_cell_type):

    '''Produce cluster annotation results with the ratio scheme

    Parameters:
        df_markers_expr: pandas.DataFrame
            Data with marker genes by cells expression

        df_marker_cell_type: pandas.DataFrame
            Data with marker genes by cell types

    Returns:
        tuple

    Usage:
        Function should be called internally only
    '''

    # Normalize positive and negative marker weights independently,
    # each column by its count of positive (negative) markers
    df_positive = df_marker_cell_type.copy()
    df_negative = df_marker_cell_type.copy()
    df_positive[df_marker_cell_type < .0] = 0.
    df_negative[df_marker_cell_type > .0] = 0.
    df_positive /= (df_positive > 0.).sum(axis=0).fillna(1.).replace(0., 1.).replace(0, 1.)
    df_negative /= (df_negative < 0.).sum(axis=0).fillna(1.).replace(0., 1.).replace(0, 1.)
    df_marker_cell_type = df_positive + df_negative

    if self.verbose >= 2:
        print(df_marker_cell_type.shape)

    # Align df_markers_expr and df_marker_cell_type
    df_markers_expr.sort_index(inplace=True, axis=0)
    df_marker_cell_type.sort_index(inplace=True, axis=1)

    # Calculate score (Vkc) for the best clustering
    scores, df_Z_best = self.calculateV((df_marker_cell_type,
                                         df_markers_expr,
                                         df_markers_expr.columns.get_level_values('cluster').values,
                                         self.zScoreCutoff, True, self.removeLowQualityScores))

    df_V = scores.unstack()
    df_V.index.name = None

    # Markers significantly expressed in each cluster
    dict_expressed_markers = {cluster: df_Z_best.index[df_Z_best[cluster] > 0].values for cluster in df_Z_best}

    # Scale scores by the total positive weight available to each cell type
    df_marker_cell_type_full = self.readMarkerFile().T.loc[df_marker_cell_type.index]
    df_marker_cell_type_full[df_marker_cell_type_full < .0] = 0.
    df_marker_cell_type_full /= (df_marker_cell_type_full > 0.).sum(axis=0).fillna(1.)

    df_L = df_V / df_marker_cell_type_full.sum(axis=1).values[:, None]

    # Scores below the threshold, or with negative raw score, go to zero
    df_L[df_L < self.thresholdForUnknown_ratio] = 0.
    df_L[df_V < 0.] = 0.

    return (df_marker_cell_type, df_markers_expr, df_L, df_V, dict_expressed_markers, None)
def annotateWith_Hopfield_Scheme(self, df_markers_expr, df_marker_cell_type):

    '''Produce cluster annotation results by repeated Hopfield-network voting

    Parameters:
        df_markers_expr: pandas.DataFrame
            Markers expression DataFrame

        df_marker_cell_type: pandas.DataFrame
            Marker cell type DataFrame

    Returns:
        tuple

    Usage:
        Function should be called internally only
    '''

    df_marker_cell_type = df_marker_cell_type.T

    # Fixed seed makes the stochastic Hopfield voting reproducible
    np.random.seed(0)

    # The scheme does not account for negative markers.
    # Therefore remove them
    df_marker_cell_type[df_marker_cell_type > 0.] = 1.
    df_marker_cell_type[df_marker_cell_type < 0.] = 0.

    # Keep markers that have +1 or -1 for at least one cell type
    df_marker_cell_type = df_marker_cell_type.loc[np.abs(df_marker_cell_type).sum(axis=1) > 0]

    # Check and keep only cell types with at least a minimum number of +1 markers
    df_marker_cell_type = df_marker_cell_type[df_marker_cell_type.columns[np.abs(df_marker_cell_type).sum(axis=0) > self.minimumNumberOfMarkersPerCelltype]]

    if self.verbose >= 2:
        print(df_marker_cell_type.shape)

    # Remove unused markers
    df_marker_cell_type = df_marker_cell_type.loc[np.abs(df_marker_cell_type).sum(axis=1) > 0]

    # Normalize marker/celltype matrix in both directions by counts.
    # Note that order of the following two pairs of lines is irrelevant
    df_marker_cell_type[df_marker_cell_type > 0.] /= (df_marker_cell_type[df_marker_cell_type > 0.] > 0).sum(axis=1).replace(0, 1.).values[:, None]
    df_marker_cell_type[df_marker_cell_type > 0.] /= (df_marker_cell_type[df_marker_cell_type > 0.] > 0).sum(axis=0).replace(0, 1.).values
    df_marker_cell_type[df_marker_cell_type < 0.] /= (df_marker_cell_type[df_marker_cell_type < 0.] < 0).sum(axis=1).replace(0, 1.).values[:, None]
    df_marker_cell_type[df_marker_cell_type < 0.] /= (df_marker_cell_type[df_marker_cell_type < 0.] < 0).sum(axis=0).replace(0, 1.).values

    if self.verbose >= 2:
        print(df_marker_cell_type.shape)

    # Order gene expression index in the same order as markers index
    df_markers_expr = df_markers_expr.loc[df_marker_cell_type.index]

    df_expr_cluster_centroids = df_markers_expr.groupby(level='cluster', axis=1, sort=False).mean()

    df_Z = df_expr_cluster_centroids.copy().apply(self.zScoreOfSeries, axis=1)
    df_sig = df_Z > self.zScoreCutoff

    dict_expressed_markers = {cluster: df_sig.index[df_sig[cluster] > 0].values for cluster in df_sig}

    if self.useUnderlyingNetwork:
        try:
            underlyingNetwork = self.getSubnetworkOfPCN(df_marker_cell_type.index.values, min_shared_first_targets=1)

            if self.verbose >= 3:
                print('Recording underlying network', flush=True)

            underlyingNetwork.to_hdf(self.fileHDFpath, key='df_underlyingNetwork', mode='a', complevel=4, complib='zlib')
            underlyingNetwork = underlyingNetwork.values
        except Exception as exception:
            # Best effort: fall back to running without the underlying network
            if self.verbose >= 1:
                print(exception)
                print('Not using underlying network connection')

            underlyingNetwork = None
    else:
        underlyingNetwork = None

    if self.verbose >= 2:
        print('Generating Hopfield networks:', flush=True)

    # np.int was removed in NumPy 1.24; use the builtin int instead.
    # max(1, ...) guards against modulo-by-zero when nSamples_Hopfield < 10
    progressStep = max(1, int(self.nSamples_Hopfield / 10))

    listOfSeries = []

    for i in range(self.nSamples_Hopfield):
        if i % progressStep == 0:
            if self.verbose >= 2:
                print('\n%s%%' % (int(100 * i / self.nSamples_Hopfield)), end='\t', flush=True)

        # Run Hopfield network dynamics
        hop = self.propagateHopfield(sigma=df_sig.values * 2. - 1.,
                                     xi=df_marker_cell_type.values,
                                     typesNames=df_marker_cell_type.columns.values, underlyingNetwork=underlyingNetwork,
                                     clustersNames=df_sig.columns.values, id = i, T = self.HopfieldTemperature,
                                     path = os.path.join(self.saveDir, 'HopfieldTrajectories'),
                                     recordTrajectories = False, verbose=self.verbose)

        # Determine top scores
        res = np.argmax(hop, axis=0)

        # Best votes that are below threshold are set to Unknown
        res[np.where(np.max(hop, axis=0) < 10. ** -12)] = len(df_marker_cell_type.columns)

        listOfSeries.append(res)

    if self.verbose >= 2:
        print(flush=True)

    # Frequency of each winning attractor (cell type) per cluster
    r = np.vstack(listOfSeries)
    df_L = pd.DataFrame(index=list(range(len(df_marker_cell_type.columns) + 1)), columns=df_sig.columns).fillna(0.)
    for i in range(len(df_sig.columns)):
        w = np.unique(r[:, i], return_counts=True)
        for j in range(len(w[0])):
            df_L.iloc[w[0][j], i] = w[1][j] / len(r)

    df_L[df_L < self.thresholdForUnknown_Hopfield] = 0.
    df_L.iloc[-1, :] = 0.9 * self.thresholdForUnknown_Hopfield

    # Best results are chosen as highest frequency cell type for each cluster
    se_celltypes = pd.Series(data = np.argmax(df_L.values, axis=0).astype(int).astype(str), index = df_sig.columns)

    # Define cell names dictionary
    celltypeNames = dict(zip(np.array(range(len(df_marker_cell_type.columns))).astype(int).astype(str), df_marker_cell_type.columns))
    celltypeNames.update({str(len(celltypeNames)): self.nameForUnknown})

    # Convert cell names from index to words
    se_celltypes = se_celltypes.replace(to_replace=celltypeNames)
    df_L.index = pd.Series(df_L.index.astype(str)).replace(to_replace=celltypeNames).values
    df_L.drop(self.nameForUnknown, axis=0, inplace=True)

    return (df_marker_cell_type.T, df_markers_expr, df_L, df_L.copy(), dict_expressed_markers)
def recordAnnotationResults(self, df_marker_cell_type, df_markers_expr, df_L, df_V, dict_expressed_markers, df_null_distributions = None):

    '''Record cell type annotation results to spreadsheets.

    Parameters:
        df_marker_cell_type: pandas.DataFrame
            Markers to cell types table

        df_markers_expr: pandas.DataFrame
            Markers expression in each cluster

        df_L: pandas.DataFrame
            Annotation scores along with other information

        df_V: pandas.DataFrame
            Annotation scores along with other information

        dict_expressed_markers: dictionary
            Dictionary of markers significantly expressed in each cluster

        df_null_distributions: pandas.DataFrame, Default None
            Table with null distributions

    Returns:
        dictionary
            Mapping from cluster to the predicted cell type label

    Usage:
        This function is intended to be used internally only
    '''

    df_L = df_L.fillna(0.)

    # Determine cluster sizes
    se_cluster_sizes = df_markers_expr.groupby(level='cluster', sort=True, axis=1).count().iloc[0]
    cluster_sizes = se_cluster_sizes.values

    df_L = df_L[se_cluster_sizes.index]
    df_V = df_V[se_cluster_sizes.index]

    # Rename winning cell types; duplicated winners get suffixes ' #1', ' #2', ...
    T = df_L.index[np.argmax(df_L.values, axis=0)].values
    T[(df_L.values <= 0.).all(axis=0)] = self.nameForUnknown
    T = pd.Index(T) + ' #0'
    for i in range(len(T)):
        T = T.where(~T.duplicated(), T.str.replace(' #%s' % (i), ' #%s' % (i + 1)))
    predicted_celltype = T.values

    # Save score columns names
    columns_scores = df_V.index.values.copy().tolist()

    # Determine winning score
    winning_score = np.array([df_V.iloc[ind[0], ind[1]] for ind in np.flip(np.array(list(enumerate(np.argmax(df_L.values, axis=0)))), axis=1)])

    # Update the DataFrames
    df_V.loc['Winning score'] = df_L.loc['Winning score'] = winning_score
    df_V.loc['Predicted cell type'] = df_L.loc['Predicted cell type'] = predicted_celltype
    df_V.loc['# cells in cluster'] = df_L.loc['# cells in cluster'] = cluster_sizes

    # Determine all markers hits for each cluster
    df_custer_centroids = df_markers_expr.groupby(level='cluster', sort=True, axis=1).mean()

    all_markers_in_cluster = [(' // ').join(dict_expressed_markers[cluster].tolist()) for cluster in df_V.columns]
    df_V.loc['All markers'] = df_L.loc['All markers'] = all_markers_in_cluster

    all_supp_markers = {celltype: df_marker_cell_type.T.index[df_marker_cell_type.T[celltype] > 0].values for celltype in columns_scores}
    all_cont_markers = {celltype: df_marker_cell_type.T.index[df_marker_cell_type.T[celltype] < 0].values for celltype in columns_scores}
    all_supp_markers[self.nameForUnknown] = np.array([])
    all_cont_markers[self.nameForUnknown] = np.array([])

    # Intersect the expressed markers of each cluster with the (positive
    # and negative) markers of its predicted cell type
    supporting_markers = [(' // ').join((np.intersect1d(all_supp_markers[df_V.loc['Predicted cell type'].loc[cluster].split(' #')[0]],
                                                        df_V.loc['All markers'].loc[cluster].split(' // '))).tolist()) for cluster in df_V.columns]
    contradicting_markers = [(' // ').join((np.intersect1d(all_cont_markers[df_V.loc['Predicted cell type'].loc[cluster].split(' #')[0]],
                                                           df_V.loc['All markers'].loc[cluster].split(' // '))).tolist()) for cluster in df_V.columns]

    df_V.loc['Supporting markers'] = df_L.loc['Supporting markers'] = supporting_markers
    df_V.loc['Contradicting markers'] = df_L.loc['Contradicting markers'] = contradicting_markers

    # Record the results to spreadsheets
    fileName = os.path.join(self.saveDir, self.dataName + '_annotation.xlsx')

    if self.verbose >= 2:
        print('Recording voting results to:', fileName)

    writer = pd.ExcelWriter(fileName)

    df_L.columns.name = 'cluster'
    df_L.T.to_excel(writer, 'z-scores', columns=['Predicted cell type',
                                                 '# cells in cluster',
                                                 'Winning score',
                                                 'Supporting markers',
                                                 'Contradicting markers',
                                                 'All markers'] + columns_scores)

    df_V.columns.name = 'cluster'
    df_V.T.to_excel(writer, 'Voting scores', columns=['Predicted cell type',
                                                      '# cells in cluster',
                                                      'Winning score',
                                                      'Supporting markers',
                                                      'Contradicting markers',
                                                      'All markers'] + columns_scores)

    df_custer_centroids.T.to_excel(writer, 'Cluster centroids')

    if not df_null_distributions is None:
        df_null_distributions.to_excel(writer, 'Null distributions')

    df_marker_cell_type.to_excel(writer, 'Marker cell type weight matrix')

    # ExcelWriter.save() was removed in pandas 2.0; close() saves the
    # workbook and exists in all pandas versions
    writer.close()

    return df_V.loc['Predicted cell type'].to_dict()
def propagateHopfield(cls, sigma = None, xi = None, T = 0.2, tmax = 200, fractionToUpdate = 0.5, mode = 4, meshSamplingRate = 200,
                      underlyingNetwork = None, typesNames = None, clustersNames = None, printInfo = False,
                      recordTrajectories = True, id = None, printSwitchingFraction = False, path = None, verbose = 0):

    '''Function is used internally to propagate Hopfield network
    over a set number of time steps

    Parameters:
        sigma: 2d numpy.array, Default None
            Markers expression states (markers by clusters); modified in place

        xi: 2d numpy.array, Default None
            Marker cell type attractors (markers by cell types); modified in
            place in modes 1 and 2

        T: float, Default 0.2
            Noise (Temperature) parameter

        tmax: int, Default 200
            Number of steps to iterate through

        fractionToUpdate: float, Default 0.5
            Fraction of nodes to randomly update at each iteration

        mode: int, Default 4
            Options are:
                1: non-orthogonalized, non-weighted attractors
                2: orthogonalized, non-weighted attractors
                3: orthogonalized, weighted attractors
                4: orthogonalized, weighted attractors, asymmetric and diluted dynamics

        meshSamplingRate: int, Default 200
            Visualization parameter to control the quality of the color mesh near the attractors

        underlyingNetwork: 2d numpy.array, Default None
            Network of underlying connections between genes

        typesNames: list-like, Default None
            Names of cell types

        clustersNames: list-like, Default None
            Names or identifiers of the clusters

        printInfo: boolean, Default False
            Whether to print details

        recordTrajectories: boolean, Default True
            Whether to record trajectories data to files

        id: int, Default None
            Identifier of this function call.
            NOTE(review): shadows the builtin 'id' within this function

        printSwitchingFraction: boolean, Default False
            Whether to print fraction of clusters that switch their
            maximum overlapping attractor

        path: str, Default None
            Path for saving trajectories data

        verbose: int, Default 0
            Verbosity level

    Returns:
        2d numpy.array
            Overlaps (cell types by clusters), negative overlaps zeroed

    Usage:
        result = propagateHopfield(sigma=sigma, xi=df_attrs)
    '''

    if xi is None:
        if verbose >= 3:
            print('xi is None')

        return

    if not path is None:
        if not os.path.exists(path):
            os.makedirs(path)

    if mode==1 or mode==2:
        # Non-weighted modes: binarize the attractors in place
        xi[xi > 0.] = 1.
        xi[xi <= 0.] = -1.

    if printInfo:
        if verbose >= 3:
            print('Unique nodes across Hopfield network:', len(np.unique(xi, axis=0)))

    if mode == 1:
        # Classic Hebbian coupling matrix
        A = xi.T
        J = (xi).dot(A) / (xi.shape[0]) # m,m
    else:
        # Orthogonalized attractors via the pseudo-inverse of xi
        Q = (xi.T).dot(xi) # n,n

        try:
            Q_inv = np.linalg.inv(Q) # n,n
        except Exception as exception:
            if verbose >= 3:
                print(exception)
                print('Cannot invert matrix Q')

            return

        A = (Q_inv).dot(xi.T) # n,m <-- xi_inv
        J = (xi).dot(A) # m,m

    if not underlyingNetwork is None:
        if J.shape == underlyingNetwork.shape:
            if verbose >= 3:
                print('Using network of underlying interactions')

            # Element-wise mask of the coupling matrix
            J *= underlyingNetwork
        else:
            if verbose >= 3:
                print('Network of underlying interactions shape is inconsistent with J. Ignoring it')

    getHopfieldEnergy = lambda s: -0.5 * s[None, :].dot(J).dot(s)

    if recordTrajectories:
        # Project states onto principal components of the attractors for plotting
        pca = PCA(n_components=None).fit(A)
        attrs = pca.transform(A)
        variance = pca.explained_variance_ratio_

        # Only the first call (or an un-identified call) records the energy mesh
        if id == 0 or id is None:
            if xi.shape[1] == 2:
                # Two attractors: sample energies on a regular 2D grid
                dim = 2
                lim = 15
                g = np.linspace(-lim, lim, num=100)
                mesh_coords = np.meshgrid(*tuple([g]*dim))
                mesh_coords = np.vstack([g.flatten() for g in mesh_coords]).T
                mesh = np.zeros((mesh_coords.shape[0], xi.shape[1]))
                mesh[:, :len(mesh_coords.T)] = mesh_coords
                mesh_sigma = pca.inverse_transform(mesh)

                if False:
                    # Disabled alternative: digitize the mesh before projecting
                    mesh_sigma[mesh_sigma > 0.] = 1.
                    mesh_sigma[mesh_sigma <= 0.] = -1.
                    digital_mesh = pca.transform(mesh_sigma)
                    mesh_coords = digital_mesh

                mesh_energy = np.array([getHopfieldEnergy(s) for s in mesh_sigma])

                write(np.hstack([mesh_coords[:,:2], mesh_energy]), os.path.join(path, 'mesh'))
            else:
                if verbose >= 4:
                    print('Generating samples in the vicinity of attractors')

                # Randomly flip an increasing fraction of signs around each attractor.
                # NOTE(review): the inner comprehension variable 'i' shadows the
                # outer attractor index 'i'
                temp = []
                for i in range(xi.shape[1]):
                    for j in range(25):
                        st = A[i]
                        temp.append(np.array([st * ((np.random.rand(st.shape[0]) > 0.025*j) * 2. - 1.) for i in range(meshSamplingRate)]))

                mesh_sigma = np.vstack(temp)

                if verbose >= 4:
                    print('Calculating energy of the samples')

                if False:
                    # Disabled alternative: per-sample energy evaluation
                    mesh_energy = np.array([getHopfieldEnergy(s) for s in mesh_sigma])
                else:
                    if verbose >= 4:
                        print('Calculating chunks')

                    def func(mesh_sigma):
                        # Vectorized energy of a chunk of samples
                        return -0.5 * ((J).dot(mesh_sigma.T) * mesh_sigma.T).sum(axis=0)[:,None]

                    mesh_energy = np.vstack([func(item) for item in np.split(mesh_sigma, meshSamplingRate)])

                if verbose >= 4:
                    print('\nProjecting samples on PCs')

                mesh_coords = pca.transform(mesh_sigma)

                if verbose >= 4:
                    print('Recording mesh data')

                write(np.hstack([mesh_coords, mesh_energy]), os.path.join(path, 'mesh'))

    if recordTrajectories:
        write((np.vstack([attrs, variance]), typesNames), os.path.join(path, 'attrs'))

    if sigma is None:
        if verbose >= 4:
            print('Sigma is None')

        return

    if recordTrajectories:
        trajectories = np.zeros((tmax, sigma.shape[1], xi.shape[1]))

    # Attractor with the largest initial overlap per cluster; -1 means none
    initial = np.argmax((A).dot(sigma), axis=0)
    initial[np.where(np.max((A).dot(sigma), axis=0) < 10. ** -12)] = -1

    if printInfo:
        sIn = sigma.copy()

    for t in range(tmax):
        # Local fields
        h = (J).dot(sigma)

        if recordTrajectories:
            if id == 0 or id is None:
                trajectories[t,:,:] = np.hstack([pca.transform(sigma.T)])

        # Stochastic (Glauber-like) update at temperature T
        sigmaNew = ((1. / (1. + np.exp(-2. * h / T))) > np.random.rand(*sigma.shape)) * 2. - 1.

        # Make dynamics asymmetric and diluted: Deactivate nodes that are being turned off
        if mode == 4:
            sigmaNew[(sigmaNew - sigma) < -1.] = 0.

        # Ramp the update fraction up during the first 100 steps
        f = 1. if t >= 100 else max(0.05, float(t) / 100.)
        whereToUpdate = (np.random.rand(*sigma.shape) <= f * fractionToUpdate)
        sigma[whereToUpdate] = sigmaNew[whereToUpdate] * np.abs(sigma[whereToUpdate])

    if printInfo:
        print('In positive:', ((sIn>0)*1.).sum(axis=0))
        print('Out positive:', ((sigma>0)*1.).sum(axis=0))
        print('Positive common in-out:', (((sIn>0.)*(sigma>0.))*1.).sum(axis=0))

    # Attractor with the largest final overlap per cluster; -1 means none
    final = np.argmax((A).dot(sigma), axis=0)
    final[np.where(np.max((A).dot(sigma), axis=0) < 10. ** -12)] = -1

    if printSwitchingFraction:
        print(np.round(100. * (final != initial).sum() / (len(initial)), 2).astype(int), end=' ', flush=True)

    if recordTrajectories:
        if id == 0 or id is None:
            write(trajectories, os.path.join(path, 'trajectories%s')%(id))
            write((initial, final, typesNames, clustersNames), os.path.join(path, 'additional'))

    # Overlaps of the final state with each attractor; negatives zeroed
    overlap = (A).dot(sigma)
    overlap[overlap <= 0.] = 0.

    return overlap
# Other functions of class #########################################################
@classmethod
def convertColormap(cls, colormap):

    '''Convert colormap from the form (1.,1.,1.,1.) to 'rgba(255,255,255,1.)'

    Parameters:
        colormap: dictionary
            Colormap to convert

    Returns:
        dictionary
            Converted colormap

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        colormap = DCS.convertColormap(colormap)
    '''

    # .tolist() yields plain Python ints: with NumPy >= 2.0 the repr of numpy
    # scalars is 'np.int64(255)', which would corrupt the 'rgba(...)' string
    return {k: 'rgba' + str(tuple((np.array(v[:3]) * 255).astype(int).tolist() + [v[3]])) for k, v in colormap.items()}
@classmethod
def zScoreOfSeries(cls, se):

    '''Calculate z-score of pandas.Series and modify the Series in place

    Parameters:
        se: pandas.Series
            Series to process

    Returns:
        pandas.Series
            Processed series (the same object, standardized in place)

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        se = DCS.zScoreOfSeries(se)
    '''

    # Standardize the underlying values and write them back in place,
    # so the caller's Series object is modified as well
    standardized = scipy.stats.zscore(se.values)
    se.iloc[:] = standardized

    return se
@classmethod
def KeyInFile(cls, key, file):

    '''Check if a key exists in a HDF file.

    Parameters:
        key: str
            Key name to check

        file: str
            HDF file name to check

    Returns:
        boolean
            True if the key is found
            False otherwise

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.KeyInFile('df_expr', 'data/file.h5')
    '''

    if not os.path.isfile(file):
        return False

    # Use a distinct name for the store so the 'file' argument is not
    # shadowed; the boolean expression replaces the redundant
    # 'True if ... else False' and the unreachable trailing 'return'
    with pd.HDFStore(file) as store:
        return "/" + key.strip("/") in store.keys()
def getSubnetworkOfPCN(self, subnetworkGenes, min_shared_first_targets = 30):

    '''Extract subnetwork of PCN network

    Parameters:
        subnetworkGenes: list-like
            Set of genes that the subnetwork should contain

        min_shared_first_targets: int, Default 30
            Number of minimum first shared targets to connect two nodes

    Returns:
        pandas.DataFrame
            Adjacency matrix (0./1. valued, aligned to subnetworkGenes)

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        df_subnetwork = DCS.getSubnetworkOfPCN(genes)
    '''

    if self.verbose >= 2:
        print('\nReading PCN (directed, unweighted) network from file')

    # Edge list: index is the source gene, value is the target gene
    PCN = pd.read_csv(os.path.join('data', 'PCN_%s.txt.gz' % (self.species)), compression='gzip', header=0, index_col=[0], delimiter='\t')['Target']

    # Convert gene aliases to HUGO names on both ends of each edge
    PCN.index = self.gnc.Convert(PCN.index.tolist(), 'alias', 'hugo', returnUnknownString=False)
    PCN[:] = self.gnc.Convert(PCN.values.tolist(), 'alias', 'hugo', returnUnknownString=False)

    if self.verbose >= 2:
        print('Calculating targets of PCN network genes')

    # Map each source gene to the set of its unique direct targets
    targets = PCN.groupby(level=0).agg('unique').apply(set).to_dict()

    del PCN

    # Restrict to the requested genes that actually appear in the PCN
    index = pd.Index(set(subnetworkGenes).intersection(set(targets.keys()))).sort_values()

    if self.verbose >= 2:
        print('Number of PCN genes: %s'%(len(index)))
        print('Calculating number of common targets for %s genes'%(len(index)))

    # Symmetric matrix of shared first-target counts; a direct edge
    # geneA -> geneB contributes one extra count
    data = np.zeros((len(index), len(index))).astype(int)
    for iA in range(len(index)):
        geneA = index[iA]
        for iB in range(iA, len(index)):
            geneB = index[iB]
            direct = 1 if (geneB in targets[geneA]) else 0
            data[iA,iB] = data[iB,iA] = len(targets[geneA].intersection(targets[geneB])) + direct

    targetsOverlaps = pd.DataFrame(index=index, columns=index, data=data)

    # Connect two genes when they share at least the threshold number of targets
    subnetwork = 1.*(targetsOverlaps >= min_shared_first_targets)

    if self.verbose >= 2:
        print(subnetwork.shape)

    # Re-align to the full requested gene list; genes absent from the PCN
    # get all-zero rows and columns
    subnetwork = subnetwork.reindex(subnetworkGenes, axis=0).reindex(subnetworkGenes, axis=1).fillna(0.)

    if self.verbose >= 2:
        print(subnetwork.shape)

    # Density report: note the diagonal is counted in 'con' but not in 'tot'
    con = subnetwork.sum().sum()
    tot = (len(subnetwork)-1)*len(subnetwork)

    if self.verbose >= 2:
        print('\nConnections in subnetwork: %s, out of %s possible (%s%%)\n'%(con, tot, np.round(100.*con/tot, 0)))

    return subnetwork
def alignSeries(self, se1, se2, tagForMissing = 'Missing'):

    '''Align two pandas.Series

    Parameters:
        se1: pandas.Series
            Series with the first set of items

        se2: pandas.Series
            Series with the second set of items

        tagForMissing: str, Default 'Missing'
            Label to assign to non-overlapping items.
            (Previously this parameter had no default, so the documented
            two-argument call raised TypeError; the default now matches
            the documentation and is backward-compatible.)

    Returns:
        pandas.DataFrame
            Contains two aligned pandas.Series

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        df = DCS.alignSeries(pd.Index(['A', 'B', 'C', 'D']).to_series(), pd.Index(['B', 'C', 'D', 'E', 'F']).to_series())
    '''

    se1.index.name = 'index'
    se2.index.name = 'index'

    def padWithMissing(a, b):

        # Items of b absent from a are appended to a with the missing tag
        extra = b.index.difference(a.index)

        return pd.concat([a, pd.Series(index=extra, data=[tagForMissing] * len(extra))], axis=0, sort=False)

    se1 = padWithMissing(se1, se2)
    se2 = padWithMissing(se2, se1)

    se1.name = 'se1'
    se2.name = 'se2'

    # Align se2 to se1's index order, then place them side by side
    return pd.concat((se1, se2.loc[se1.index]), axis=1, sort=True)
def createReverseDictionary(self, inputDictionary):

    '''Efficient way to create a reverse dictionary from a dictionary.
    Utilizes Pandas.Dataframe.groupby and Numpy arrays indexing.

    Parameters:
        inputDictionary: dictionary
            Dictionary to reverse (values are list-like)

    Returns:
        dictionary
            Reversed dictionary

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        revDict = DCS.createReverseDictionary(Dict)
    '''

    # Flatten into (key, item) pairs; np.array coerces the pairs to a
    # common dtype, matching the groupby behavior relied on downstream
    pairs = np.array([[key, item] for key in list(inputDictionary.keys()) for item in np.array(list(inputDictionary.values()))[list(inputDictionary.keys()).index(key)]]) if False else np.array([[k, item] for i, k in enumerate(np.array(list(inputDictionary.keys()))) for item in np.array(list(inputDictionary.values()))[i]])

    frame = pd.DataFrame(pairs)

    # Group row positions by the second column (the original values)
    byValue = frame.groupby(frame.columns[1])
    groupKeys = list(byValue.indices.keys())
    rowPositions = list(byValue.indices.values())

    # First column holds the original keys, indexed by row position
    originalKeys = frame.values.T[0]

    return {groupKey: originalKeys[rows].tolist() for groupKey, rows in zip(groupKeys, rowPositions)}
def readMarkerFile(self, mergeFunction = 'mean', mergeCutoff = 0.25):

    '''Read markers file, prepare markers

    Parameters:
        mergeFunction: str, Default 'mean'
            Function used for grouping of the cell sub-types. Options are:
                'mean': average of the values
                'max': maximum of the values, effectively a logical OR function

        mergeCutoff: float, Default 0.25
            Values below cutoff are set to zero.
            This option is used if mergeFunction is 'mean'

    Returns:
        pandas.DataFrame
            Celltype/markers matrix

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        df_marker_cell_type = DCS.readMarkerFile()
    '''

    # Two-level column header: (CellTypeGrouped, CellType); missing cells mean "not a marker"
    df_marker_cell_type = pd.read_excel(self.geneListFileName, index_col=0, header=[0,1]).replace(np.nan, 0.).replace(0, 0.)
    df_marker_cell_type.columns.names = ['CellTypeGrouped', 'CellType']

    # Convert gene aliases to HUGO names
    df_marker_cell_type.index = self.gnc.Convert(list(df_marker_cell_type.index), 'alias', 'hugo', returnUnknownString=False)

    # Drop sub-types with no assigned grouped cell type
    df_marker_cell_type = df_marker_cell_type.drop(columns=[col for col in df_marker_cell_type.columns if col[0] == 'NA'])

    # Collapse cell sub-types into grouped cell types
    df_marker_cell_type = df_marker_cell_type.groupby(level='CellTypeGrouped', axis=1).agg(mergeFunction).fillna(0.)

    # Binarize: values at/above the cutoff become +1, at/below -cutoff become -1
    where_positive = df_marker_cell_type >= mergeCutoff
    where_negative = df_marker_cell_type <= -mergeCutoff
    df_marker_cell_type *= 0.
    df_marker_cell_type[where_positive] = 1.
    df_marker_cell_type[where_negative] = -1.

    # Keep only genes that mark at least one cell type
    df_marker_cell_type = df_marker_cell_type.loc[np.abs(df_marker_cell_type).sum(axis=1) > 0]

    if self.verbose >= 2:
        print('Markers/celltypes:', df_marker_cell_type.shape, flush=True)

    # Merge duplicates that might have appeared after gene name conversion
    df_marker_cell_type = df_marker_cell_type.groupby(level=0, axis=0).sum()
    df_marker_cell_type[df_marker_cell_type > 0.] = 1.
    df_marker_cell_type[df_marker_cell_type < 0.] = -1.

    def norm(s_input):

        # Normalize one cell-type column so positive weights sum to 1
        # and negative weights sum to -1 (when present)
        s = s_input.copy()

        pos_sum = np.abs(s.iloc[np.where(s > 0.)].sum())
        neg_sum = np.abs(s.iloc[np.where(s < 0.)].sum())

        s.iloc[np.where(s > 0.)] /= pos_sum if pos_sum > 0. else 1.
        s.iloc[np.where(s < 0.)] /= neg_sum if neg_sum > 0. else 1.

        return s

    df_marker_cell_type = df_marker_cell_type.apply(norm, axis=0)

    if self.verbose >= 1:
        print('Markers/celltypes:', df_marker_cell_type.shape, flush=True)

    df_marker_cell_type.columns = df_marker_cell_type.columns.str.strip()

    if not self.useNegativeMarkers:
        # Discard negative markers entirely, then re-drop empty gene rows
        df_marker_cell_type[df_marker_cell_type < 0.] = 0.
        df_marker_cell_type = df_marker_cell_type.loc[np.abs(df_marker_cell_type).sum(axis=1) > 0]

        if self.verbose >= 2:
            print('Removed negative markers. Markers/celltypes:', df_marker_cell_type.shape, flush=True)

    return df_marker_cell_type
def mergeIndexDuplicates(cls, df_expr, method = 'average', printDuplicates = False, verbose = 1):

    '''Merge index duplicates

    Parameters:
        df_expr: pandas.DataFrame
            Gene expression table

        method: str, Default 'average'
            How to deal with index duplicates. Options are:
                'average': combine values of duplicates
                'first': keep only first of duplicates, discard rest

        printDuplicates: boolean, Default False
            Whether to print the duplicated index items

        verbose: int, Default 1
            Verbosity level

    Returns:
        pandas.DataFrame
            Gene expression table

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        df_expr = DCS.mergeIndexDuplicates(df_expr)
    '''

    # Check if any names in the index are duplicated; if so merge duplicates
    len_total, len_unique = len(df_expr.index), len(np.unique(df_expr.index))
    if len_total != len_unique:
        if printDuplicates:
            # Must be computed before merging, while duplicates still exist
            unique_items = np.unique(df_expr.index, return_counts=True)

        if method == 'average':
            # NOTE(review): despite the name, duplicates are summed, not
            # averaged — kept as-is to preserve existing behavior.
            # (Dropped the deprecated axis=0 keyword; level=0 alone is equivalent.)
            df_expr = df_expr.groupby(level=0).sum()
        elif method == 'first':
            df_expr = df_expr.loc[~df_expr.index.duplicated(keep='first')]
        else:
            if verbose >= 1:
                print('Unknown method')

            return df_expr

        if verbose >= 1:
            print('Merged %s duplicated items in the index of size %s' % (len_total - len_unique, len_total), flush=True)

        if printDuplicates:
            print(unique_items[0][unique_items[1] > 1], flush=True)

    return df_expr
def recordExpressionData(self):

    '''Record expression data to the internal HDF storage.

    Parameters:
        None

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.recordExpressionData()
    '''

    if self.verbose >= 1:
        print('Recording compressed DataFrame', flush=True)

    # Zeros become NaN and are dropped by stack(), so only non-zero
    # entries are written (a compact long/sparse representation)
    longFormat = self._df_expr.replace(0, np.nan).T.stack().to_frame()
    longFormat.to_hdf(self.fileHDFpath, key='df_expr', mode='a', complevel=4, complib='zlib')

    return
def loadAnnotatedLabels(self, detailed = False, includeLowQC = True, infoType = 'label'):

    '''Load cell annotations resulted from function 'annotate'

    Parameters:
        detailed: boolean, Default False
            Whether to give cluster- or celltype- resolution data

        includeLowQC: boolean, Default True
            Whether to include low quality cells in the output

        infoType: str, Default 'label'
            What to return per cell: 'label' for annotation labels,
            'batch' for batch identifiers

    Returns:
        pandas.Series

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.loadAnnotatedLabels()
    '''

    if self.KeyInFile('df_markers_expr', self.fileHDFpath):
        try:
            # Stored frame has (batch, cell) in the columns; 'label' carries the annotation
            se = pd.read_hdf(self.fileHDFpath, key='df_markers_expr').T.reset_index().set_index(['batch', 'cell'])['label']
        except Exception as exception:
            if self.verbose >= 1:
                print(exception)
                print('Error reading annotation results', flush=True)

            return

        if not detailed:
            # Strip the ' #<cluster>' suffix to collapse clusters into cell types
            se = se.str.split(' #', expand=True)[0]
    else:
        if self.verbose >= 1:
            print('Annotation results not found', flush=True)

        return

    if includeLowQC:
        # Prefer the pre-QC projection so cells removed by QC are also covered
        if self.KeyInFile('df_projection_pre_QC', self.fileHDFpath):
            allCells = pd.read_hdf(self.fileHDFpath, key='df_projection_pre_QC').columns
            allBathes = allCells.get_level_values('batch')
        elif self.KeyInFile('df_projection', self.fileHDFpath):
            allCells = pd.read_hdf(self.fileHDFpath, key='df_projection').columns
            allBathes = allCells.get_level_values('batch')
        else:
            if self.verbose >= 1:
                print('Labelled data not found', flush=True)

            return

        if infoType=='label':
            # Cells missing from the annotation get the designated low-QC label
            se = se.reindex(allCells, fill_value=self.nameForLowQC)
        elif infoType=='batch':
            # Replace labels with batch identifiers, then append the batches
            # of cells absent from the annotation
            se_low = pd.Series(data=allBathes, index=allCells)
            se.iloc[:] = se.index.get_level_values('batch').values
            se = pd.concat([se, se_low.loc[se_low.index.difference(se.index)]])

    return se
def loadExpressionData(self):

    '''Load processed expression data from the internal HDF storage.

    Parameters:
        None

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.loadExpressionData()
    '''

    # Guard clause: nothing to do when the key is absent from storage
    if not self.KeyInFile('df_expr', self.fileHDFpath):
        if self.verbose >= 1:
            print('Processed data not found')

        return

    if self.verbose >= 2:
        print('Loading processed data', flush=True)

    # Data was stored in a long (sparse) format; unstack back to genes x cells
    self._df_expr = pd.read_hdf(self.fileHDFpath, key='df_expr').unstack(level=-1, fill_value=0.).T
    self._df_expr.index = self._df_expr.index.get_level_values(-1)
    self._df_expr.sort_index(inplace=True)

    return
def calculateQCmeasures(self):

    '''Calculate Quality Control (QC) measures

    Parameters:
        None

    Returns:
        None

    Raises:
        NotImplementedError
            If self.species is neither 'Human' nor 'Mouse'

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.calculateQCmeasures()
    '''

    # Load the mitochondrial gene list once; it is cached on the instance
    if self.mitochondrialGenes is None:
        if self.species == 'Human':
            mitoGenesPath = os.path.join(self.defaultGeneListsDir, 'Human.MitoCarta2.0.csv')
        elif self.species == 'Mouse':
            mitoGenesPath = os.path.join(self.defaultGeneListsDir, 'Mouse.MitoCarta2.0.csv')
        else:
            if self.verbose >= 1:
                # Fixed typo in the user-facing message ('scpecies' -> 'species')
                print('Mitochondrial genes for species %s not found' % (self.species))
                print('Implemented lists are for Human and Mouse')

            raise NotImplementedError

        self.mitochondrialGenes = pd.read_csv(mitoGenesPath, index_col=None, header=0)['Symbol'].values.squeeze().tolist()

    if self.verbose >= 2:
        print('Calculating quality control measures (count depth, number of genes, fraction of mitochondrial genes) for each cell', flush=True)

    # Three per-cell measures: total counts, number of expressed genes, and
    # the fraction of expressed genes that are mitochondrial
    df_QC = pd.concat([(self._df_expr).sum(axis=0),
                       (self._df_expr > 0).sum(axis=0),
                       (self._df_expr.loc[self._df_expr.index.intersection(self.mitochondrialGenes)] > 0).sum(axis=0) / \
                       (self._df_expr > 0).sum(axis=0)], axis=1, sort=False)
    df_QC.columns = ['count_depth', 'number_of_genes', 'fraction_of_mitochondrialGenes']
    df_QC.to_hdf(self.fileHDFpath, key='QC', mode='a', complevel=4, complib='zlib')

    return
def qualityControl(self, **kwargs):

    '''Remove low quality cells

    Parameters:
        Any parameters that function 'getIndexOfGoodQualityCells' can accept

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.qualityControl()
    '''

    index = self.getIndexOfGoodQualityCells(**kwargs)

    if self.toggleRemoveLowQualityCells:
        # Keep only good-quality cells, then drop genes with zero total counts
        self._df_expr = self._df_expr[self._df_expr.columns.intersection(index).sort_values()]
        self._df_expr = self._df_expr[self._df_expr.sum(axis=1) > 0]

        if self.verbose >= 1:
            print('Removed low quality cells. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True)

        # Save the post-QC projection restricted to the surviving cells
        df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection_pre_QC')
        df_projection[self._df_expr.columns].to_hdf(self.fileHDFpath, key='df_projection', mode='a', complevel=4, complib='zlib')

        # Recompute PCA on the filtered data
        self.project(PCAonly=True)

    return
def batchEffectCorrection(self, method = 'COMBAT'):

    '''Batch effect correction.

    Parameters:
        method: str, Default 'COMBAT'
            Currently the only supported method.
            Stein, C.K., Qu, P., Epstein, J. et al. Removing batch effects from purified
            plasma cell gene expression microarrays with modified ComBat.
            BMC Bioinformatics 16, 63 (2015)

    Returns:
        None

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.batchEffectCorrection()
    '''

    if method == 'COMBAT':
        cells = self._df_expr.columns.get_level_values('cell').values.copy()
        patients = self._df_expr.columns.get_level_values('batch').values.copy()

        if len(np.unique(patients)) == 1:
            if self.verbose >= 2:
                print('Only one batch provided. Batch correction is not necessary', flush=True)
        else:
            if self.verbose >= 1:
                print('ComBat transformation', flush=True)

            if self.verbose >= 2:
                print('Reading positions of zeros', flush=True)

            # Remember the original sparsity pattern: ComBat returns dense
            # values, and the original zeros are restored afterwards
            where_zeros = self._df_expr == 0.

            # combat receives integer-positioned columns; batch labels are
            # passed as a parallel Series over the same positions
            values = combat(pd.DataFrame(data=self._df_expr.values.copy(), index=self._df_expr.index.values.copy(), columns=range(len(cells))), pd.Series(data=patients, index=range(len(cells)))).values
            self._df_expr = pd.DataFrame(data=values, index=self._df_expr.index, columns=self._df_expr.columns)

            if self.verbose >= 2:
                print('Setting original zeros back to zeros', flush=True)

            self._df_expr[where_zeros] = 0.

            if self.verbose >= 2:
                print('Setting negative values to zeros', flush=True)

            # Corrected expression values must stay non-negative
            self._df_expr[self._df_expr < 0.0] = 0.
    else:
        if self.verbose >= 1:
            print('Batch effect correction method unknown')

    return