Source code for DigitalCellSorter.core

'''The main class of DigitalCellSorter. The class includes tools for:

  1. **Pre-processing** of single cell RNA sequencing data

  2. **Quality control**

  3. **Batch effects correction**

  4. **Cells anomaly score evaluation**

  5. **Dimensionality reduction**

  6. **Clustering**

  7. **Annotation of cell types**

  8. **Visualization**
  
  9. **Post-processing**

'''

import os
import platform
import copy
import multiprocessing
import warnings

multiprocessing.set_start_method('spawn', force=True)

import numpy as np
import pandas as pd

import scipy.stats
import scipy.signal
import scipy.linalg
import scipy.cluster.hierarchy
import scipy.spatial.distance
from scipy.interpolate import UnivariateSpline

import sklearn.metrics
import sklearn.cluster
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralCoclustering
from sklearn.ensemble import IsolationForest

from . import GeneNameConverter, geneLists
from .Combat import combat
from .VisualizationFunctions import VisualizationFunctions
from .GenericFunctions import read, write
from .VisualizationFunctions import cm

def readMTXdata(dataDir, origin, fileMatrix = None, fileBarcodes = None, fileGenes = None, headerRows = None, sampleName = None, stripGeneVersions = True, saveData = True, dropGeneDuplicates = True, dropCellDuplicates = True):

    '''Read MTX format into a pandas DataFrame compatible with DCS input format

    Parameters:
        dataDir: str
            Path to gene expression counts data

        origin: str
            Name of the software that generated the data. Supported options are:
                'kb-python': kallisto-bustools

                'cellranger': Cell Ranger

        fileMatrix: str, Default None
            Name of the matrix file

        fileBarcodes: str, Default None
            Name of the cell barcodes file

        fileGenes: str, Default None
            Name of the genes file

        headerRows: list, Default None
            List of rows in the matrix file to skip

        sampleName: str, Default None
            Name of the data sample to include in the batch level

        stripGeneVersions: boolean, Default True
            Remove ensembl gene version. E.g. "ENSG00000236246.1" --> "ENSG00000236246"

        saveData: boolean, Default True
            Whether to save data in hdf format. If True the data is saved
            to a compressed hdf file at the same location as the matrix data

        dropGeneDuplicates: boolean, Default True
            Whether to remove gene duplicates (keep first)

        dropCellDuplicates: boolean, Default True
            Whether to remove barcode duplicates (keep first)

    Returns:
        pandas.DataFrame
            Table that has genes in rows and cells in columns

    Usage:
        df = readMTXdata(dataDir='filtered_feature_bc_matrix/', origin='cellranger')

        #df = readMTXdata(dataDir='counts_filtered/', origin='kb-python')

        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.prepare(df)

        DCS.convert('ensembl', 'hugo')

        print(DCS.df_expr)
    '''

    if origin == 'kb-python':
        if headerRows is None:
            headerRows = [0, 1, 2]

        genes_level = 0

        if fileMatrix is None:
            fileMatrix = 'cells_x_genes.mtx'

        if fileBarcodes is None:
            fileBarcodes = 'cells_x_genes.barcodes.txt'

        if fileGenes is None:
            fileGenes = 'cells_x_genes.genes.txt'

    elif origin == 'cellranger':
        if headerRows is None:
            headerRows = [0, 1, 2]

        genes_level = 1

        if fileMatrix is None:
            fileMatrix = 'matrix.mtx.gz'

        if fileBarcodes is None:
            fileBarcodes = 'barcodes.tsv.gz'

        if fileGenes is None:
            fileGenes = 'features.tsv.gz'

    df_expr = pd.read_csv(os.path.join(dataDir, fileMatrix), delimiter=' ', index_col=[0, 1], header=None, skiprows=headerRows)[2].astype(int)
    df_expr = df_expr.unstack(genes_level, fill_value=0)
    df_expr.index = pd.read_csv(os.path.join(dataDir, fileGenes), delimiter='\t', index_col=None, header=None)[0].loc[df_expr.index - 1].values
    df_expr.columns = pd.read_csv(os.path.join(dataDir, fileBarcodes), delimiter='\t', index_col=0, header=None).index

    if stripGeneVersions:
        df_expr.index = df_expr.index.str.split('.', expand=True).get_level_values(0)

    if dropGeneDuplicates:
        df_expr = df_expr.loc[~df_expr.index.duplicated(keep='first')]

    if dropCellDuplicates:
        df_expr = df_expr.T.loc[~df_expr.T.index.duplicated(keep='first')].T

    df_expr.index.name = 'gene'
    df_expr.columns.name = 'cell'

    if not sampleName is None:
        df_expr = pd.concat([df_expr], axis=1, keys=[sampleName], names=['batch'], sort=False)

    if saveData:
        df_expr.to_hdf(os.path.join(dataDir, 'data.h5'), key='df', mode='a', complevel=4, complib='zlib')

    return df_expr
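# A minimal usage sketch of readMTXdata on Cell Ranger output, mirroring the
# Usage block above. The directory name is a hypothetical example; any folder
# holding the matrix, barcodes and features files will do.
#
#   import DigitalCellSorter
#   from DigitalCellSorter.core import readMTXdata
#
#   df = readMTXdata(dataDir='filtered_feature_bc_matrix/', origin='cellranger', sampleName='sample1')
#
#   DCS = DigitalCellSorter.DigitalCellSorter()
#   DCS.prepare(df)
#   DCS.convert('ensembl', 'hugo')    # Cell Ranger emits Ensembl IDs; convert them to HUGO names
#   print(DCS.df_expr)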
class DigitalCellSorter(VisualizationFunctions):

    '''Class of Digital Cell Sorter with methods for processing single cell RNA-seq data.
    Includes analysis and visualization tools.

    Parameters:
        df_expr: pandas.DataFrame, Default None
            Gene expression in a form of a table, where genes are rows, and cells/batches are columns

        dataName: str, Default 'dataName'
            Name used in output files

        geneNamesType: str, Default 'alias'
            Input gene name convention

        geneListFileName: str, Default None
            Name of the marker genes file

        mitochondrialGenes: list, Default None
            List of mitochondrial genes to use in quality control

        sigmaOverMeanSigma: float, Default 0.01
            Threshold to consider a gene constant

        nClusters: int, Default 10
            Number of clusters

        nFineClusters: int, Default 3
            Number of fine clusters to determine with the Spectral Co-clustering routine.
            This option is ignored if doFineClustering is False.

        doFineClustering: boolean, Default True
            Whether to do fine clustering or not

        minSizeForFineClustering: int, Default 50
            Minimum number of cells required to do fine clustering of a cluster.
            This option is ignored if doFineClustering is False.

        clusteringFunction: function, Default AgglomerativeClustering
            Clustering function to use. Other options: KMeans, {'k_neighbors':40}, etc.
            Note: the function should have a .fit method and the same input and output.
            For network-based clustering pass a dictionary
            {'k_neighbors':40, 'metric':'euclidean', 'clusterExpression':True};
            this way the best number of clusters will be determined automatically

        nComponentsPCA: int, Default 200
            Number of PCA components

        nSamples_pDCS: int, Default 3000
            Number of random samples in distribution for pDCS annotation method

        nSamples_Hopfield: int, Default 200
            Number of repetitions for Hopfield annotation method

        saveDir: str, Default os.path.join('')
            Directory for output files

        makeMarkerSubplots: boolean, Default False
            Whether to make subplots on markers

        makePlots: boolean, Default True
            Whether to make all major plots

        availableCPUsCount: int, Default min(6, os.cpu_count())
            Number of CPUs used in pDCS method

        zScoreCutoff: float, Default 0.3
            Z-Score cutoff when setting expression of a cluster as significant

        thresholdForUnknown: float, Default 0.2
            Threshold when assigning label "Unknown". This option is used only
            with a combination of 2 or more annotation methods

        thresholdForUnknown_pDCS: float, Default 0.
            Threshold when assigning label "Unknown" in pDCS method

        thresholdForUnknown_ratio: float, Default 0.
            Threshold when assigning label "Unknown" in ratio method

        thresholdForUnknown_Hopfield: float, Default 0.
            Threshold when assigning label "Unknown" in Hopfield method

        annotationMethod: str, Default 'ratio-pDCS-Hopfield'
            Method to use for annotation of cell types to clusters. Options are:
                'pDCS': main DCS voting scheme with null testing

                'ratio': simple voting score

                'Hopfield': Hopfield Network classifier

                'pDCS-ratio': 'pDCS' adjusted with 'ratio'

                'pDCS-Hopfield': 'pDCS' adjusted with 'Hopfield'

                'ratio-Hopfield': 'ratio' adjusted with 'Hopfield'

                'pDCS-ratio-Hopfield': 'pDCS' adjusted with 'ratio' and 'Hopfield'

        subclusteringName: str, Default None
            Parameter used for certain labels on plots

        doQualityControl: boolean, Default True
            Whether to remove low quality cells

        doBatchCorrection: boolean, Default False
            Whether to correct data for batches

        minimumNumberOfMarkersPerCelltype: int, Default 10
            Minimum number of markers per cell type to keep that cell type in annotation options

        nameForUnknown: str, Default 'Unassigned'
            Name to use for clusters where label assignment yielded uncertain results

        nameForLowQC: str, Default 'Failed QC'
            Name to use for cells that do not pass quality control

        layout: str, Default 'TSNE'
            Projection layout used in visualization. Options are:
                'TSNE': t-SNE layout,
                    L.J.P. van der Maaten. Accelerating t-SNE using Tree-Based Algorithms.
                    Journal of Machine Learning Research 15(Oct):3221-3245, 2014.

                'PCA': use two largest principal components

                'UMAP': uniform manifold approximation and projection,
                    McInnes, L., Healy, J., UMAP: Uniform Manifold Approximation and
                    Projection for Dimension Reduction, ArXiv e-prints 1802.03426, 2018

                'PHATE': potential of heat diffusion for affinity-based transition embedding,
                    Moon, K.R., van Dijk, D., Wang, Z. et al. Visualizing structure and
                    transitions in high-dimensional biological data.
                    Nat Biotechnol 37, 1482-1492 (2019).

    Usage:
        DCS = DigitalCellSorter.DigitalCellSorter()

        DCS.prepare(df_data)

        DCS.process()
    '''

    def __init__(self, df_expr = None, dataName = 'dataName', species = 'Human', geneNamesType = 'alias', geneListFileName = None,
                 mitochondrialGenes = None, sigmaOverMeanSigma = 0.01, nClusters = 10, nFineClusters = 3, doFineClustering = True,
                 splitFineClusters = False, subSplitSize = 100, medianScaleFactor = 10000, minSizeForFineClustering = 50,
                 clusteringFunction = AgglomerativeClustering, nComponentsPCA = 200, nSamples_pDCS = 3 * 10 ** 3, nSamples_Hopfield = 200,
                 saveDir = os.path.join(''), makeMarkerSubplots = False, availableCPUsCount = min(6, os.cpu_count()), zScoreCutoff = 0.3,
                 subclusteringName = None, doQualityControl = True, doBatchCorrection = False, makePlots = True,
                 useUnderlyingNetwork = True, minimumNumberOfMarkersPerCelltype = 10, nameForUnknown = 'Unassigned',
                 nameForLowQC = 'Failed QC', matplotlibMode = 'Agg', countDepthCutoffQC = 0.5, numberOfGenesCutoffQC = 0.5,
                 mitochondrialGenesCutoffQC = 1.5, excludedFromQC = None, countDepthPrecutQC = 500, numberOfGenesPrecutQC = 250,
                 precutQC = False, minSubclusterSize = 25, thresholdForUnknown_pDCS = 0., thresholdForUnknown_ratio = 0.,
                 thresholdForUnknown_Hopfield = 0., thresholdForUnknown = 0.2, layout = 'TSNE', safePlotting = True,
                 HopfieldTemperature = 0.1, annotationMethod = 'ratio-pDCS-Hopfield', useNegativeMarkers = True,
                 removeLowQualityScores = True, updateConversionDictFile = True, verbose = 1, random_state = None):

        '''Initialization function that is automatically called when an instance of Digital Cell Sorter is created'''

        self.verbose = verbose

        self.gnc = GeneNameConverter.GeneNameConverter(dictDir=os.path.join(os.path.dirname(__file__)),
                                                       species=species, updateConversionDictFile=updateConversionDictFile, verbose=verbose)
        self.species = species

        self.dataName = dataName
        self.saveDir = saveDir

        self.safePlotting = safePlotting

        self.random_state = random_state

        if not df_expr is None:
            self.prepare(df_expr)
        else:
            self._df_expr = None

        self.defaultGeneListFileName = 'CIBERSORT_LM22'
        self.defaultGeneListsDir = os.path.join(os.path.dirname(__file__), 'geneLists')

        self.geneListFileName = geneListFileName

        self.mitochondrialGenes = mitochondrialGenes

        self.useNegativeMarkers = useNegativeMarkers
        self.removeLowQualityScores = removeLowQualityScores

        self.geneNamesType = geneNamesType

        self.countDepthCutoffQC = countDepthCutoffQC
        self.numberOfGenesCutoffQC = numberOfGenesCutoffQC
        self.mitochondrialGenesCutoffQC = mitochondrialGenesCutoffQC
        self.countDepthPrecutQC = countDepthPrecutQC
        self.numberOfGenesPrecutQC = numberOfGenesPrecutQC
        self.precutQC = precutQC
        self.excludedFromQC = excludedFromQC

        self.medianScaleFactor = medianScaleFactor
        self.minSubclusterSize = minSubclusterSize

        self.sigmaOverMeanSigma = sigmaOverMeanSigma
        self.nClusters = nClusters
        self.doFineClustering = doFineClustering
        self.splitFineClusters = splitFineClusters
        self.subSplitSize = subSplitSize
        self.nFineClusters = nFineClusters
        self.minSizeForFineClustering = minSizeForFineClustering
        self.nComponentsPCA = nComponentsPCA
        self.zScoreCutoff = zScoreCutoff
        self.minimumNumberOfMarkersPerCelltype = minimumNumberOfMarkersPerCelltype

        self.useUnderlyingNetwork = useUnderlyingNetwork

        self.annotationMethod = annotationMethod

        self.nameForUnknown = nameForUnknown
        self.nameForLowQC = nameForLowQC

        self.layout = layout

        self.thresholdForUnknown_pDCS = thresholdForUnknown_pDCS
        self.thresholdForUnknown_ratio = thresholdForUnknown_ratio
        self.thresholdForUnknown_Hopfield = thresholdForUnknown_Hopfield
        self.thresholdForUnknown = thresholdForUnknown

        self.HopfieldTemperature = HopfieldTemperature

        self.nSamples_pDCS = nSamples_pDCS
        self.nSamples_Hopfield = nSamples_Hopfield

        self.availableCPUsCount = availableCPUsCount

        self.clusteringFunction = clusteringFunction

        self.subclusteringName = subclusteringName

        self.doQualityControl = doQualityControl
        self.toggleRemoveLowQualityCells = doQualityControl
        self.toggleDoBatchCorrection = doBatchCorrection

        self.toggleMakeHistogramNullDistributionPlot = False
        self.toggleMakeVotingResultsMatrixPlot = makePlots
        self.toggleMakeMarkerExpressionPlot = makePlots
        self.toggleMakeProjectionPlotsQC = makePlots
        self.toggleMakeProjectionPlotClusters = makePlots
        self.toggleMakeProjectionPlotAnnotatedClusters = makePlots
        self.toggleMakeProjectionPlotBatches = makePlots
        self.toggleMakeStackedBarplot = makePlots
        self.toggleAnomalyScoresProjectionPlot = False
        self.toggleGetMarkerSubplots = makeMarkerSubplots

        super().__init__(saveDir=saveDir, dataName=dataName, safePlotting=safePlotting, matplotlibMode=matplotlibMode, verbose=verbose)

        return None

    @property
    def saveDir(self):
        return self._saveDir

    @saveDir.setter
    def saveDir(self, value):
        self._saveDir = value

        if self.saveDir != os.path.join('') and not os.path.exists(self.saveDir):
            os.makedirs(self.saveDir)

        self._fileHDFpath = os.path.join(self.saveDir, self.dataName + '_processed.h5')

        return

    @property
    def fileHDFpath(self):
        return self._fileHDFpath

    @fileHDFpath.setter
    def fileHDFpath(self, value):
        warnings.warn("Direct change of HDF file path is not allowed. " +
                      "Modify 'saveDir' in order to change 'fileHDFpath'", UserWarning)

        return

    @property
    def df_expr(self):
        if self._df_expr is None:
            if self.verbose >= 1:
                print('Expression data is not assigned')

        return self._df_expr

    @df_expr.setter
    def df_expr(self, value):
        if value is None:
            self._df_expr = value
        else:
            warnings.warn("Direct change of 'df_expr' is not allowed. " +
                          "Use function 'prepare' in order to set 'df_expr' " +
                          "or, alternatively, initialize class instance with 'df_expr'. " +
                          "'df_expr' can be removed by assigning None to it", UserWarning)

        return

    @property
    def geneListFileName(self):
        return self._geneListFileName

    @geneListFileName.setter
    def geneListFileName(self, value):
        if value is None:
            self._geneListFileName = os.path.join(self.defaultGeneListsDir, self.defaultGeneListFileName + '.xlsx')
        else:
            if os.path.isfile(value):
                self._geneListFileName = value
            elif os.path.isfile(os.path.join(self.defaultGeneListsDir, value + '.xlsx')):
                self._geneListFileName = os.path.join(self.defaultGeneListsDir, value + '.xlsx')
            else:
                if self.verbose >= 1:
                    print('Gene list file not found. Using default list: %s' % (self.defaultGeneListFileName), flush=True)

                self._geneListFileName = os.path.join(self.defaultGeneListsDir, self.defaultGeneListFileName + '.xlsx')

        return

    # Primary functions of class #######################################################
    def prepare(self, obj):

        '''Prepare pandas.DataFrame for input to function process().
        If the input is a pandas.DataFrame, validate that it has the correct structure.

        Parameters:
            obj: str, pandas.DataFrame, pandas.Series
                Expression data in a form of pandas.DataFrame, pandas.Series,
                or name and path to a csv file with data

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.prepare('data.csv')
        '''

        if type(obj) is pd.Series:
            if self.verbose >= 1:
                print('Received data in a form of pandas.Series', flush=True)
                print('Validating it and converting it to pandas.DataFrame', flush=True)

            if not 'cell' in obj.index.names:
                if self.verbose >= 1:
                    print('Column "cell" not found. Returning None', flush=True)

                return None

            if not 'gene' in obj.index.names:
                if self.verbose >= 1:
                    print('Column "gene" not found. Returning None', flush=True)

                return None

            if not 'batch' in obj.index.names:
                print('Column "batch" not found. Assuming one batch in the data.', flush=True)
                batch = np.array(['batch0'] * len(obj.index.get_level_values('cell')))
            else:
                batch = obj.index.get_level_values('batch')

            obj.index = pd.MultiIndex.from_arrays([batch, obj.index.get_level_values('cell'), obj.index.get_level_values('gene')], names=['batch', 'cell', 'gene'])

            obj = obj.unstack(level='gene').T

            self._df_expr = obj

            if self.verbose >= 2:
                print('Done', flush=True)

            return None

        elif type(obj) is pd.DataFrame:
            if self.verbose >= 1:
                print('Received data in a form of pandas.DataFrame', flush=True)
                print('Validating pandas.DataFrame', flush=True)

            try:
                obj.index.name = 'gene'
            except Exception as exception:
                if self.verbose >= 1:
                    print(exception)
                    print('DataFrame index format is not understood. Returning None', flush=True)

                return None

            if len(obj.columns.names) > 1:
                if not 'cell' in obj.columns.names:
                    if self.verbose >= 1:
                        print('Columns level "cell" not found. Returning None', flush=True)

                    return None
            else:
                obj.columns.names = ['cell']

            if not 'batch' in obj.columns.names:
                if self.verbose >= 2:
                    print('Columns level "batch" not found. Assuming one batch in the data.', flush=True)

                batch = np.array(['batch0'] * len(obj.columns.get_level_values('cell')))
            else:
                batch = obj.columns.get_level_values('batch')

            obj.columns = pd.MultiIndex.from_arrays([batch, obj.columns.get_level_values('cell')], names=['batch', 'cell'])

            self._df_expr = obj

            if self.verbose >= 2:
                print('Done', flush=True)

            return None

        elif type(obj) is str:
            columns = pd.read_csv(obj, header=0, index_col=None, nrows=5).columns.values.tolist()

            if ('cell' in columns) and ('gene' in columns) and ('expr' in columns):
                if self.verbose >= 1:
                    print('Received data in a form of condensed matrix. Reading data', flush=True)

                df_expr = pd.read_csv(obj, header=0, index_col=None)

                if self.verbose >= 2:
                    print('Converting it to pandas.DataFrame')

                if not 'cell' in df_expr.columns:
                    if self.verbose >= 1:
                        print('The column with "cell" identifiers is not found. Returning None', flush=True)

                    return None

                if not 'gene' in df_expr.columns:
                    if self.verbose >= 1:
                        print('The column with "gene" identifiers is not found. Returning None', flush=True)

                    return None

                if not 'expr' in df_expr.columns:
                    if self.verbose >= 1:
                        print('The column with expression values is not found. Returning None', flush=True)

                    return None

                if not 'batch' in df_expr.columns:
                    if self.verbose >= 2:
                        print('The column with "batch" identifiers is not found. Assuming one batch in the data', flush=True)

                    df_expr['batch'] = np.array(['batch0'] * len(df_expr))

                self._df_expr = df_expr.set_index(['batch', 'cell', 'gene'])['expr'].unstack(level='gene').T

                if self.verbose >= 2:
                    print('Done', flush=True)

                return None
            else:
                if self.verbose >= 1:
                    print('Received data in a form of matrix. Reading data', flush=True)

                df_expr = pd.read_csv(obj, header=None, index_col=0)

                if self.verbose >= 2:
                    print('Converting it to pandas.DataFrame', flush=True)

                if not 'cell' in df_expr.index:
                    if self.verbose >= 1:
                        print('The row with "cell" identifiers is not found. Returning None', flush=True)

                    return None

                if not 'batch' in df_expr.index:
                    if self.verbose >= 2:
                        print('The row with "batch" identifiers is not found. Assuming one batch in the data', flush=True)

                    df_expr.loc['batch'] = np.array(['batch0'] * df_expr.shape[1])

                df_expr = df_expr.T.set_index(['batch', 'cell']).T

                df_expr.index.name = 'gene'

                self._df_expr = df_expr

                if self.verbose >= 2:
                    print('Done', flush=True)

                return None
        else:
            if self.verbose >= 1:
                print('Unknown input data format. Returning None', flush=True)

        return None
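    # A sketch of the expected DataFrame layout for prepare(): genes in rows
    # and a ('batch', 'cell') MultiIndex in columns. The toy values and names
    # are illustrative.
    #
    #   import numpy as np
    #   import pandas as pd
    #   import DigitalCellSorter
    #
    #   columns = pd.MultiIndex.from_tuples([('batch0', 'c0'), ('batch0', 'c1'), ('batch1', 'c0')],
    #                                       names=['batch', 'cell'])
    #   df = pd.DataFrame(np.random.poisson(1., size=(3, 3)),
    #                     index=pd.Index(['CD19', 'CD33', 'CD3E'], name='gene'), columns=columns)
    #
    #   DCS = DigitalCellSorter.DigitalCellSorter()
    #   DCS.prepare(df)      # validates the structure and assigns it to DCS.df_expr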
    def convert(self, nameFrom = None, nameTo = None, **kwargs):

        '''Convert index to hugo names; if any names in the index are duplicated, remove duplicates

        Parameters:
            nameFrom: str, Default 'alias'
                Gene name type to convert from

            nameTo: str, Default 'hugo'
                Gene name type to convert to

            Any parameters that function 'mergeIndexDuplicates' can accept

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.convert()
        '''

        if nameFrom is None:
            nameFrom = self.geneNamesType

        if nameTo is None:
            nameTo = 'hugo'

        if nameTo != nameFrom:
            self._df_expr.index = self.gnc.Convert(list(self._df_expr.index), nameFrom, nameTo, returnUnknownString=False)
            self._df_expr = self.mergeIndexDuplicates(self.df_expr, verbose=self.verbose, **kwargs)

        return None
    def clean(self):

        '''Clean pandas.DataFrame: validate index, remove index duplicates,
        replace missing values with zeros, remove all-zero rows and columns

        Parameters:
            None

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.clean()
        '''

        # Replace any NaN(s) with zeros.
        self._df_expr.fillna(0.0, inplace=True)

        if self.verbose >= 1:
            print('Replaced missing values with zeros. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True)

        # Check if any names in the index are duplicated, remove duplicates
        self._df_expr = self.mergeIndexDuplicates(self._df_expr)

        # Keep only cells with at least one expressed gene.
        self._df_expr = self._df_expr.T[self._df_expr.sum(axis=0) > 0].T

        if self.verbose >= 1:
            print('Removed all-zero cells. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True)

        # Keep only genes expressed in at least one cell.
        self._df_expr = self._df_expr[self._df_expr.sum(axis=1) > 0]

        if self.verbose >= 1:
            print('Removed all-zero genes. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True)

        return None
    def normalize(self, median = None):

        '''Normalize pandas.DataFrame: rescale all cells, log-transform data,
        remove constant genes, sort index

        Parameters:
            median: float, Default None
                Scale factor; if not provided it is computed as the median across all cells in the data

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.normalize()
        '''

        # Scale all cells.
        median = np.median(np.sum(self._df_expr, axis=0)).astype(float) if median is None else median

        if self.verbose >= 1:
            print('Rescaling all cells by "sum of values = %s".' % (median), flush=True)

        self._df_expr = self._df_expr.apply(lambda q: q * median / np.sum(q), axis=0)

        if self.verbose >= 1:
            print('Log-transforming data.', flush=True)

        # Replace zeros with minimum value.
        MIN = np.min(self._df_expr.values[self._df_expr.values > 0.])

        if MIN <= 0.:
            raise ValueError

        self._df_expr = self._df_expr.replace(0., MIN)

        # Take log2 of expression.
        self._df_expr = np.log2(self._df_expr)
        self._df_expr -= np.min(self._df_expr.values)

        # Keep only genes expressed in at least one cell.
        self._df_expr = self._df_expr[self._df_expr.sum(axis=1) > 0]

        if self.verbose >= 1:
            print('Removed all-zero genes. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True)

        if not self.sigmaOverMeanSigma is None:
            # Keep only those genes with large enough standard deviation.
            self._df_expr = self._df_expr[np.std(self._df_expr, axis=1) / np.mean(np.std(self._df_expr.values)) > self.sigmaOverMeanSigma]

            if self.verbose >= 1:
                print('Removed constant genes. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True)

        # Sort rows by gene name
        self._df_expr = self._df_expr.sort_index()

        return None
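    # The normalization above amounts to median count-depth scaling followed
    # by a log2 transform; a standalone sketch of the same arithmetic on a toy
    # matrix (values illustrative):
    #
    #   import numpy as np
    #   import pandas as pd
    #
    #   df = pd.DataFrame([[4., 0., 2.], [1., 3., 0.], [0., 5., 1.], [2., 2., 2.]])
    #
    #   median = np.median(df.sum(axis=0))                    # median count depth over cells
    #   df = df * median / df.sum(axis=0)                     # rescale each cell (column) to that depth
    #   df = df.replace(0., df.values[df.values > 0.].min())  # zeros -> smallest positive value
    #   df = np.log2(df)                                      # log-transform
    #   df -= df.values.min()                                 # shift so the minimum is zero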
    def project(self, PCAonly = False, do_fast_tsne = True):

        '''Project pandas.DataFrame to lower dimensions

        Parameters:
            PCAonly: boolean, Default False
                Perform Principal Component Analysis only

            do_fast_tsne: boolean, Default True
                Do FI-tSNE instead of "exact" tSNE.
                This option is ignored if layout is not 'TSNE'

        Returns:
            tuple
                Processed data

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            xPCA, PCs, tSNE = DCS.project()
        '''

        if self.verbose >= 1:
            print('Preparing xpca, PCs, 2D projection of df_expr', flush=True)

        if self.verbose >= 2:
            print('Performing PC projection from %s to %s features...' % (self._df_expr.shape[0], self.nComponentsPCA), flush=True)

        _PCA = PCA(n_components=self.nComponentsPCA, random_state=self.random_state)

        idx = np.argsort(np.var(self._df_expr.values.T, axis=0) / np.mean(self._df_expr.values.T, axis=0))[-2000:]
        X_pca = _PCA.fit_transform(self._df_expr.values.T[:, idx]).T

        if self.verbose >= 2:
            print('Explained variance:', np.round(np.sum(_PCA.explained_variance_ratio_) * 100., 2), "%", flush=True)
            print('Recording xpca, PCs, 2D projection of df_expr', flush=True)

        columns = self._df_expr.columns
        columns = pd.MultiIndex.from_arrays([columns.get_level_values('batch'), columns.get_level_values('cell')], names=['batch', 'cell'])

        pd.DataFrame(data=X_pca, columns=columns).to_hdf(self.fileHDFpath, key='df_xpca', mode='a', complevel=4, complib='zlib')
        pd.DataFrame(data=_PCA.components_).to_hdf(self.fileHDFpath, key='df_pcs', mode='a', complevel=4, complib='zlib')

        if not PCAonly:
            if self.layout == 'TSNE':
                if X_pca.T.shape[0] < 2000:
                    do_fast_tsne = False

                if self.verbose >= 1:
                    print('Performing tSNE projection from %s to %s features...' % (self.nComponentsPCA, 2), flush=True)

                if do_fast_tsne:
                    if platform.system() == "Windows":
                        from .FastTSNE import fast_tsne

                        if self.verbose >= 3:
                            print('Windows detected. Using FastTSNE submodule wrapper', flush=True)

                        X_projection2 = fast_tsne(X_pca.T, perplexity=30, seed=42).T
                    else:
                        try:
                            import fitsne
                        except Exception as exception:
                            if self.verbose >= 1:
                                print(exception)
                                print('FI-tSNE package could not be imported. Install the package following ' +
                                      'instructions of DigitalCellSorter installation on GitHub. Defaulting to PCA layout')

                            self.layout = 'PCA'

                        if self.layout == 'TSNE':
                            X_projection2 = fitsne.FItSNE(np.array(X_pca.T, order='C')).T
                else:
                    X_projection2 = TSNE(n_components=2).fit_transform(X_pca.T).T

            elif self.layout == 'UMAP':
                if self.verbose >= 1:
                    print('Performing UMAP projection from %s to %s features...' % (self.nComponentsPCA, 2), flush=True)

                try:
                    import umap
                except Exception as exception:
                    if self.verbose >= 1:
                        print(exception)
                        print('UMAP package could not be imported. Install the package following ' +
                              'instructions of DigitalCellSorter installation on GitHub. Defaulting to PCA layout')

                    self.layout = 'PCA'

                if self.layout == 'UMAP':
                    X_projection2 = umap.UMAP(random_state=self.random_state).fit_transform(X_pca.T).T

            elif self.layout == 'PHATE':
                if self.verbose >= 1:
                    print('Performing PHATE projection from %s to %s features...' % (self.nComponentsPCA, 2), flush=True)

                try:
                    import phate
                except Exception as exception:
                    if self.verbose >= 1:
                        print(exception)
                        print('PHATE package could not be imported. Install the package following ' +
                              'instructions of DigitalCellSorter installation on GitHub. Defaulting to PCA layout')

                    self.layout = 'PCA'

                if self.layout == 'PHATE':
                    X_projection2 = phate.PHATE().fit_transform(X_pca.T).T

            if self.layout == 'PCA':
                if self.verbose >= 1:
                    print('Using PC1 and PC2 for layout', flush=True)

                X_projection2 = X_pca[[0, 1], :]

            if self.verbose >= 2:
                print('Recording 2D projection of df_expr', flush=True)

            pd.DataFrame(data=X_projection2, columns=columns).to_hdf(self.fileHDFpath, key='df_projection_pre_QC', mode='a', complevel=4, complib='zlib')
            pd.DataFrame(data=X_projection2, columns=columns).to_hdf(self.fileHDFpath, key='df_projection', mode='a', complevel=4, complib='zlib')

            return X_pca, _PCA.components_, X_projection2

        return X_pca, _PCA.components_
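    # A usage sketch for project(); the layout is set on the instance, and the
    # UMAP or PHATE layouts require their optional packages to be installed.
    # project() is normally invoked for you inside process().
    #
    #   DCS = DigitalCellSorter.DigitalCellSorter(dataName='demo', layout='UMAP')
    #   # ... after DCS.prepare(...), DCS.convert(), DCS.clean(), DCS.normalize() ...
    #   X_pca, PCs, X_2D = DCS.project()          # PCA followed by a 2D embedding
    #   X_pca, PCs = DCS.project(PCAonly=True)    # PCA only, no 2D embedding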
    def cluster(self):

        '''Cluster PCA-reduced data into a desired number of clusters

        Parameters:
            None

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.cluster()
        '''

        if self.verbose >= 1:
            print('Calculating clustering of PCA data', flush=True)

        df_xpca = pd.read_hdf(self.fileHDFpath, key='df_xpca')

        if type(self.clusteringFunction) is dict:
            try:
                import pynndescent
                import networkx as nx
                import community
            except Exception as exception:
                if self.verbose >= 1:
                    print(exception)
                    print('Install packages pynndescent, networkx and python-louvain following ' +
                          'instructions of DigitalCellSorter installation on GitHub. Defaulting to agglomerative clustering method')

                self.clusteringFunction = AgglomerativeClustering

        if type(self.clusteringFunction) is dict:
            import pynndescent
            import networkx as nx
            import community

            try:
                k_neighbors = self.clusteringFunction['k_neighbors']
            except Exception as exception:
                if self.verbose >= 2:
                    print(exception)

                k_neighbors = 40

            try:
                metric = self.clusteringFunction['metric']
            except Exception as exception:
                if self.verbose >= 2:
                    print(exception)

                metric = 'euclidean'

            try:
                clusterExpression = self.clusteringFunction['clusterExpression']
            except Exception as exception:
                if self.verbose >= 2:
                    print(exception)

                clusterExpression = False

            data = self._df_expr.values.T if clusterExpression else df_xpca.values.T

            if self.verbose >= 2:
                print('Searching for %s nearest neighbors' % (k_neighbors), flush=True)

            knn = pynndescent.NNDescent(data, metric=metric, n_neighbors=k_neighbors).query(data, k=k_neighbors)

            if self.verbose >= 2:
                print('k(=%s) nearest neighbors found. Constructing a NetworkX graph' % (k_neighbors), flush=True)

            A = np.zeros((len(knn[0]), len(knn[0])))

            for i in range(len(knn[0])):
                A[i, knn[0][i]] = knn[1][i]

            G = nx.from_numpy_array(A)

            if self.verbose >= 2:
                print('Clustering the graph', flush=True)

            cellClusterIndex = pd.Series(community.best_partition(G)).sort_index().values
        else:
            data = df_xpca.values

            cellClusterIndex = self.clusteringFunction(n_clusters=self.nClusters).fit(data.T).labels_.astype(float).astype(str)

            if self.verbose >= 2:
                print(np.unique(cellClusterIndex, return_counts=True))

        if self.doFineClustering:
            fineCellClusterIndex = cellClusterIndex.copy()

            clusters = np.unique(cellClusterIndex)

            for cluster in clusters:
                cellsOfCluster = np.where(cellClusterIndex == cluster)[0]

                subData = data[:, cellsOfCluster]
                subData = subData[subData.sum(axis=1) > 0.]

                if len(cellsOfCluster) >= self.minSizeForFineClustering:
                    try:
                        model = SpectralCoclustering(n_clusters=self.nFineClusters, random_state=self.random_state)
                        model.fit(subData.T)

                        tempCellClusterIndex = np.zeros((subData.T.shape[0],))
                        tempCellClusterIndex[:] = np.nan

                        for i, subCluster in enumerate(model.biclusters_[0]):
                            tempCellClusterIndex[np.where(subCluster)[0]] = i
                    except Exception as exception:
                        if self.verbose >= 1:
                            print(exception)
                            print('Exception', subData.shape, cellsOfCluster.shape)

                        tempCellClusterIndex = np.zeros((subData.T.shape[0],))
                else:
                    if self.verbose >= 2:
                        print('Small cluster', len(cellsOfCluster))

                    tempCellClusterIndex = np.zeros((subData.T.shape[0],))

                subSizes = np.unique(tempCellClusterIndex, return_counts=True)[1]

                if (subSizes < self.minSubclusterSize).any():
                    if self.verbose >= 2:
                        print('Fine clusters too small (less than %s)' % (self.minSubclusterSize), subSizes)

                    tempCellClusterIndex = np.zeros((subData.T.shape[0],))

                fineCellClusterIndex[cellsOfCluster] = np.array([str(cluster)[:-1] + label for label in tempCellClusterIndex.astype(int).astype(str)])

                if self.splitFineClusters:
                    for fineCluster in np.unique(fineCellClusterIndex[cellsOfCluster]):
                        cellsOfFineCluster = cellsOfCluster[fineCellClusterIndex[cellsOfCluster] == fineCluster]

                        subSubData = data[:, cellsOfFineCluster]

                        M = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(subSubData.T, metric='euclidean'))

                        subLabels = np.zeros(M.shape[0]).astype(int)
                        subSubSize = self.subSplitSize

                        subClustersSizes = []
                        tempNum = 0

                        while tempNum < len(subLabels):
                            if tempNum + 2 * subSubSize <= len(subLabels):
                                subClustersSizes.append(subSubSize)
                                tempNum += subSubSize
                            else:
                                subClustersSizes.append(len(subLabels) - tempNum)
                                tempNum += len(subLabels) - tempNum

                        for ic, subClusterSize in enumerate(subClustersSizes):
                            chosenCell = np.nanmean(M, axis=0).argmax()

                            wh = M[chosenCell].argsort()[:subClusterSize - 1]
                            M[chosenCell, wh] = np.nan

                            subLabels[wh] = [ic] * len(wh)

                        prevLabels = fineCellClusterIndex[cellsOfFineCluster]
                        addLabels = ['.' + label for label in subLabels.astype(str)]

                        fineCellClusterIndex[cellsOfFineCluster] = np.array([prevLabel + addLabel for prevLabel, addLabel in zip(prevLabels, addLabels)])

            cellClusterIndex = fineCellClusterIndex

        if 'cluster' in self._df_expr.columns.names:
            self._df_expr = self._df_expr.droplevel('cluster', axis=1)

        df_clusters = pd.DataFrame(data=cellClusterIndex, index=self._df_expr.columns)
        df_clusters.columns = ['cluster']

        df_clusters.to_hdf(self.fileHDFpath, key='df_clusters', mode='a', complevel=4, complib='zlib')

        self._df_expr = pd.concat([df_clusters, self._df_expr.T], sort=False, axis=1).reset_index().set_index(['batch', 'cell', 'cluster']).T

        return
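    # Two clustering configurations, per the class docstring: any estimator
    # with a .fit method exposing .labels_, or a dictionary that turns on
    # network-based (Louvain) clustering, which picks the number of clusters itself.
    #
    #   from sklearn.cluster import KMeans
    #   import DigitalCellSorter
    #
    #   DCS = DigitalCellSorter.DigitalCellSorter(clusteringFunction=KMeans, nClusters=8)
    #
    #   # Requires pynndescent, networkx and python-louvain:
    #   DCS = DigitalCellSorter.DigitalCellSorter(
    #       clusteringFunction={'k_neighbors': 40, 'metric': 'euclidean', 'clusterExpression': False})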
    def annotate(self, mapNonexpressedCelltypes = True):

        '''Produce cluster voting results, annotate cell types,
        and update marker expression with cell type labels

        Parameters:
            mapNonexpressedCelltypes: boolean, Default True
                If True, cell types coloring will be consistent across all datasets,
                regardless of what cell types are annotated in all datasets
                for a given input marker list file.

        Returns:
            dictionary
                Voting results, a dictionary in form of:
                {cluster label: assigned cell type}

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            results = DCS.annotate()
        '''

        self.loadExpressionData()

        if self._df_expr is None:
            return

        self.prepareMarkers(expressedGenes=self._df_expr.index if mapNonexpressedCelltypes else None)

        df_marker_cell_type = pd.read_hdf(self.fileHDFpath, key='df_marker_cell_type')

        df_markers_expr = self._df_expr.loc[self._df_expr.index.intersection(df_marker_cell_type.index)]

        if self.verbose >= 1:
            print('Selected genes common with the marker list. Data shape:', df_markers_expr.shape, flush=True)

        methodsToUse = self.annotationMethod.split('-')

        if 'pDCS' in methodsToUse:
            if self.verbose >= 1:
                print('\nCalculating results by %s method' % ('pDCS'), flush=True)

            annotationResults_pDCS = list(self.annotateWith_pDCS_Scheme(df_markers_expr.copy(), df_marker_cell_type.T.copy()))

        if 'ratio' in methodsToUse:
            if self.verbose >= 1:
                print('\nCalculating results by %s method' % ('ratio'), flush=True)

            annotationResults_ratio = list(self.annotateWith_ratio_Scheme(df_markers_expr.copy(), df_marker_cell_type.T.copy()))

        if 'Hopfield' in methodsToUse:
            if self.verbose >= 1:
                print('\nCalculating results by %s method' % ('Hopfield'), flush=True)

            annotationResults_Hopfield = list(self.annotateWith_Hopfield_Scheme(df_markers_expr.copy(), df_marker_cell_type.T.copy()))

        if ('pDCS' in methodsToUse) and ('ratio' in methodsToUse) and ('Hopfield' in methodsToUse):
            annotationResults = annotationResults_pDCS
            annotationResults[2] *= annotationResults_ratio[2]
            annotationResults[2] *= annotationResults_Hopfield[2]
            annotationResults[2] = np.power(annotationResults[2], 1. / 3.)
            annotationResults[2][annotationResults[2] < self.thresholdForUnknown] = 0.

        elif ('pDCS' in methodsToUse) and ('ratio' in methodsToUse) and (not 'Hopfield' in methodsToUse):
            annotationResults = annotationResults_pDCS
            annotationResults[2] *= annotationResults_ratio[2]
            annotationResults[2] = np.power(annotationResults[2], 1. / 2.)
            annotationResults[2][annotationResults[2] < self.thresholdForUnknown] = 0.

        elif ('pDCS' in methodsToUse) and (not 'ratio' in methodsToUse) and ('Hopfield' in methodsToUse):
            annotationResults = annotationResults_pDCS
            annotationResults[2] *= annotationResults_Hopfield[2]
            annotationResults[2] = np.power(annotationResults[2], 1. / 2.)
            annotationResults[2][annotationResults[2] < self.thresholdForUnknown] = 0.

        elif (not 'pDCS' in methodsToUse) and ('ratio' in methodsToUse) and ('Hopfield' in methodsToUse):
            annotationResults = annotationResults_ratio
            annotationResults[2] *= annotationResults_Hopfield[2]
            annotationResults[2] = np.power(annotationResults[2], 1. / 2.)
            annotationResults[2][annotationResults[2] < self.thresholdForUnknown] = 0.

        elif ('pDCS' in methodsToUse) and (not 'ratio' in methodsToUse) and (not 'Hopfield' in methodsToUse):
            annotationResults = annotationResults_pDCS

        elif (not 'pDCS' in methodsToUse) and ('ratio' in methodsToUse) and (not 'Hopfield' in methodsToUse):
            annotationResults = annotationResults_ratio

        elif (not 'pDCS' in methodsToUse) and (not 'ratio' in methodsToUse) and ('Hopfield' in methodsToUse):
            annotationResults = annotationResults_Hopfield

        else:
            if self.verbose >= 1:
                print('Annotation method %s not supported' % (self.annotationMethod), flush=True)

            return

        annotationResults = self.recordAnnotationResults(*tuple(annotationResults))

        if self.verbose >= 1:
            print(annotationResults, flush=True)
            print('Updating the expression data with annotation results', flush=True)

        df_markers_expr = df_markers_expr.T.reset_index().T
        df_markers_expr.loc['label'] = np.array([annotationResults[i] for i in df_markers_expr.loc['cluster']])

        df_markers_expr = df_markers_expr.T.set_index(['batch', 'cell', 'cluster', 'label']).T.apply(pd.to_numeric)

        df_markers_expr.to_hdf(self.fileHDFpath, key='df_markers_expr', mode='a', complevel=4, complib='zlib')

        return annotationResults
    def process(self, dataIsNormalized = False, cleanData = True):

        '''Process data before using any annotation or visualization functions

        Parameters:
            dataIsNormalized: boolean, Default False
                Whether DCS.df_expr is normalized or not

            cleanData: boolean, Default True
                Whether to clean the data with function clean()

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.process()
        '''

        # Convert index to hugo names, clean data
        if not dataIsNormalized:
            self.convert()

        if cleanData:
            self.clean()

        # Calculate QC measures
        if self.doQualityControl:
            self.calculateQCmeasures()

        # Normalize and then correct for batch effects
        if not dataIsNormalized:
            self.normalize(median=self.medianScaleFactor)

        if self.toggleDoBatchCorrection:
            self.batchEffectCorrection()

        # Calculate PCA and 2D projection
        self.project()

        # Remove low quality cells
        if self.doQualityControl:
            self.qualityControl()

        # Cluster data, append cluster index to expression DataFrame
        self.cluster()

        # Compress and record DataFrame to dense matrix
        self.recordExpressionData()

        return
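    # An end-to-end sketch of the pipeline wrapped by process(); 'data.h5' is
    # a hypothetical input saved earlier, e.g. by readMTXdata().
    #
    #   import pandas as pd
    #   import DigitalCellSorter
    #
    #   DCS = DigitalCellSorter.DigitalCellSorter(dataName='demo', saveDir='output/demo/')
    #   DCS.prepare(pd.read_hdf('data.h5', key='df'))
    #   DCS.process()        # convert, clean, QC, normalize, project, cluster
    #   DCS.annotate()       # assign cell types to clusters
    #   DCS.visualize()      # all major plots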
    def visualize(self):

        '''Aggregate of visualization tools of this class.

        Parameters:
            None

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.process()

            DCS.visualize()
        '''

        ##############################################################################################
        # Plot voting results matrix
        ##############################################################################################
        if self.toggleMakeVotingResultsMatrixPlot:
            if self.verbose >= 2:
                print('Making voting results matrix plot', flush=True)

            self.makeAnnotationResultsMatrixPlot()

        ##############################################################################################
        # Plot null distributions
        ##############################################################################################
        if self.toggleMakeHistogramNullDistributionPlot:
            if self.verbose >= 2:
                print('Making null distributions plot', flush=True)

            self.makeHistogramNullDistributionPlot()

        ##############################################################################################
        # Plot mean marker expression
        ##############################################################################################
        if self.toggleMakeMarkerExpressionPlot:
            if self.verbose >= 2:
                print('Making marker expression plot', flush=True)

            self.makeMarkerExpressionPlot()

        ##############################################################################################
        # Make 2D projection plots
        ##############################################################################################
        if self.toggleMakeProjectionPlotsQC and self.doQualityControl:
            self.makeProjectionPlotsQualityControl()

        if self.toggleMakeProjectionPlotClusters:
            self.makeProjectionPlotByClusters()

        if self.toggleMakeProjectionPlotBatches:
            self.makeProjectionPlotByBatches()

        if self.toggleMakeProjectionPlotAnnotatedClusters:
            self.makeProjectionPlotAnnotated()

        if self.toggleAnomalyScoresProjectionPlot:
            self.makeAnomalyScoresPlot()

        ##############################################################################################
        # Make stacked barplot of cell type fractions
        ##############################################################################################
        if self.toggleMakeStackedBarplot:
            self.makeStackedBarplot(self.subclusteringName)

        ##############################################################################################
        # Make projection plots showing relative expression of different markers (one for each marker)
        ##############################################################################################
        if self.toggleGetMarkerSubplots:
            self.makeMarkerSubplots()

        return None
# Plotting user functions of class #################################################
    def makeProjectionPlotAnnotated(self, **kwargs):

        '''Produce projection plot colored by cell types

        Parameters:
            Any parameters that function 'makeProjectionPlot' can accept

        Returns:
            matplotlib.figure.Figure

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.process()

            DCS.makeProjectionPlotAnnotated()
        '''

        if self.verbose >= 3:
            print('Making projection plot by clusters with annotated labels')

        df_markers_expr = pd.read_hdf(self.fileHDFpath, key='df_markers_expr')

        df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')
        df_projection = df_projection[df_markers_expr.groupby(level=['batch', 'cell'], sort=False, axis=1).count().columns]

        with open(os.path.join(self.saveDir, 'ColormapForCellTypes.txt'), 'r') as temp_file:
            colormap = {item.strip().split('\t')[0]: eval(item.strip().split('\t')[1]) for item in temp_file.readlines()}

        kwargs.setdefault('suffix', 'by_clusters_annotated')
        kwargs.setdefault('colormap', colormap)
        kwargs.setdefault('legend', False)

        fig = self.makeProjectionPlot(df_projection.values, df_markers_expr.columns.get_level_values('label'), **kwargs)

        return fig
    def makeProjectionPlotByBatches(self, **kwargs):

        '''Produce projection plot colored by batches

        Parameters:
            Any parameters that function 'makeProjectionPlot' can accept

        Returns:
            matplotlib.figure.Figure

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.process()

            DCS.makeProjectionPlotByBatches()
        '''

        if self.verbose >= 3:
            print('Making projection plot by batches')

        df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')

        batches = df_projection.columns.get_level_values('batch').values

        kwargs.setdefault('legend', True)
        kwargs.setdefault('labels', False)
        kwargs.setdefault('suffix', 'by_patients')

        fig = self.makeProjectionPlot(df_projection.values, batches, **kwargs)

        return fig
    def makeProjectionPlotByClusters(self, **kwargs):

        '''Produce projection plot colored by clusters

        Parameters:
            Any parameters that function 'makeProjectionPlot' can accept

        Returns:
            matplotlib.figure.Figure

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.process()

            DCS.makeProjectionPlotByClusters()
        '''

        if self.verbose >= 3:
            print('Making projection plot by clusters')

        df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')
        df_clusters = pd.read_hdf(self.fileHDFpath, key='df_clusters').reindex(df_projection.columns)

        kwargs.setdefault('suffix', 'by_clusters')

        labels = np.array(['Cluster #%s' % (label[0]) if label == label else label for label in df_clusters.values])

        fig = self.makeProjectionPlot(df_projection.values, labels, **kwargs)

        return fig
    def makeProjectionPlotsQualityControl(self, **kwargs):

        '''Produce Quality Control projection plots

        Parameters:
            Any parameters that function 'makeProjectionPlot' can accept

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.process()

            DCS.makeProjectionPlotsQualityControl()
        '''

        def reduce(v, size = 100):
            bins = np.linspace(np.min(v), np.max(v), num=size)

            return bins[np.digitize(v, bins) - 1]

        if self.verbose >= 3:
            print('Making projection plots of QC', flush=True)

        df_QC = pd.read_hdf(self.fileHDFpath, key='QC')

        if self.toggleRemoveLowQualityCells:
            df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection_pre_QC')
        else:
            df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')

        goodQualityCells = pd.read_hdf(self.fileHDFpath, key='df_projection').columns

        kwargs.setdefault('legend', False)
        kwargs.setdefault('labels', False)
        kwargs.setdefault('colorbar', True)

        kwargs.update(suffix='by_number_of_genes')
        self.makeProjectionPlot(df_projection.values, reduce(df_QC['number_of_genes'].values), **kwargs)

        kwargs.update(suffix='by_count_depth')
        self.makeProjectionPlot(df_projection.values, reduce(df_QC['count_depth'].values), **kwargs)

        kwargs.update(suffix='by_fraction_of_mitochondrialGenes')
        self.makeProjectionPlot(df_projection.values, reduce(df_QC['fraction_of_mitochondrialGenes'].values), **kwargs)

        kwargs.update(suffix='by_is_quality_cell')
        kwargs.update(colorbar=False)
        self.makeProjectionPlot(df_projection.values, df_projection.columns.isin(goodQualityCells.get_level_values('cell'), level='cell'), **kwargs)

        return
    def makeMarkerSubplots(self, **kwargs):

        '''Produce subplots on each marker and its expression on all clusters

        Parameters:
            Any parameters that function 'internalMakeMarkerSubplots' can accept

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.process()

            DCS.makeMarkerSubplots()
        '''

        df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')
        df_markers_expr = pd.read_hdf(self.fileHDFpath, key='df_markers_expr')

        df_projection = df_projection[pd.MultiIndex.from_arrays([df_markers_expr.columns.get_level_values('batch'), df_markers_expr.columns.get_level_values('cell')])]

        hugo_cd_dict = dict(zip(df_markers_expr.index.values.tolist(), self.gnc.Convert(list(df_markers_expr.index), 'hugo', 'alias', returnUnknownString=False)))

        kwargs.setdefault('analyzeBy', 'label')
        kwargs.setdefault('NoFrameOnFigures', True)
        kwargs.setdefault('HideClusterLabels', False)
        kwargs.setdefault('outlineClusters', True)

        self.internalMakeMarkerSubplots(df_markers_expr, df_projection.values, hugo_cd_dict, **kwargs)

        return None
    def makeAnomalyScoresPlot(self, cells = 'All', suffix = '', noPlot = False, **kwargs):

        '''Make anomaly scores plot

        Parameters:
            cells: pandas.MultiIndex, Default 'All'
                Index of cells of interest

            suffix: str, Default ''
                Suffix for the saved anomaly scores file

            noPlot: boolean, Default False
                Whether to skip plotting and only save the scores

            Any parameters that function 'makeProjectionPlot' can accept

        Returns:
            matplotlib.figure.Figure

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.process()

            cells = DCS.getCells(celltype='T cell')

            DCS.makeAnomalyScoresPlot(cells)
        '''

        if self._df_expr is None:
            self.loadExpressionData()

        if type(cells) is str:
            if cells == 'All':
                cells = pd.MultiIndex.from_arrays([self._df_expr.columns.get_level_values('batch'),
                                                   self._df_expr.columns.get_level_values('cell')], names=['batch', 'cell'])
            else:
                if self.verbose >= 1:
                    print('"cells" value "%s" is unknown' % (cells))

                raise ValueError

        df_sel = self.getExprOfCells(cells)
        df_sel.columns = pd.MultiIndex.from_arrays([df_sel.columns.get_level_values('batch'),
                                                    df_sel.columns.get_level_values('cell')], names=['batch', 'cell'])

        scores = self.getAnomalyScores(df_sel, df_sel)
        scores = pd.DataFrame(index=df_sel.columns, data=scores)
        scores.columns = ['score']

        scores.sort_values(by='score', ascending=False).to_excel(os.path.join(self.saveDir, 'anomalyScores%s.xlsx' % (suffix)), merge_cells=False)

        if noPlot:
            return

        df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')

        scores = scores.reindex(df_projection.columns).values.T[0]

        kwargs.setdefault('suffix', 'by_anomaly_score')
        kwargs.setdefault('legend', False)
        kwargs.setdefault('labels', False)
        kwargs.setdefault('colorbar', True)

        fig = self.makeProjectionPlot(df_projection.values, scores, **kwargs)

        return fig
    def makeIndividualGeneTtestPlot(self, gene, analyzeBy = 'label', **kwargs):

        '''Produce individual gene t-test plot of the two-tailed p-value.

        Parameters:
            gene: str
                Name of gene of interest

            analyzeBy: str, Default 'label'
                What level of labels to include.
                Other possible options are 'cluster' and 'celltype'

            Any parameters that function 'makeTtestPlot' can accept

        Returns:
            matplotlib.figure.Figure

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.makeIndividualGeneTtestPlot('SDC1')
        '''

        df_markers_expr = self.getExprOfGene(gene, analyzeBy=analyzeBy)

        if df_markers_expr is None:
            return None

        groups = np.unique(df_markers_expr.columns.get_level_values(analyzeBy).values)

        ttestStatistic = pd.DataFrame(index=groups, columns=groups)
        ttestpValue = pd.DataFrame(index=groups, columns=groups)

        for groupA in groups:
            for groupB in groups:
                A = df_markers_expr.xs(key=groupA, level=analyzeBy, axis=1).values.squeeze()
                B = df_markers_expr.xs(key=groupB, level=analyzeBy, axis=1).values.squeeze()

                ttestStatistic.loc[groupA, groupB], ttestpValue.loc[groupA, groupB] = scipy.stats.ttest_ind(A[np.where(A != 0)], B[np.where(B != 0)])

        alt = self.gnc.Convert([gene], 'hugo', 'alias', returnUnknownString=False)[0]
        alt = [alt] if type(alt) is str else alt

        kwargs.setdefault('label', '%s\n(%s)' % (gene, ('\n').join(list(alt))))

        fig = self.makeTtestPlot(ttestStatistic, ttestpValue, **kwargs)

        return fig
    def makeIndividualGeneExpressionPlot(self, genes, **kwargs):

        '''Produce individual gene expression plot on a 2D layout

        Parameters:
            genes: str, or list-like
                Name(s) of gene(s) of interest. E.g. 'CD4, CD33', 'PECAM1', ['CD4', 'CD33']

            hideClusterLabels: boolean, Default False
                Whether to hide the cluster labels

            outlineClusters: boolean, Default True
                Whether to outline the clusters with circles

            Any parameters that function 'internalMakeMarkerSubplots' can accept

        Returns:
            None

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.makeIndividualGeneExpressionPlot('CD4')
        '''

        kwargs.setdefault('analyzeBy', 'cluster')
        kwargs.setdefault('NoFrameOnFigures', True)
        kwargs.setdefault('HideClusterLabels', False)
        kwargs.setdefault('outlineClusters', True)

        df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection')

        if type(genes) is str:
            genes = genes.replace(' ', '').split(',')

        for gene in genes:
            df_markers_expr = self.getExprOfGene(gene, analyzeBy=kwargs['analyzeBy'])

            if not df_markers_expr is None:
                hugo_cd_dict = {gene: self.gnc.Convert([gene], 'hugo', 'alias', returnUnknownString=False)[0]}

                cells = pd.MultiIndex.from_arrays([df_markers_expr.columns.get_level_values('batch'),
                                                   df_markers_expr.columns.get_level_values('cell')], names=['batch', 'cell'])

                self.internalMakeMarkerSubplots(df_markers_expr, df_projection.reindex(cells, axis=1).values, hugo_cd_dict, **kwargs)

        return None
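    # Input flexibility of makeIndividualGeneExpressionPlot, per its docstring;
    # gene names shown are illustrative:
    #
    #   DCS.makeIndividualGeneExpressionPlot('CD4, CD33')        # comma-separated string
    #   DCS.makeIndividualGeneExpressionPlot(['CD19', 'MS4A1'])  # or a list of genes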
    def makeHopfieldLandscapePlot(self, meshSamplingRate = 1000, plot3D = True, reuseData = False, **kwargs):

        '''Make and plot Hopfield landscape

        Parameters:
            meshSamplingRate: int, Default 1000
                Defines quality of sampling around attractor states

            plot3D: boolean, Default True
                Whether to plot a 3D (True) or 2D (False) figure

            reuseData: boolean, Default False
                Whether to attempt using precalculated data

            Any parameters that function 'HopfieldLandscapePlot' or 'HopfieldLandscapePlot3D' can accept

        Returns:
            matplotlib.figure.Figure

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.makeHopfieldLandscapePlot()
        '''

        dataPath = os.path.join(self.saveDir, 'Hopfiled_mode_1')

        if reuseData and (not os.path.exists(dataPath)):
            reuseData = False

        if not reuseData:
            df_attrs = self.readMarkerFile()

            self.propagateHopfield(xi=df_attrs.values, typesNames=df_attrs.columns.values, path=dataPath, meshSamplingRate=meshSamplingRate, mode=1)

        if plot3D:
            fig = self.HopfieldLandscapePlot3D(trPath=dataPath, **kwargs)
        else:
            fig = self.HopfieldLandscapePlot(trPath=dataPath, colorbarva=0.1, colorbarha=0.82, fontsize=17, **kwargs)

        return fig
# Extraction user functions of class ###############################################
    def getAnomalyScores(self, trainingSet, testingSet, printResults = False):

        '''Get anomaly scores of cells based on some reference set

        Parameters:
            trainingSet: pandas.DataFrame
                With cells to train the isolation forest on

            testingSet: pandas.DataFrame
                With cells to score

            printResults: boolean, Default False
                Whether to print results

        Returns:
            1d numpy.array
                Anomaly score(s) of tested cell(s)

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            cutoff = DCS.getAnomalyScores(df_expr.iloc[:, 5:], df_expr.iloc[:, :5])
        '''

        if self.verbose >= 1:
            print('Calculating anomaly scores of the selected cells', flush=True)

        instanceIsolationForest = IsolationForest(n_jobs=self.availableCPUsCount, max_samples=np.min([trainingSet.shape[1] - 1, 256]))

        if self.verbose >= 2:
            print('Training isolation forest')

        instanceIsolationForest.fit(trainingSet.values.T)

        if type(testingSet) is pd.Series:
            testingSet = testingSet.to_frame()

        if self.verbose >= 2:
            print('Scoring samples')

        scores = instanceIsolationForest.score_samples(testingSet.values.T)
        scores *= -1.

        results = list(zip(testingSet.columns, scores))

        if printResults:
            for cell in results:
                print('Cell:', cell[0], 'Score:', cell[1])

        return scores
    def getHugoName(self, gene, printAliases = False):

        '''Get gene hugo name(s).

        Parameters:
            gene: str
                'hugo' or 'alias' name of a gene

        Returns:
            str
                Hugo name if found, otherwise the input name

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.getHugoName('CD138')
        '''

        return self.gnc.Convert(gene, 'alias', 'hugo', returnUnknownString=False)
    def getExprOfGene(self, gene, analyzeBy = 'cluster'):

        '''Get expression of a gene.
        Run this function only after function process()

        Parameters:
            gene: str
                Name of the gene of interest

            analyzeBy: str, Default 'cluster'
                What level of labels to include.
                Other possible options are 'label' and 'celltype'

        Returns:
            pandas.DataFrame
                With expression of the cells of interest

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.process()

            DCS.getExprOfGene('SDC1')
        '''

        fileName = self.fileHDFpath

        if not os.path.isfile(fileName):
            if self.verbose >= 1:
                print('Processed data file not found')

            return

        try:
            df_markers_expr = pd.read_hdf(fileName, key='df_expr').xs(key=gene, axis=0, level=-1).T
        except Exception as exception:
            if self.verbose >= 2:
                print(exception)
                print('Gene %s not in index' % (gene))

            return

        df_markers_expr.index = [gene]

        df_markers_expr.columns = pd.MultiIndex.from_arrays([df_markers_expr.columns.get_level_values('batch'), df_markers_expr.columns.get_level_values('cell')])

        labelled = pd.read_hdf(self.fileHDFpath, key='df_markers_expr').columns

        columns = pd.MultiIndex.from_arrays([labelled.get_level_values('batch'), labelled.get_level_values('cell')])

        df_markers_expr = df_markers_expr.reindex(columns, axis=1).fillna(0.)

        if analyzeBy == 'celltype':
            analyzeBy = 'label'

            columns = labelled.to_series().reset_index().set_index(['batch', 'cell'])[analyzeBy].loc[df_markers_expr.columns].reset_index().set_index(['batch', 'cell', analyzeBy]).index

            df_markers_expr.columns = pd.MultiIndex.from_arrays([columns.get_level_values('batch'),
                                                                 columns.get_level_values('cell'),
                                                                 columns.get_level_values(analyzeBy).str.split(' #', expand=True).get_level_values(0)],
                                                                names=['batch', 'cell', 'celltype'])
        else:
            df_markers_expr.columns = labelled.to_series().reset_index().set_index(['batch', 'cell'])[analyzeBy].loc[df_markers_expr.columns].reset_index().set_index(['batch', 'cell', analyzeBy]).index

        return df_markers_expr
    def getExprOfCells(self, cells):

        '''Get expression of a set of cells.
        Run this function only after function process()

        Parameters:
            cells: pandas.MultiIndex
                2-level index of cells of interest; must include levels 'batch' and 'cell'

        Returns:
            pandas.DataFrame
                With expression of the cells of interest

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.process()

            DCS.getExprOfCells(cells)
        '''

        if self._df_expr is None:
            self.loadExpressionData()

        index = pd.MultiIndex.from_arrays([self._df_expr.columns.get_level_values('batch'),
                                           self._df_expr.columns.get_level_values('cell')], names=['batch', 'cell'])

        if cells.reindex(index)[1] is None:
            if self.verbose >= 2:
                print('Selected expression of all cells')

            return self.df_expr

        return self._df_expr[self._df_expr.columns[cells.reindex(index)[1] > -1]]
    def getCells(self, celltype = None, clusterIndex = None, clusterName = None):

        '''Get cell annotations in a form of pandas.Series

        Parameters:
            celltype: str, Default None
                Cell type to extract

            clusterIndex: int, Default None
                Cluster index to extract

            clusterName: str, Default None
                Cluster name to extract

        Returns:
            pandas.MultiIndex
                Index of labelled cells

        Usage:
            DCS = DigitalCellSorter.DigitalCellSorter()

            DCS.process()

            labels = DCS.getCells()
        '''

        df = pd.read_hdf(self.fileHDFpath, key='df_markers_expr').columns.to_frame()

        if not clusterIndex is None:
            se = df.set_index(['batch', 'cell'])['cluster']

            condition = se == clusterIndex

            if not condition.any():
                if self.verbose >= 1:
                    print('No cells found')

                return None

            selectedCells = se[condition].index

            if self.verbose >= 2:
                print('Selected %s cells from cluster: %s' % (len(selectedCells), clusterIndex))

            return selectedCells

        if not clusterName is None:
            se = df.set_index(['batch', 'cell'])['cluster']

            condition = se == eval(clusterName.split('#')[1])

            if not condition.any():
                if self.verbose >= 1:
                    print('No cells found')

                return None

            selectedCells = se[condition].index

            if self.verbose >= 2:
                print('Selected %s cells from cluster: %s' % (len(selectedCells), clusterName))

            return selectedCells

        se = df.set_index(['batch', 'cell'])['label']

        if celltype is None and clusterIndex is None:
            if self.verbose >= 1:
                print('Available cell types:', np.unique(se.str.split(' #', expand=True)[0].values))
                print('Note: To get a specific cell type call this function with a specified cell type')

            return se

        condition = se.str.find(celltype) != -1

        if not condition.any():
            if self.verbose >= 1:
                print('No cells found')

            return None

        selectedCells = se[condition].index

        if self.verbose >= 2:
            print('Selected %s cells of type: %s' % (len(selectedCells), celltype))

        return selectedCells
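    # Combining the extraction functions after processing and annotation; the
    # cell type name is illustrative and must exist in the marker list used.
    #
    #   cells = DCS.getCells(celltype='B cell')          # ('batch', 'cell') MultiIndex
    #   df_B = DCS.getExprOfCells(cells)                 # expression of just these cells
    #   DCS.makeAnomalyScoresPlot(cells, suffix='_Bcells')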
[docs] def getIndexOfGoodQualityCells(self, QCplotsSubDir = 'QC_plots', **kwargs): '''Get index of cells that satisfy the QC criteria Parameters: count_depth_cutoff: float, Default 0.5 Fraction of median to take as count depth cutoff number_of_genes_cutoff: float, Default 0.5 Fraction of median to take as number of genes cutoff mitochondrial_genes_cutoff: float, Default 3.0 The cutoff is median + standard_deviation * this_parameter Any parameters that function 'makeQualityControlHistogramPlot' can accept Returns: pandas.Index Index of cells Usage: DCS = DigitalCellSorter.DigitalCellSorter() index = DCS.getIndexOfGoodQualityCells() ''' df_QC = pd.read_hdf(self.fileHDFpath, key='QC') if not self.excludedFromQC is None: df_QC = df_QC.loc[df_QC.index.difference(self.excludedFromQC)] plotsDir = os.path.join(self.saveDir, QCplotsSubDir, '') kwargs.setdefault('MakeHistogramPlot', True) # Calculate cutoffs cutoff_count_depth = self.getQualityControlCutoff(df_QC['count_depth'], self.countDepthCutoffQC, plotPathAndName=os.path.join(plotsDir, '%s_count_depth' % (self.dataName)), precut=self.countDepthPrecutQC, **kwargs) cutoff_number_of_genes = self.getQualityControlCutoff(df_QC['number_of_genes'], self.numberOfGenesCutoffQC, plotPathAndName=os.path.join(plotsDir, '%s_number_of_genes' % (self.dataName)), precut=self.numberOfGenesPrecutQC, **kwargs) cutoff_fraction_of_mitochondrialGenes = self.getQualityControlCutoff(df_QC['fraction_of_mitochondrialGenes'], self.mitochondrialGenesCutoffQC, plotPathAndName=os.path.join(plotsDir, '%s_fraction_of_mitochondrialGenes' % (self.dataName)), mito=True, **kwargs) df_QC['isGoodQuality'] = np.zeros(len(df_QC)).astype(bool) mask = (df_QC['count_depth'] > cutoff_count_depth) & \ (df_QC['number_of_genes'] > cutoff_number_of_genes) & \ (df_QC['fraction_of_mitochondrialGenes'] < cutoff_fraction_of_mitochondrialGenes) index = df_QC['isGoodQuality'].index[mask] if not self.excludedFromQC is None: index = pd.concat([pd.Series(index=index), pd.Series(index=self.excludedFromQC)]).index return index.sort_values()
[docs] def getQualityControlCutoff(self, se, cutoff, precut = 1., mito = False, MakeHistogramPlot = True, **kwargs): '''Function to calculate QC quality cutoff Parameters: se: pandas.Series With data to analyze cutoff: float Parameter for calculating the quality control cutoff precut: float, Default 1. Threshold below which values are excluded from the median when self.precutQC is True; the resulting cutoff is also capped at twice this value mito: boolean, Default False Whether the analysis is of the mitochondrial genes fraction plotPathAndName: str, Default None Text to include in the figure title and file name MakeHistogramPlot: boolean, Default True Whether to make a histogram plot Any parameters that function 'makeQualityControlHistogramPlot' can accept Returns: float Cutoff value Usage: DCS = DigitalCellSorter.DigitalCellSorter() cutoff = DCS.getQualityControlCutoff(se) ''' subset = se[~np.isnan(se.values)].values if mito: median = np.round(np.median(subset),4) std = np.round(np.std(subset),4) hist_of_subset = scipy.stats.rv_histogram(np.histogram(subset, bins=100, range=(0, 0.3))) hist_data = hist_of_subset._hpdf / 100 hist_bins = hist_of_subset._hbins xs = np.linspace(hist_bins[0], hist_bins[-1], 1000) spline_data = np.vstack((xs, UnivariateSpline(hist_bins, hist_data[:-1], k=5, s=0)(xs))).T smoothed = scipy.signal.savgol_filter(spline_data.T[1], 101, 3) x1 = spline_data.T[0][np.where(spline_data.T[0] > median)[0][0]] y1 = smoothed[np.where(spline_data.T[0] > median)[0][0]] wh = np.where(spline_data.T[0] > (median + cutoff * std))[0][0] x2 = spline_data.T[0][wh] y2 = smoothed[wh] cutoff = min(median + 3. * std, x1 - y1 * (x2 - x1) / (y2 - y1)) cutoff = max(median + 1. * std, cutoff) else: if self.precutQC: cutoff = int(cutoff * np.median(subset[subset > precut])) cutoff = min(cutoff, 2 * precut) else: cutoff = int(cutoff * np.median(subset)) kwargs.setdefault('mito', mito) if MakeHistogramPlot: self.makeQualityControlHistogramPlot(subset, cutoff, **kwargs) return cutoff
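To illustrate the non-mitochondrial branch of the cutoff logic, here is a standalone re-implementation on synthetic count depths (for illustration only; the class method additionally handles pre-cuts and plotting):

    import numpy as np
    import pandas as pd

    np.random.seed(0)
    se = pd.Series(np.random.lognormal(mean=8., sigma=0.5, size=10000))  # synthetic count depths
    count_depth_cutoff = 0.5                  # fraction of the median, as in countDepthCutoffQC
    cutoff = int(count_depth_cutoff * np.median(se.values))
    print('Cells passing:', (se > cutoff).sum(), 'of', len(se))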
[docs] def getCountsDataframe(self, se1, se2, tagForMissing = 'N/A'): '''Get a pandas.DataFrame with cross-counts (overlaps) between two pandas.Series Parameters: se1: pandas.Series Series with the first set of items se2: pandas.Series Series with the second set of items tagForMissing: str, Default 'N/A' Label to assign to non-overlapping items Returns: pandas.DataFrame Contains counts Usage: DCS = DigitalCellSorter.DigitalCellSorter() df = DCS.getCountsDataframe(se1, se2) ''' df = self.alignSeries(se1, se2, tagForMissing) counts = {group[0]:{k:len(v) for k, v in group[1].groupby(by='se1').groups.items()} for group in df.reset_index(drop=True).set_index('se2').groupby('se2')} df = pd.DataFrame.from_dict(counts).fillna(0.0).astype(int) moveTag = lambda df: pd.concat([df.iloc[np.where(df.index != tagForMissing)[0]], df.iloc[np.where(df.index == tagForMissing)[0]]], axis=0, sort=False) if tagForMissing in df.index else df return moveTag(moveTag(df.T).T)
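A small sketch of cross-counting two annotations with getCountsDataframe (the cell identifiers and labels here are hypothetical):

    import pandas as pd
    import DigitalCellSorter

    DCS = DigitalCellSorter.DigitalCellSorter()
    se1 = pd.Series(['T cell', 'T cell', 'B cell'], index=['c1', 'c2', 'c3'])            # e.g. annotations
    se2 = pd.Series(['cluster 0', 'cluster 1', 'cluster 1'], index=['c1', 'c2', 'c4'])   # e.g. clusters
    df_counts = DCS.getCountsDataframe(se1, se2)   # items missing from either series are tagged 'N/A'
    print(df_counts)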
[docs] def getNewMarkerGenes(self, cluster = None, top = 100, zScoreCutoff = None, removeUnknown = False, **kwargs): '''Extract new marker genes based on the cluster annotations Parameters: cluster: int, Default None Cluster #, if provided genes of only this cluster will be returned top: int, Default 100 Upper bound for number of new markers per cell type zScoreCutoff: float, Default None Lower bound for a marker z-score to be significant; if None, self.zScoreCutoff is used removeUnknown: boolean, Default False Whether to remove type "Unknown" Any parameters that function 'makePlotOfNewMarkers' can accept Returns: None Usage: DCS = DigitalCellSorter.DigitalCellSorter() DCS.getNewMarkerGenes() ''' if zScoreCutoff is None: zScoreCutoff = self.zScoreCutoff if self._df_expr is None: self.loadExpressionData() if self._df_expr is None: return df_gene_cluster_centroids = self._df_expr.groupby(level=['cluster'], sort=True, axis=1).mean() df_gene_cluster_centroids_merged = pd.DataFrame() df_votingResults = pd.read_excel(os.path.join(self.saveDir, self.dataName + '_annotation.xlsx'), sheet_name='z-scores', index_col='cluster')[['# cells in cluster', 'Predicted cell type']] groups = pd.concat([df_votingResults['# cells in cluster'], df_votingResults['Predicted cell type'].str.split(' #', expand=True)[0]], sort=False, axis=1).reset_index().set_index(0).groupby(by=0) for group in groups: se_cluster_size = group[1].reset_index().set_index('cluster')['# cells in cluster'] se_type = (df_gene_cluster_centroids[se_cluster_size.index] * se_cluster_size.values).sum(axis=1) / se_cluster_size.values.sum() se_type.name = group[0] df_gene_cluster_centroids_merged = pd.concat([df_gene_cluster_centroids_merged, se_type], sort=False, axis=1) df_gene_cluster_centroids_merged = df_gene_cluster_centroids_merged.apply(self.zScoreOfSeries, axis=1) if not cluster is None: clusterGenes = df_gene_cluster_centroids[cluster].sort_values(ascending=False) clusterGenes = clusterGenes[clusterGenes >= zScoreCutoff].iloc[:top] return {'genes':clusterGenes.index.values.tolist(), 'zscore':clusterGenes.values.tolist()} if removeUnknown and self.nameForUnknown in df_gene_cluster_centroids_merged.columns: df_gene_cluster_centroids_merged.drop(columns=[self.nameForUnknown], inplace=True) df_new_marker_genes = df_gene_cluster_centroids_merged.T df_new_marker_genes[df_new_marker_genes < 0.] = 0. if self.verbose >= 2: print('New marker cell type shape:', df_new_marker_genes.shape) df_new_marker_genes = df_new_marker_genes.T.loc[df_new_marker_genes.max(axis=0) >= zScoreCutoff].T if self.verbose >= 2: print('New marker cell type shape:', df_new_marker_genes.shape) df_new_marker_genes = df_new_marker_genes.loc[df_new_marker_genes.max(axis=1) >= zScoreCutoff] if self.verbose >= 2: print('New marker cell type shape:', df_new_marker_genes.shape) df_new_marker_list = pd.DataFrame() for i, celltype in enumerate(df_new_marker_genes.index.values): se_all = df_new_marker_genes.loc[celltype] sel = se_all[se_all > 1.0].sort_values()[:top] if self.verbose >= 3: print('Candidates of %s:' % (celltype), len(sel)) df_new_marker_list = pd.concat([df_new_marker_list, sel], sort=False, axis=1) df_new_marker_list = df_new_marker_list.fillna(0.).sort_index() df_new_marker_list[df_new_marker_list > 0.] = 1.0 if self.nameForUnknown in df_new_marker_list.columns: df_new_marker_list.drop(columns=[self.nameForUnknown], inplace=True) df_new_marker_list.index.name = 'Marker' fileName = os.path.join(self.saveDir, 'new_markers.xlsx') if self.verbose >= 3: print('Recording results to:', fileName) writer = pd.ExcelWriter(fileName) df_new_marker_list.to_excel(writer, 'MarkerCellType') df_temp = pd.Series(data=df_new_marker_list.columns, index=df_new_marker_list.columns) df_temp.name = 'CellTypeGrouped' df_temp.index.name = 'CellType' df_temp.to_excel(writer, 'CellTypesGrouped') writer.save() df_marker_cell_type = pd.read_excel(os.path.join(self.saveDir, self.dataName + '_annotation.xlsx'), sheet_name='Marker cell type weight matrix', index_col=0).T df_new_marker_genes = df_new_marker_genes[df_new_marker_list.index] self.makePlotOfNewMarkers(df_marker_cell_type, df_new_marker_genes, **kwargs) return None
# Annotation functions of class ####################################################
[docs] @classmethod def calculateV(cls, args): '''Calculate the voting scores (celltypes by clusters) Parameters: args: tuple Tuple of sub-arguments df_M: pandas.DataFrame Marker cell type DataFrame df_X: pandas.DataFrame Markers expression DataFrame cluster_index: 1d numpy.array Clustering index cutoff: float Significance cutoff, i.e. a threshold for a given marker to be significant giveSignificant: boolean Whether to return the significance matrix along with the scores removeLowQCscores: boolean Whether to remove low quality scores, i.e. those supported by fewer than 10% of the cell type's markers Returns: pandas.DataFrame Contains voting scores per celltype per cluster Usage: Function is used internally. df = calculateV((df_M, df_X, cluster_index, 0.3, False, True)) ''' df_M, df_X, cluster_index, cutoff, giveSignificant, removeLowQCscores = args df_Y = pd.DataFrame(data=df_X.values, index=df_X.index, columns=cluster_index).groupby(level=0, sort=True, axis=1).mean() if True: df_Z = (df_Y.copy() - np.mean(df_Y.values, axis=1)[:, None]) / np.std(df_Y.values, axis=1)[:, None] df_Z = 1. * (df_Z > cutoff) else: df_Z = df_Y > ((1. + cutoff) * np.mean(df_Y.values, axis=1)[:, None]) df_V = pd.Series(df_M.dot(df_Z).stack().fillna(0.)) # Remove scores with small number of supporting markers if removeLowQCscores: df_Q = (1. * (df_M > 0.)).dot(1. * (df_Z > 0.)) < (0.1 * (df_M > 0.).sum(axis=1)[:, None]).astype(int) #df_Q = (1. * (df_M > 0.)).dot(1. * (df_Z > 0.)) < (0.1 * (1. * (df_Z > 0.)).sum(axis=0)[None, :]).astype(int) df_V[pd.Series(df_Q.stack().fillna(0.))] = 0. if giveSignificant: return df_V, df_Z return df_V
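A toy call illustrating the scoring on two clusters and two cell types (shapes only; the random values are not meaningful):

    import numpy as np
    import pandas as pd
    import DigitalCellSorter

    np.random.seed(0)
    df_M = pd.DataFrame([[1., 0.], [0., 1.]], index=['Type A', 'Type B'], columns=['gene1', 'gene2'])
    df_X = pd.DataFrame(np.random.rand(2, 6), index=['gene1', 'gene2'])   # markers by cells
    cluster_index = np.array([0, 0, 0, 1, 1, 1])                          # cluster of each cell
    df_V = DigitalCellSorter.DigitalCellSorter.calculateV((df_M, df_X, cluster_index, 0.3, False, False))
    print(df_V.unstack())   # cell types in rows, clusters in columns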
[docs] def annotateWith_pDCS_Scheme(self, df_markers_expr, df_marker_cell_type): '''Produce cluster annotation results Parameters: df_markers_expr: pandas.DataFrame Data with marker genes by cells expression df_marker_cell_type: pandas.DataFrame Data with marker genes by cell types Returns: tuple Usage: Function should be called internally only ''' df_positive = df_marker_cell_type.copy() df_negative = df_marker_cell_type.copy() df_positive[df_marker_cell_type < .0] = 0. df_negative[df_marker_cell_type > .0] = 0. df_positive /= (df_positive > 0.).sum(axis=0).fillna(1.).replace(0., 1.).replace(0, 1.) df_negative /= (df_negative < 0.).sum(axis=0).fillna(1.).replace(0., 1.).replace(0, 1.) df_marker_cell_type = df_positive + df_negative if self.verbose >= 2: print(df_marker_cell_type.shape) # Align df_markers_expr and df_marker_cell_type df_markers_expr.sort_index(inplace=True, axis=0) df_marker_cell_type.sort_index(inplace=True, axis=1) # Calculate score (Vkc) for the best clustering temp = self.calculateV((df_marker_cell_type, df_markers_expr, df_markers_expr.columns.get_level_values('cluster').values, self.zScoreCutoff, True, self.removeLowQualityScores)) df_V = temp[0].unstack() df_V.index.name = None df_Z_best = temp[1] dict_expressed_markers = {cluster:df_Z_best.index[df_Z_best[cluster] > 0].values for cluster in df_Z_best} np.random.seed(0) df_markers_expr_copy = df_markers_expr.copy() uclusters = np.unique(df_markers_expr_copy.columns.get_level_values(2).astype(str)) uindex = pd.Series(df_markers_expr_copy.columns.get_level_values(2).astype(str)).replace(dict(zip(uclusters, range(len(uclusters))))).values df_markers_expr_copy.columns = pd.MultiIndex.from_arrays([df_markers_expr_copy.columns.get_level_values(0), df_markers_expr_copy.columns.get_level_values(1), uindex], names=df_markers_expr_copy.columns.names) batch_size = self.availableCPUsCount * 80 if self.nSamples_pDCS < batch_size: self.nSamples_pDCS = batch_size if self.verbose >= 2: print('Distribution size too small. Increasing it to minimum batch size: %s' % (self.nSamples_pDCS)) N_batches = int(self.nSamples_pDCS / batch_size) if self.nSamples_pDCS != batch_size * N_batches: self.nSamples_pDCS = batch_size * N_batches if self.verbose >= 2: print('Adjusting distribution size: %s' % (self.nSamples_pDCS)) # Generate random cluster index randomClusterIndex = np.vstack([np.random.choice(df_markers_expr_copy.columns.get_level_values('cluster'), size=df_markers_expr_copy.shape[1], replace=False) for i in range(self.nSamples_pDCS)]).astype(int) # Generate random cluster configurations and calculate scores (Pkc) of those if self.verbose >= 2: print('Generating null distribution') def process_batch(batch_range): if self.verbose >= 2: print('\t', batch_range) tuples = [(df_marker_cell_type, df_markers_expr_copy, randomClusterIndex[i], self.zScoreCutoff, False, False) for i in batch_range] pool = multiprocessing.Pool(processes = self.availableCPUsCount) temp_random_df_V = pd.concat(pool.map(self.calculateV, tuples), sort=False, axis=1) pool.close() pool.join() return temp_random_df_V random_df_V = pd.concat([process_batch(range(i * batch_size,(i + 1) * batch_size)) for i in range(N_batches)], sort=False, axis=1) random_df_V.index = pd.MultiIndex.from_arrays([random_df_V.index.get_level_values(0), pd.Series(random_df_V.index.get_level_values(1)).replace(dict(zip(range(len(uclusters)), uclusters))).values], names=[random_df_V.index.names[0], 'cluster']) random_df_V = random_df_V.replace(0., np.nan) min_value = np.nanmin(random_df_V.values) max_value = np.nanmax(random_df_V.values) if self.verbose >= 2: print('Min:', min_value, '\t', 'Max:', max_value) Nbins = 300 # Calculate null distribution histograms data for plots df_null_distributions = random_df_V.apply(lambda s: scipy.stats.rv_histogram(np.histogram(s, bins=Nbins, range=(min_value,max_value)))._hpdf / Nbins, axis=1).apply(pd.Series).T df_null_distributions.columns.names = ['CellType', 'Cluster'] df_null_distributions.index = [min_value] + (scipy.stats.rv_histogram(np.histogram(random_df_V.iloc[0], bins=Nbins, range=(min_value,max_value)))._hbins).tolist() # Calculate z-score (Lkc) for the best clustering if self.verbose >= 2: print('Processing voting results') sigma = pd.Series(data=np.nanstd(random_df_V.values, axis=1), index=random_df_V.index).replace(0., np.inf).unstack() mean = pd.Series(data=np.nanmean(random_df_V.values, axis=1), index=random_df_V.index).unstack() df_L = (df_V - mean) / sigma df_L[df_L < self.thresholdForUnknown_pDCS] = 0. df_L[df_V < 0.] = 0. return (df_marker_cell_type, df_markers_expr, df_L, df_V, dict_expressed_markers, df_null_distributions)
[docs] def annotateWith_ratio_Scheme(self, df_markers_expr, df_marker_cell_type): '''Produce cluster annotation results Parameters: df_markers_expr: pandas.DataFrame Data with marker genes by cells expression df_marker_cell_type: pandas.DataFrame Data with marker genes by cell types Returns: tuple Usage: Function should be called internally only ''' df_positive = df_marker_cell_type.copy() df_negative = df_marker_cell_type.copy() df_positive[df_marker_cell_type < .0] = 0. df_negative[df_marker_cell_type > .0] = 0. df_positive /= (df_positive > 0.).sum(axis=0).fillna(1.).replace(0., 1.).replace(0, 1.) df_negative /= (df_negative < 0.).sum(axis=0).fillna(1.).replace(0., 1.).replace(0, 1.) df_marker_cell_type = df_positive + df_negative if self.verbose >= 2: print(df_marker_cell_type.shape) # Align df_markers_expr and df_marker_cell_type df_markers_expr.sort_index(inplace=True, axis=0) df_marker_cell_type.sort_index(inplace=True, axis=1) # Calculate score (Vkc) for the best clustering temp = self.calculateV((df_marker_cell_type, df_markers_expr, df_markers_expr.columns.get_level_values('cluster').values, self.zScoreCutoff, True, self.removeLowQualityScores)) df_V = temp[0].unstack() df_V.index.name = None df_Z_best = temp[1] dict_expressed_markers = {cluster:df_Z_best.index[df_Z_best[cluster] > 0].values for cluster in df_Z_best} df_marker_cell_type_full = self.readMarkerFile().T.loc[df_marker_cell_type.index] df_marker_cell_type_full[df_marker_cell_type_full < .0] = 0. df_marker_cell_type_full /= (df_marker_cell_type_full > 0.).sum(axis=0).fillna(1.).replace(0., 1.) df_L = df_V / df_marker_cell_type_full.sum(axis=1).values[:, None] df_L[df_L < self.thresholdForUnknown_ratio] = 0. df_L[df_V < 0.] = 0. return (df_marker_cell_type, df_markers_expr, df_L, df_V, dict_expressed_markers, None)
[docs] def annotateWith_Hopfield_Scheme(self, df_markers_expr, df_marker_cell_type): '''Produce cluster annotation results Parameters: df_markers_expr: pandas.DataFrame Markers expression DataFrame df_marker_cell_type: pandas.DataFrame Marker cell type DataFrame Returns: tuple Usage: Function should be called internally only ''' df_marker_cell_type = df_marker_cell_type.T np.random.seed(0) # The scheme does not account for negative markers. # Therefore remove them df_marker_cell_type[df_marker_cell_type > 0.] = 1. df_marker_cell_type[df_marker_cell_type < 0.] = 0. # Keep markers that have +1 or -1 for at least one cell type df_marker_cell_type = df_marker_cell_type.loc[np.abs(df_marker_cell_type).sum(axis=1) > 0] # Check and keep only cell types with at least a minimum number of +1 markers df_marker_cell_type = df_marker_cell_type[df_marker_cell_type.columns[np.abs(df_marker_cell_type).sum(axis=0) > self.minimumNumberOfMarkersPerCelltype]] if self.verbose >= 2: print(df_marker_cell_type.shape) # Remove unused markers df_marker_cell_type = df_marker_cell_type.loc[np.abs(df_marker_cell_type).sum(axis=1) > 0] # Normalize marker/celltype matrix in both directions by counts. # Note that order of the following two lines is irrelevant df_marker_cell_type[df_marker_cell_type > 0.] /= (df_marker_cell_type[df_marker_cell_type > 0.] > 0).sum(axis=1).replace(0, 1.).values[:, None] df_marker_cell_type[df_marker_cell_type > 0.] /= (df_marker_cell_type[df_marker_cell_type > 0.] > 0).sum(axis=0).replace(0, 1.).values df_marker_cell_type[df_marker_cell_type < 0.] /= (df_marker_cell_type[df_marker_cell_type < 0.] < 0).sum(axis=1).replace(0, 1.).values[:, None] df_marker_cell_type[df_marker_cell_type < 0.] /= (df_marker_cell_type[df_marker_cell_type < 0.] < 0).sum(axis=0).replace(0, 1.).values if self.verbose >= 2: print(df_marker_cell_type.shape) # Order gene expression index in the same order as markers index df_markers_expr = df_markers_expr.loc[df_marker_cell_type.index] df_expr_cluster_centroids = df_markers_expr.groupby(level='cluster', axis=1, sort=False).mean() df_Z = df_expr_cluster_centroids.copy().apply(self.zScoreOfSeries, axis=1) df_sig = df_Z > self.zScoreCutoff dict_expressed_markers = {cluster:df_sig.index[df_sig[cluster] > 0].values for cluster in df_sig} if self.useUnderlyingNetwork: try: underlyingNetwork = self.getSubnetworkOfPCN(df_marker_cell_type.index.values, min_shared_first_targets=1) if self.verbose >= 3: print('Recording underlying network', flush=True) underlyingNetwork.to_hdf(self.fileHDFpath, key='df_underlyingNetwork', mode='a', complevel=4, complib='zlib') underlyingNetwork = underlyingNetwork.values except Exception as exception: if self.verbose >= 1: print(exception) print('Not using underlying network connection') underlyingNetwork = None else: underlyingNetwork = None if self.verbose >= 2: print('Generating Hopfield networks:', flush=True) listOfSeries = [] for i in range(self.nSamples_Hopfield): if i % int(self.nSamples_Hopfield / 10) == 0: if self.verbose >= 2: print('\n%s%%' % (int(100 * i / self.nSamples_Hopfield)), end='\t', flush=True) # Run Hopfield network dynamics hop = self.propagateHopfield(sigma=df_sig.values * 2. - 1., xi=df_marker_cell_type.values, typesNames=df_marker_cell_type.columns.values, underlyingNetwork=underlyingNetwork, clustersNames=df_sig.columns.values, id = i, T = self.HopfieldTemperature, path = os.path.join(self.saveDir, 'HopfieldTrajectories'), recordTrajectories = False, verbose=self.verbose) # Determine top scores res = np.argmax(hop, axis=0) # Best votes that are below threshold are set to Unknown res[np.where(np.max(hop, axis=0) < 10. ** -12)] = len(df_marker_cell_type.columns) listOfSeries.append(res) if self.verbose >= 2: print(flush=True) r = np.vstack(listOfSeries) df_L = pd.DataFrame(index=list(range(len(df_marker_cell_type.columns) + 1)), columns=df_sig.columns).fillna(0.) for i in range(len(df_sig.columns)): w = np.unique(r[:,i], return_counts=True) for j in range(len(w[0])): df_L.iloc[w[0][j], i] = w[1][j] / len(r) df_L[df_L < self.thresholdForUnknown_Hopfield] = 0. df_L.iloc[-1, :] = 0.9 * self.thresholdForUnknown_Hopfield # Best results are chosen as highest frequency cell type for each cluster se_celltypes = pd.Series(data = np.argmax(df_L.values, axis=0).astype(int).astype(str), index = df_sig.columns) # Define cell names dictionary celltypeNames = dict(zip(np.array(range(len(df_marker_cell_type.columns))).astype(int).astype(str), df_marker_cell_type.columns)) celltypeNames.update({str(len(celltypeNames)): self.nameForUnknown}) # Convert cell names from index to words se_celltypes = se_celltypes.replace(to_replace=celltypeNames) df_L.index = pd.Series(df_L.index.astype(str)).replace(to_replace=celltypeNames).values df_L.drop(self.nameForUnknown, axis=0, inplace=True) return (df_marker_cell_type.T, df_markers_expr, df_L, df_L.copy(), dict_expressed_markers)
[docs] def recordAnnotationResults(self, df_marker_cell_type, df_markers_expr, df_L, df_V, dict_expressed_markers, df_null_distributions = None): '''Record cell type annotation results to spreadsheets. Parameters: df_marker_cell_type: pandas.DataFrame Markers to cell types table df_markers_expr: pandas.DataFrame Markers expression in each cluster df_L: pandas.DataFrame Annotation significance scores along with other information df_V: pandas.DataFrame Raw annotation (voting) scores along with other information dict_expressed_markers: dictionary Dictionary of markers significantly expressed in each cluster df_null_distributions: pandas.DataFrame, Default None Table with null distributions Returns: dictionary Predicted cell type per cluster Usage: This function is intended to be used internally only ''' df_L = df_L.fillna(0.) # Determine cluster sizes se_cluster_sizes = df_markers_expr.groupby(level='cluster', sort=True, axis=1).count().iloc[0] cluster_sizes = se_cluster_sizes.values df_L = df_L[se_cluster_sizes.index] df_V = df_V[se_cluster_sizes.index] # Rename winning cell types T = df_L.index[np.argmax(df_L.values, axis=0)].values T[(df_L.values <= 0.).all(axis=0)] = self.nameForUnknown T = pd.Index(T) + ' #0' for i in range(len(T)): T = T.where(~T.duplicated(), T.str.replace(' #%s' % (i), ' #%s' % (i + 1))) predicted_celltype = T.values # Save score columns names columns_scores = df_V.index.values.copy().tolist() # Determine winning score winning_score = np.array([df_V.iloc[ind[0], ind[1]] for ind in np.flip(np.array(list(enumerate(np.argmax(df_L.values, axis=0)))), axis=1)]) # Update the DataFrames df_V.loc['Winning score'] = df_L.loc['Winning score'] = winning_score df_V.loc['Predicted cell type'] = df_L.loc['Predicted cell type'] = predicted_celltype df_V.loc['# cells in cluster'] = df_L.loc['# cells in cluster'] = cluster_sizes # Determine all markers hits for each cluster df_cluster_centroids = df_markers_expr.groupby(level='cluster', sort=True, axis=1).mean() all_markers_in_cluster = [(' // ').join(dict_expressed_markers[cluster].tolist()) for cluster in df_V.columns] df_V.loc['All markers'] = df_L.loc['All markers'] = all_markers_in_cluster all_supp_markers = {celltype:df_marker_cell_type.T.index[df_marker_cell_type.T[celltype] > 0].values for celltype in columns_scores} all_cont_markers = {celltype:df_marker_cell_type.T.index[df_marker_cell_type.T[celltype] < 0].values for celltype in columns_scores} all_supp_markers[self.nameForUnknown] = np.array([]) all_cont_markers[self.nameForUnknown] = np.array([]) supporting_markers = [(' // ').join((np.intersect1d(all_supp_markers[df_V.loc['Predicted cell type'].loc[cluster].split(' #')[0]], df_V.loc['All markers'].loc[cluster].split(' // '))).tolist()) for cluster in df_V.columns] contradicting_markers = [(' // ').join((np.intersect1d(all_cont_markers[df_V.loc['Predicted cell type'].loc[cluster].split(' #')[0]], df_V.loc['All markers'].loc[cluster].split(' // '))).tolist()) for cluster in df_V.columns] df_V.loc['Supporting markers'] = df_L.loc['Supporting markers'] = supporting_markers df_V.loc['Contradicting markers'] = df_L.loc['Contradicting markers'] = contradicting_markers # Record the results to spreadsheets fileName = os.path.join(self.saveDir, self.dataName + '_annotation.xlsx') if self.verbose >= 2: print('Recording voting results to:', fileName) writer = pd.ExcelWriter(fileName) df_L.columns.name = 'cluster' df_L.T.to_excel(writer, 'z-scores', columns=['Predicted cell type', '# cells in cluster', 'Winning score', 'Supporting markers', 'Contradicting markers', 'All markers'] + columns_scores) df_V.columns.name = 'cluster' df_V.T.to_excel(writer, 'Voting scores', columns=['Predicted cell type', '# cells in cluster', 'Winning score', 'Supporting markers', 'Contradicting markers', 'All markers'] + columns_scores) df_cluster_centroids.T.to_excel(writer, 'Cluster centroids') if not df_null_distributions is None: df_null_distributions.to_excel(writer, 'Null distributions') df_marker_cell_type.to_excel(writer, 'Marker cell type weight matrix') writer.save() return df_V.loc['Predicted cell type'].to_dict()
[docs] @classmethod def propagateHopfield(cls, sigma = None, xi = None, T = 0.2, tmax = 200, fractionToUpdate = 0.5, mode = 4, meshSamplingRate = 200, underlyingNetwork = None, typesNames = None, clustersNames = None, printInfo = False, recordTrajectories = True, id = None, printSwitchingFraction = False, path = None, verbose = 0): '''Function is used internally to propagate Hopfield network over a set number of time steps Parameters: sigma: pandas.DataFrame, Default None Markers expression xi: pandas.DataFrame, Default None Marker cell type DataFrame T: float, Default 0.2 Noise (Temperature) parameter tmax: int, Default 200 Number of steps to iterate through fractionToUpdate: float, Default 0.5 Fraction of nodes to randomly update at each iteration mode: int, Default 4 Options are: 1: non-orthogonalized, non-weighted attractors 2: orthogonalized, non-weighted attractors 3: orthogonalized, weighted attractors 4: orthogonalized, weighted attractors, asymmetric and diluted dynamics meshSamplingRate: int, Default 200 Visualization parameter to control the quality of the color mesh near the attractors underlyingNetwork: 2d numpy.array, Default None Network of underlying connections between genes typesNames: list-like, Default None Names of cell types clustersNames: list-like, Default None Names or identifiers of the clusters printInfo: boolean, Default False Whether to print details recordTrajectories: boolean, Default True Whether to record trajectories data to files id: int, Default None Identifier of this function call printSwitchingFraction: boolean, Default False Whether to print fraction of clusters that switch their maximum overlapping attractor path: str, Default None Path for saving trajectories data Returns: 2d numpy.array Overlaps Usage: result = propagateHopfield(sigma=sigma, xi=df_attrs) ''' if xi is None: if verbose >= 3: print('xi is None') return if not path is None: if not os.path.exists(path): os.makedirs(path) if mode==1 or mode==2: xi[xi > 0.] = 1. xi[xi <= 0.] = -1. if printInfo: if verbose >= 3: print('Unique nodes across Hopfield network:', len(np.unique(xi, axis=0))) if mode == 1: A = xi.T J = (xi).dot(A) / (xi.shape[0]) # m,m else: Q = (xi.T).dot(xi) # n,n try: Q_inv = np.linalg.inv(Q) # n,n except Exception as exception: if verbose >= 3: print(exception) print('Cannot invert matrix Q') return A = (Q_inv).dot(xi.T) # n,m <-- xi_inv J = (xi).dot(A) # m,m if not underlyingNetwork is None: if J.shape == underlyingNetwork.shape: if verbose >= 3: print('Using network of underlying interactions') J *= underlyingNetwork else: if verbose >= 3: print('Network of underlying interactions shape is inconsistent with J. Ignoring it') getHopfieldEnergy = lambda s: -0.5 * s[None, :].dot(J).dot(s) if recordTrajectories: pca = PCA(n_components=None).fit(A) attrs = pca.transform(A) variance = pca.explained_variance_ratio_ if id == 0 or id is None: if xi.shape[1] == 2: dim = 2 lim = 15 g = np.linspace(-lim, lim, num=100) mesh_coords = np.meshgrid(*tuple([g]*dim)) mesh_coords = np.vstack([g.flatten() for g in mesh_coords]).T mesh = np.zeros((mesh_coords.shape[0], xi.shape[1])) mesh[:, :len(mesh_coords.T)] = mesh_coords mesh_sigma = pca.inverse_transform(mesh) if False: mesh_sigma[mesh_sigma > 0.] = 1. mesh_sigma[mesh_sigma <= 0.] = -1. digital_mesh = pca.transform(mesh_sigma) mesh_coords = digital_mesh mesh_energy = np.array([getHopfieldEnergy(s) for s in mesh_sigma]) write(np.hstack([mesh_coords[:,:2], mesh_energy]), os.path.join(path, 'mesh')) else: if verbose >= 4: print('Generating samples in the vicinity of attractors') temp = [] for i in range(xi.shape[1]): for j in range(25): st = A[i] temp.append(np.array([st * ((np.random.rand(st.shape[0]) > 0.025*j) * 2. - 1.) for i in range(meshSamplingRate)])) mesh_sigma = np.vstack(temp) if verbose >= 4: print('Calculating energy of the samples') if False: mesh_energy = np.array([getHopfieldEnergy(s) for s in mesh_sigma]) else: if verbose >= 4: print('Calculating chunks') def func(mesh_sigma): return -0.5 * ((J).dot(mesh_sigma.T) * mesh_sigma.T).sum(axis=0)[:,None] mesh_energy = np.vstack([func(item) for item in np.split(mesh_sigma, meshSamplingRate)]) if verbose >= 4: print('\nProjecting samples on PCs') mesh_coords = pca.transform(mesh_sigma) if verbose >= 4: print('Recording mesh data') write(np.hstack([mesh_coords, mesh_energy]), os.path.join(path, 'mesh')) if recordTrajectories: write((np.vstack([attrs, variance]), typesNames), os.path.join(path, 'attrs')) if sigma is None: if verbose >= 4: print('Sigma is None') return if recordTrajectories: trajectories = np.zeros((tmax, sigma.shape[1], xi.shape[1])) initial = np.argmax((A).dot(sigma), axis=0) initial[np.where(np.max((A).dot(sigma), axis=0) < 10. ** -12)] = -1 if printInfo: sIn = sigma.copy() for t in range(tmax): h = (J).dot(sigma) if recordTrajectories: if id == 0 or id is None: trajectories[t,:,:] = np.hstack([pca.transform(sigma.T)]) sigmaNew = ((1. / (1. + np.exp(-2. * h / T))) > np.random.rand(*sigma.shape)) * 2. - 1. # Make dynamics asymmetric and diluted: Deactivate nodes that are being turned off if mode == 4: sigmaNew[(sigmaNew - sigma) < -1.] = 0. f = 1. if t >= 100 else max(0.05, float(t) / 100.) whereToUpdate = (np.random.rand(*sigma.shape) <= f * fractionToUpdate) sigma[whereToUpdate] = sigmaNew[whereToUpdate] * np.abs(sigma[whereToUpdate]) if printInfo: print('In positive:', ((sIn>0)*1.).sum(axis=0)) print('Out positive:', ((sigma>0)*1.).sum(axis=0)) print('Positive common in-out:', (((sIn>0.)*(sigma>0.))*1.).sum(axis=0)) final = np.argmax((A).dot(sigma), axis=0) final[np.where(np.max((A).dot(sigma), axis=0) < 10. ** -12)] = -1 if printSwitchingFraction: print(np.round(100. * (final != initial).sum() / (len(initial)), 2).astype(int), end=' ', flush=True) if recordTrajectories: if id == 0 or id is None: write(trajectories, os.path.join(path, 'trajectories%s')%(id)) write((initial, final, typesNames, clustersNames), os.path.join(path, 'additional')) overlap = (A).dot(sigma) overlap[overlap <= 0.] = 0. return overlap
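The stochastic update at the heart of the loop above can be sketched in isolation as follows (a toy coupling matrix J, not one derived from actual marker data):

    import numpy as np

    np.random.seed(0)
    T = 0.2                                            # temperature (noise) parameter
    sigma = np.random.choice([-1., 1.], size=(5, 3))   # 5 genes by 3 clusters
    J = np.random.rand(5, 5) / 5.                      # toy coupling matrix
    h = J.dot(sigma)                                   # local fields
    p = 1. / (1. + np.exp(-2. * h / T))                # probability of a node being set to +1
    sigmaNew = (p > np.random.rand(*sigma.shape)) * 2. - 1.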
# Other functions of class #########################################################
[docs] @classmethod def convertColormap(cls, colormap): '''Convert colormap from the form (1.,1.,1.,1.) to 'rgba(255,255,255,1.)' Parameters: colormap: dictionary Colormap to convert Returns: dictionary Converted colormap Usage: DCS = DigitalCellSorter.DigitalCellSorter() colormap = DCS.convertColormap(colormap) ''' return {k:'rgba' + str(tuple(list((np.array(v[:3]) * 255).astype(int)) + [v[3]])) for k, v in colormap.items()}
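For example:

    import DigitalCellSorter

    colormap = {'T cell': (1., 0., 0., 1.), 'B cell': (0., 0.5, 1., 0.8)}
    print(DigitalCellSorter.DigitalCellSorter.convertColormap(colormap))
    # {'T cell': 'rgba(255, 0, 0, 1.0)', 'B cell': 'rgba(0, 127, 255, 0.8)'}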
[docs] @classmethod def zScoreOfSeries(cls, se): '''Calculate z-score of pandas.Series and modify the Series in place Parameters: se: pandas.Series Series to process Returns: pandas.Series Processed series Usage: DCS = DigitalCellSorter.DigitalCellSorter() se = DCS.zScoreOfSeries(se) ''' se.iloc[:] = scipy.stats.zscore(se.values) return se
[docs] @classmethod def KeyInFile(cls, key, file): '''Check if a key exists in an HDF file. Parameters: key: str Key name to check file: str HDF file name to check Returns: boolean True if the key is found False otherwise Usage: DCS = DigitalCellSorter.DigitalCellSorter() DCS.KeyInFile('df_expr', 'data/file.h5') ''' if not os.path.isfile(file): return False with pd.HDFStore(file) as hdf: return True if "/" + key.strip("/") in hdf.keys() else False
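For instance, guarding a read against a missing key (the file path here is hypothetical):

    import pandas as pd
    import DigitalCellSorter

    fileHDFpath = 'output/myData_processed.h5'   # hypothetical path
    if DigitalCellSorter.DigitalCellSorter.KeyInFile('df_markers_expr', fileHDFpath):
        df = pd.read_hdf(fileHDFpath, key='df_markers_expr')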
[docs] def getSubnetworkOfPCN(self, subnetworkGenes, min_shared_first_targets = 30): '''Extract a subnetwork of the PCN network Parameters: subnetworkGenes: list-like Set of genes that the subnetwork should contain min_shared_first_targets: int, Default 30 Minimum number of shared first targets required to connect two nodes Returns: pandas.DataFrame Adjacency matrix Usage: DCS = DigitalCellSorter.DigitalCellSorter() df_subnetwork = DCS.getSubnetworkOfPCN(genes) ''' if self.verbose >= 2: print('\nReading PCN (directed, unweighted) network from file') PCN = pd.read_csv(os.path.join('data', 'PCN_%s.txt.gz' % (self.species)), compression='gzip', header=0, index_col=[0], delimiter='\t')['Target'] PCN.index = self.gnc.Convert(PCN.index.tolist(), 'alias', 'hugo', returnUnknownString=False) PCN[:] = self.gnc.Convert(PCN.values.tolist(), 'alias', 'hugo', returnUnknownString=False) if self.verbose >= 2: print('Calculating targets of PCN network genes') targets = PCN.groupby(level=0).agg('unique').apply(set).to_dict() del PCN index = pd.Index(set(subnetworkGenes).intersection(set(targets.keys()))).sort_values() if self.verbose >= 2: print('Number of PCN genes: %s'%(len(index))) print('Calculating number of common targets for %s genes'%(len(index))) data = np.zeros((len(index), len(index))).astype(int) for iA in range(len(index)): geneA = index[iA] for iB in range(iA, len(index)): geneB = index[iB] direct = 1 if (geneB in targets[geneA]) else 0 data[iA,iB] = data[iB,iA] = len(targets[geneA].intersection(targets[geneB])) + direct targetsOverlaps = pd.DataFrame(index=index, columns=index, data=data) subnetwork = 1.*(targetsOverlaps >= min_shared_first_targets) if self.verbose >= 2: print(subnetwork.shape) subnetwork = subnetwork.reindex(subnetworkGenes, axis=0).reindex(subnetworkGenes, axis=1).fillna(0.) if self.verbose >= 2: print(subnetwork.shape) con = subnetwork.sum().sum() tot = (len(subnetwork)-1)*len(subnetwork) if self.verbose >= 2: print('\nConnections in subnetwork: %s, out of %s possible (%s%%)\n'%(con, tot, np.round(100.*con/tot, 0))) return subnetwork
[docs] def alignSeries(self, se1, se2, tagForMissing): '''Align two pandas.Series Parameters: se1: pandas.Series Series with the first set of items se2: pandas.Series Series with the second set of items tagForMissing: str Label to assign to non-overlapping items Returns: pandas.DataFrame Contains two aligned pandas.Series Usage: DCS = DigitalCellSorter.DigitalCellSorter() df = DCS.alignSeries(pd.Index(['A', 'B', 'C', 'D']).to_series(), pd.Index(['B', 'C', 'D', 'E', 'F']).to_series(), 'Missing') ''' se1.index.name = 'index' se2.index.name = 'index' append = lambda se1, se2: pd.concat([se1, pd.Series(index=se2.index.difference(se1.index), data=[tagForMissing] * len(se2.index.difference(se1.index)))], axis=0, sort=False) se1 = append(se1, se2) se2 = append(se2, se1) se1.name = 'se1' se2.name = 'se2' return pd.concat((se1, se2.loc[se1.index]), axis=1, sort=True)
[docs] def createReverseDictionary(self, inputDictionary): '''Efficient way to create a reverse dictionary from a dictionary. Utilizes pandas.DataFrame.groupby and NumPy array indexing. Parameters: inputDictionary: dictionary Dictionary to reverse Returns: dictionary Reversed dictionary Usage: DCS = DigitalCellSorter.DigitalCellSorter() revDict = DCS.createReverseDictionary(Dict) ''' keys, values = np.array(list(inputDictionary.keys())), np.array(list(inputDictionary.values())) df = pd.DataFrame(np.array([[keys[i], value] for i in range(len(keys)) for value in values[i]])) dfGrouped = df.groupby(df.columns[1]) keys, values = list(dfGrouped.indices.keys()), list(dfGrouped.indices.values()) GOs = df.values.T[0] return dict(zip(keys, [GOs[value].tolist() for value in values]))
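For example, reversing a mapping of gene sets to genes:

    import DigitalCellSorter

    DCS = DigitalCellSorter.DigitalCellSorter()
    revDict = DCS.createReverseDictionary({'GO:0001': ['TP53', 'EGFR'], 'GO:0002': ['EGFR', 'MYC']})
    # {'EGFR': ['GO:0001', 'GO:0002'], 'MYC': ['GO:0002'], 'TP53': ['GO:0001']}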
[docs] def readMarkerFile(self, mergeFunction = 'mean', mergeCutoff = 0.25): '''Read markers file, prepare markers Parameters: mergeFunction: str, Default 'mean' Function used for grouping of the cell sub-types. Options are: 'mean': average of the values 'max': maximum of the values, effectively a logical OR function mergeCutoff: float, Default 0.25 Values below cutoff are set to zero. This option is used if mergeFunction is 'mean' Returns: pandas.DataFrame Celltype/markers matrix Usage: DCS = DigitalCellSorter.DigitalCellSorter() df_marker_cell_type = DCS.readMarkerFile() ''' df_marker_cell_type = pd.read_excel(self.geneListFileName, index_col=0, header=[0,1]).replace(np.nan, 0.).replace(0, 0.) df_marker_cell_type.columns.names = ['CellTypeGrouped', 'CellType'] df_marker_cell_type.index = self.gnc.Convert(list(df_marker_cell_type.index), 'alias', 'hugo', returnUnknownString=False) df_marker_cell_type = df_marker_cell_type.drop(columns=[col for col in df_marker_cell_type.columns if col[0] == 'NA']) df_marker_cell_type = df_marker_cell_type.groupby(level='CellTypeGrouped', axis=1).agg(mergeFunction).fillna(0.) where_positive = df_marker_cell_type >= mergeCutoff where_negative = df_marker_cell_type <= -mergeCutoff df_marker_cell_type *= 0. df_marker_cell_type[where_positive] = 1. df_marker_cell_type[where_negative] = -1. df_marker_cell_type = df_marker_cell_type.loc[np.abs(df_marker_cell_type).sum(axis=1) > 0] if self.verbose >= 2: print('Markers/celltypes:', df_marker_cell_type.shape, flush=True) # Merge duplicates that might have appeared after gene name conversion df_marker_cell_type = df_marker_cell_type.groupby(level=0, axis=0).sum() df_marker_cell_type[df_marker_cell_type > 0.] = 1. df_marker_cell_type[df_marker_cell_type < 0.] = -1. def norm(s_input): s = s_input.copy() pos_sum = np.abs(s.iloc[np.where(s > 0.)].sum()) neg_sum = np.abs(s.iloc[np.where(s < 0.)].sum()) s.iloc[np.where(s > 0.)] /= pos_sum if pos_sum > 0. else 1. s.iloc[np.where(s < 0.)] /= neg_sum if neg_sum > 0. else 1. return s df_marker_cell_type = df_marker_cell_type.apply(norm, axis=0) if self.verbose >= 1: print('Markers/celltypes:', df_marker_cell_type.shape, flush=True) df_marker_cell_type.columns = df_marker_cell_type.columns.str.strip() if not self.useNegativeMarkers: df_marker_cell_type[df_marker_cell_type < 0.] = 0. df_marker_cell_type = df_marker_cell_type.loc[np.abs(df_marker_cell_type).sum(axis=1) > 0] if self.verbose >= 2: print('Removed negative markers. Markers/celltypes:', df_marker_cell_type.shape, flush=True) return df_marker_cell_type
[docs] @classmethod def mergeIndexDuplicates(cls, df_expr, method = 'average', printDuplicates = False, verbose = 1): '''Merge index duplicates Parameters: df_expr: pandas.DataFrame Gene expression table method: str, Default 'average' How to deal with index duplicates. Options are: 'average': average values of duplicates 'first': keep only first of duplicates, discard rest printDuplicates: boolean, Default False Whether to print the duplicated items verbose: int, Default 1 Verbosity level Returns: pandas.DataFrame Gene expression table Usage: DCS = DigitalCellSorter.DigitalCellSorter() df_expr = DCS.mergeIndexDuplicates(df_expr) ''' # Check if any names in the index are duplicated, remove duplicates len_total, len_unique = len(df_expr.index), len(np.unique(df_expr.index)) if len_total != len_unique: unique_items = np.unique(df_expr.index, return_counts=True) if method == 'average': df_expr = df_expr.groupby(level=0, axis=0).mean() elif method == 'first': df_expr = df_expr.loc[~df_expr.index.duplicated(keep='first')] else: if verbose >= 1: print('Unknown method') return df_expr if verbose >= 1: print('Merged %s duplicated items in the index of size %s' % (len_total - len_unique, len_total), flush=True) if printDuplicates: print(unique_items[0][unique_items[1] > 1], flush=True) return df_expr
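For example, on a table with a duplicated gene name:

    import pandas as pd
    import DigitalCellSorter

    DCS = DigitalCellSorter.DigitalCellSorter()
    df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['TP53', 'TP53', 'MYC'])
    df = DCS.mergeIndexDuplicates(df, method='average')   # the two 'TP53' rows are merged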
[docs] def recordExpressionData(self): '''Record expression data to the internal HDF storage. Parameters: None Returns: None Usage: DCS = DigitalCellSorter.DigitalCellSorter() DCS.recordExpressionData() ''' if self.verbose >= 1: print('Recording compressed DataFrame', flush=True) self._df_expr.replace(0, np.nan).T.stack().to_frame().to_hdf(self.fileHDFpath, key='df_expr', mode='a', complevel=4, complib='zlib') return
[docs] def loadAnnotatedLabels(self, detailed = False, includeLowQC = True, infoType = 'label'): '''Load cell annotations resulting from function 'annotate' Parameters: detailed: boolean, Default False Whether to give cluster- or celltype- resolution data includeLowQC: boolean, Default True Whether to include low quality cells in the output infoType: str, Default 'label' What to return: 'label' for cell type labels, 'batch' for batch identifiers Returns: pandas.Series Usage: DCS = DigitalCellSorter.DigitalCellSorter() DCS.loadAnnotatedLabels() ''' if self.KeyInFile('df_markers_expr', self.fileHDFpath): try: se = pd.read_hdf(self.fileHDFpath, key='df_markers_expr').T.reset_index().set_index(['batch', 'cell'])['label'] except Exception as exception: if self.verbose >= 1: print(exception) print('Error reading annotation results', flush=True) return if not detailed: se = se.str.split(' #', expand=True)[0] else: if self.verbose >= 1: print('Annotation results not found', flush=True) return if includeLowQC: if self.KeyInFile('df_projection_pre_QC', self.fileHDFpath): allCells = pd.read_hdf(self.fileHDFpath, key='df_projection_pre_QC').columns allBatches = allCells.get_level_values('batch') elif self.KeyInFile('df_projection', self.fileHDFpath): allCells = pd.read_hdf(self.fileHDFpath, key='df_projection').columns allBatches = allCells.get_level_values('batch') else: if self.verbose >= 1: print('Labelled data not found', flush=True) return if infoType=='label': se = se.reindex(allCells, fill_value=self.nameForLowQC) elif infoType=='batch': se_low = pd.Series(data=allBatches, index=allCells) se.iloc[:] = se.index.get_level_values('batch').values se = pd.concat([se, se_low.loc[se_low.index.difference(se.index)]]) return se
[docs] def loadExpressionData(self): '''Load processed expression data from the internal HDF storage. Parameters: None Returns: None Usage: DCS = DigitalCellSorter.DigitalCellSorter() DCS.loadExpressionData() ''' if self.KeyInFile('df_expr', self.fileHDFpath): if self.verbose >= 2: print('Loading processed data', flush=True) self._df_expr = pd.read_hdf(self.fileHDFpath, key='df_expr').unstack(level=-1, fill_value=0.).T self._df_expr.index = self._df_expr.index.get_level_values(-1) self._df_expr.sort_index(inplace=True) else: if self.verbose >= 1: print('Processed data not found') return
[docs] def prepareMarkers(self, expressedGenes = None, createColormapForCelltypes = True): '''Get dictionary of markers for each cell type. Parameters: expressedGenes: pandas.Index, Default None If not None then the marker DataFrame will be intersected with this index, i.e. all non-expressed genes will be filtered from the marker file createColormapForCelltypes: boolean, Default True Create (or update) a colormap for cell types based on a marker-celltype matrix. This will make coloring of cell clusters consistent across all plots. Returns: None Usage: DCS = DigitalCellSorter.DigitalCellSorter() DCS.prepareMarkers() ''' df_marker_cell_type = self.readMarkerFile() # Remove non-expressed markers if not expressedGenes is None: df_marker_cell_type = df_marker_cell_type.loc[df_marker_cell_type.index.intersection(expressedGenes)] if self.verbose >= 2: print('Removed non-expressed genes. Markers/celltypes shape:', df_marker_cell_type.shape, flush=True) # Remove underrepresented cell types df_marker_cell_type = df_marker_cell_type[df_marker_cell_type.columns[(df_marker_cell_type > 0.).sum(axis=0) >= self.minimumNumberOfMarkersPerCelltype]] df_marker_cell_type = df_marker_cell_type[(df_marker_cell_type != 0).sum(axis=1) >= 1] df_marker_cell_type.index.name = 'Marker' df_marker_cell_type.to_hdf(self.fileHDFpath, key='df_marker_cell_type', mode='a', complevel=4, complib='zlib') # Create (or update) a colormap for cell types if createColormapForCelltypes: with open(os.path.join(self.saveDir, 'ColormapForCellTypes.txt'), 'w') as tempMapFile: df_marker_cell_type = pd.read_hdf(self.fileHDFpath, key='df_marker_cell_type') cellTypes = np.sort(df_marker_cell_type.columns.values.tolist()) for cellTypeIndex in range(len(cellTypes)): tempMapFile.write(cellTypes[cellTypeIndex] + '\t' + str(cm.jet(cellTypeIndex / len(cellTypes))) + '\n') tempMapFile.write(self.nameForUnknown + '\t' + str(cm.jet(1.0)) + '\n') return
[docs] def calculateQCmeasures(self): '''Calculate Quality Control (QC) measures Parameters: None Returns: None Usage: DCS = DigitalCellSorter.DigitalCellSorter() DCS.calculateQCmeasures() ''' if self.mitochondrialGenes is None: if self.species == 'Human': mitoGenesPath = os.path.join(self.defaultGeneListsDir, 'Human.MitoCarta2.0.csv') elif self.species == 'Mouse': mitoGenesPath = os.path.join(self.defaultGeneListsDir, 'Mouse.MitoCarta2.0.csv') else: if self.verbose >= 1: print('Mitochondrial genes for species %s not found' % (self.species)) print('Implemented lists are for Human and Mouse') raise NotImplementedError self.mitochondrialGenes = pd.read_csv(mitoGenesPath, index_col=None, header=0)['Symbol'].values.squeeze().tolist() if self.verbose >= 2: print('Calculating quality control measures (count depth, number of genes, fraction of mitochondrial genes) for each cell', flush=True) df_QC = pd.concat([(self._df_expr).sum(axis=0), (self._df_expr > 0).sum(axis=0), (self._df_expr.loc[self._df_expr.index.intersection(self.mitochondrialGenes)] > 0).sum(axis=0) / \ (self._df_expr > 0).sum(axis=0)], axis=1, sort=False) df_QC.columns = ['count_depth', 'number_of_genes', 'fraction_of_mitochondrialGenes'] df_QC.to_hdf(self.fileHDFpath, key='QC', mode='a', complevel=4, complib='zlib') return
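The three per-cell measures computed above can be re-derived on a toy table as follows (the two-gene mitochondrial list here is hypothetical):

    import pandas as pd

    df = pd.DataFrame({'cellA': [5, 0, 2], 'cellB': [0, 1, 3]}, index=['MT-CO1', 'ACTB', 'MT-ND1'])
    mitochondrialGenes = ['MT-CO1', 'MT-ND1']
    count_depth = df.sum(axis=0)
    number_of_genes = (df > 0).sum(axis=0)
    # Note: as in the code above, this is the fraction of expressed genes that are mitochondrial
    fraction_of_mitochondrialGenes = (df.loc[df.index.intersection(mitochondrialGenes)] > 0).sum(axis=0) / number_of_genes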
[docs] def qualityControl(self, **kwargs): '''Remove low quality cells Parameters: Any parameters that function 'getIndexOfGoodQualityCells' can accept Returns: None Usage: DCS = DigitalCellSorter.DigitalCellSorter() DCS.qualityControl() ''' index = self.getIndexOfGoodQualityCells(**kwargs) if self.toggleRemoveLowQualityCells: self._df_expr = self._df_expr[self._df_expr.columns.intersection(index).sort_values()] self._df_expr = self._df_expr[self._df_expr.sum(axis=1) > 0] if self.verbose >= 1: print('Removed low quality cells. Data size: %s genes, %s cells' % self._df_expr.shape, flush=True) df_projection = pd.read_hdf(self.fileHDFpath, key='df_projection_pre_QC') df_projection[self._df_expr.columns].to_hdf(self.fileHDFpath, key='df_projection', mode='a', complevel=4, complib='zlib') self.project(PCAonly=True) return
[docs] def batchEffectCorrection(self, method = 'COMBAT'): '''Batch effect correction. Parameters: method: str, Default 'COMBAT' Batch effect correction method; currently only ComBat is implemented. See Stein, C.K., Qu, P., Epstein, J. et al. Removing batch effects from purified plasma cell gene expression microarrays with modified ComBat. BMC Bioinformatics 16, 63 (2015) Returns: None Usage: DCS = DigitalCellSorter.DigitalCellSorter() DCS.batchEffectCorrection() ''' if method == 'COMBAT': cells = self._df_expr.columns.get_level_values('cell').values.copy() patients = self._df_expr.columns.get_level_values('batch').values.copy() if len(np.unique(patients)) == 1: if self.verbose >= 2: print('Only one batch provided. Batch correction is not necessary', flush=True) else: if self.verbose >= 1: print('ComBat transformation', flush=True) if self.verbose >= 2: print('Reading positions of zeros', flush=True) where_zeros = self._df_expr == 0. values = combat(pd.DataFrame(data=self._df_expr.values.copy(), index=self._df_expr.index.values.copy(), columns=range(len(cells))), pd.Series(data=patients, index=range(len(cells)))).values self._df_expr = pd.DataFrame(data=values, index=self._df_expr.index, columns=self._df_expr.columns) if self.verbose >= 2: print('Setting original zeros back to zeros', flush=True) self._df_expr[where_zeros] = 0. if self.verbose >= 2: print('Setting negative values to zeros', flush=True) self._df_expr[self._df_expr < 0.0] = 0. else: if self.verbose >= 1: print('Batch effect correction method unknown') return