Demo¶

import sys
sys.path.append("..")

import os
import DigitalCellSorter
import DigitalCellSorter.ReadPrepareDataHCA as prep

if __name__ == '__main__':

    here = os.path.dirname(__file__)

    url = "https://data.humancellatlas.org/project-assets/project-matrices/cc95ff89-2e68-4a08-a234-480eca21ce79.homo_sapiens.mtx.zip"
    extractPath = os.path.join(here, 'data', os.path.splitext(os.path.basename(url))[0])

    # Download and unpack data
    prep.getHCAdataByURL(url, extractPath)

    # Call function recordFilesOfIndividualDonors to load the data from HCA Data Portal
    id = prep.recordFilesOfIndividualDonors(extractPath, organName='bone marrow')[0]

    # Load gene expression data from h5 file
    df_expr = prep.getDataframeByDonorID(extractPath, id)
    df_expr.columns.names = ['batch', 'cell']

    # Create an instance of class DigitalCellSorter. 
    # Here we use Default parameter values for most of the parameters
    DCS = DigitalCellSorter.DigitalCellSorter(dataName='BM1', 
                                              saveDir=os.path.join(here, 'output', 'BM1', ''),
                                              geneListFileName='CIBERSORT_LM22_7')

    # Validate the expression data, so that it has correct form
    DCS.prepare(df_expr)

    # Delete df_expr as now DCS contains the master copy of it
    del df_expr

    # Process the expression data, i.e. quality control, dimensionality reduction, clustering
    DCS.process()

    # Load marker genes and annotate cells
    DCS.annotate()

    # Make plots of annotated data
    DCS.visualize()
    
    # Make CD19 gene expression plot
    for name in DCS.getHugoName('CD19'):
        DCS.makeIndividualGeneExpressionPlot(name)
            
    # Make CD33 gene expression plot
    for name in DCS.getHugoName('CD33'):
        DCS.makeIndividualGeneExpressionPlot(name)
    
    # Further analysis can be done on cell types of interest, e.g. here 'T cell' and 'B cell'.
    # Let's create a new instance of DigitalCellSorter to run "sub-analysis" with it.
    # It is important to disable Quality control, because the low quality cells have 
    # already been identified and filtered with DCS.
    # Parameter dataName points to the location processed with DCS. 
    DCSsub = DigitalCellSorter.DigitalCellSorter(dataName='BM1', 
                                                 nClusters=10, 
                                                 doQualityControl=False,
                                                 layout='PHATE',
                                                 subclusteringName='T cell')

    # Modify a few other attributes
    DCSsub.saveDir = os.path.join(here, 'output', 'BM1', 'subclustering T cell', '')
    DCSsub.geneListFileName = os.path.join(here, 'docs', 'examples', 'CIBERSORT_T_SUB.xlsx')

    # Get index of T cells
    indexOfTcells = DCS.getCells(celltype='T cell')

    # Get expression of these T cells using their index
    df_expr = DCS.getExprOfCells(indexOfTcells)

    # Insert expression data into DCSsub
    DCSsub.prepare(df_expr)

    # Process subtype 'T cell'
    DCSsub.process(dataIsNormalized=True)

    # Load marker genes and annotate cells
    DCSsub.annotate()

    # Make plots of annotated data
    DCSsub.visualize()