Demo¶

The demo can be executed after installation of the DigitalCellSorter. In the terminal run the command, and follow the prompts:

python -m DigitalCellSorter

The demo script is detailed below:

import sys
sys.path.append("..")

import os
import urllib.request
import DigitalCellSorter
import DigitalCellSorter.ReadPrepareDataHCA as prep

if __name__ == '__main__':

    print('This is a large dataset demo.\nFor the "5k PBMC demo" run "python -m DigitalCellSorter"\n')

    here = os.path.dirname(__file__)

    url = "https://data.humancellatlas.org/project-assets/project-matrices/cc95ff89-2e68-4a08-a234-480eca21ce79.homo_sapiens.mtx.zip"
    extractPath = os.path.join(here, 'data', os.path.splitext(os.path.basename(url))[0])

    ## Download and unpack data
    #prep.getHCAdataByURL(url, extractPath)

    # Call function recordFilesOfIndividualDonors to load the data from HCA Data Portal
    #id = prep.recordFilesOfIndividualDonors(extractPath, organName='bone marrow')[0]
    id = '085e737d-adb5-4597-bd54-5ebeda170038'

    # Get the data. The file will be downloaded from github if not found locally
    try:
        if not os.path.exists(extractPath):
            os.makedirs(extractPath)

        if not os.path.isfile(os.path.join(extractPath, 'dfDonorID %s.h5' % id)):
            print('Downloading 110 Mb data file (50000 cells)')
            temp = 'https://github.com/sdomanskyi/DigitalCellSorter/raw/master/data/dfDonorID %s.h5' % id
            urllib.request.urlretrieve(temp.replace(' ', '%20'), os.path.join(extractPath, 'dfDonorID %s.h5' % id))
    except Exception as exception:
        print('Could not download the file\n', exception)
        exit()

    # Load gene expression data from h5 file
    df_expr = prep.getDataframeByDonorID(extractPath, id)
    df_expr.columns.names = ['batch', 'cell']

    # Create an instance of class DigitalCellSorter. 
    # Here we use Default parameter values for most of the parameters
    DCS = DigitalCellSorter.DigitalCellSorter(dataName='BM1', geneNamesType = 'ensembl',
                                              saveDir=os.path.join(here, 'output', 'BM1', ''),
                                              geneListFileName='CIBERSORT_LM22_7')

    # Validate the expression data, so that it has correct form
    DCS.prepare(df_expr)

    # Delete df_expr as now DCS contains the master copy of it
    del df_expr

    # Process the expression data, i.e. quality control, dimensionality reduction, clustering
    DCS.process()

    # Load marker genes and annotate cells
    DCS.annotate()

    # Make plots of annotated data
    DCS.visualize()
    
    # Make CD19 gene expression plot
    for name in DCS.getHugoName('CD19'):
        DCS.makeIndividualGeneExpressionPlot(name)
            
    # Make CD33 gene expression plot
    for name in DCS.getHugoName('CD33'):
        DCS.makeIndividualGeneExpressionPlot(name)
    
    # Further analysis can be done on cell types of interest, e.g. here 'T cell' and 'B cell'.
    # Let's create a new instance of DigitalCellSorter to run "sub-analysis" with it.
    # It is important to disable Quality control, because the low quality cells have 
    # already been identified and filtered with DCS.
    # Parameter dataName points to the location processed with DCS. 
    DCSsub = DigitalCellSorter.DigitalCellSorter(dataName='BM1', 
                                                 nClusters=10, 
                                                 doQualityControl=False,
                                                 layout='PHATE',
                                                 subclusteringName='T cell')

    # Modify a few other attributes
    DCSsub.saveDir = os.path.join(here, 'output', 'BM1', 'subclustering T cell', '')
    DCSsub.geneListFileName = os.path.join(here, 'docs', 'examples', 'CIBERSORT_T_SUB.xlsx')

    # Get index of T cells
    indexOfTcells = DCS.getCells(celltype='T cell')

    # Get expression of these T cells using their index
    df_expr = DCS.getExprOfCells(indexOfTcells)

    # Insert expression data into DCSsub
    DCSsub.prepare(df_expr)

    # Process subtype 'T cell'
    DCSsub.process(dataIsNormalized=True)

    # Load marker genes and annotate cells
    DCSsub.annotate()

    # Make plots of annotated data
    DCSsub.visualize()