import os
import copy
import h5py
import numpy as np
import pandas as pd
from scipy import io
import urllib.request
from .GenericFunctions import write, read, extractFromZipOfGz
def getHCAdataByURL(url, extractPath, extractData = True):

    '''Download and extract data from Human Cell Atlas Portal

    Parameters:
        url: str
            URL of the data of interest

        extractPath: str
            Path where to save and extract data to

        extractData: boolean, Default True
            Whether to extract downloaded data

    Returns:
        None

    Usage:
        getHCAdataByURL(url, extractPath)
    '''

    # Refuse to overwrite a previous extraction; the user must remove it first
    if os.path.exists(extractPath):
        print('Extraction directory already exists. To re-extract files remove the directory:\n', extractPath)
        return

    print('Creating directories:\n', extractPath)
    os.makedirs(extractPath)

    # The archive is saved next to (one level above) the extraction directory,
    # keeping the file name from the URL
    downloadTarget = os.path.join(os.path.dirname(extractPath), os.path.basename(url))

    if not os.path.isfile(downloadTarget):
        print('Downloading file:', url)
        urllib.request.urlretrieve(url, downloadTarget)

    if extractData:
        # NOTE(review): assumes the downloaded archive is named extractPath + '.zip' —
        # i.e. basename(url) matches basename(extractPath) + '.zip'; confirm with callers
        extractFromZipOfGz(extractPath + '.zip', removeDownloadedZipFile=True)

        # Convert each metadata table to the project's compressed binary format
        for tempName in ['cells', 'genes', 'barcodes']:
            tempDf = pd.read_csv(os.path.join(extractPath, tempName + '.tsv'), delimiter='\t', index_col=0, header=0)
            write(tempDf, os.path.join(extractPath, tempName))

    return
def recordFilesOfIndividualDonors(extractPath, organName=None, donorIDcolumn='donor_organism.provenance.document_id',
                                  organColumn='derived_organ_parts_label', useHogoGeneNames=True):

    '''Record h5 files of HCA individual donors in a dataset

    Parameters:
        extractPath: str
            Path of directories where HCA matrix files were downloaded and extracted.
            See function getHCAdataByURL() for detail.

        organName: str, Default None
            Name of the organ name. E.g. 'pancreas', 'bone marrow', etc.

        donorIDcolumn: str, Default donor_organism.provenance.document_id
            Column with unique IDs of donors in the file. Another option is
            'specimen_from_organism.provenance.document_id' if IDs at samples level
            are needed.

        organColumn: str, Default 'derived_organ_parts_label'
            'derived_organ_label'
            'derived_organ_parts_label'
            This option is ignored when organName parameter is None.

        useHogoGeneNames: boolean, Default True
            Whether to use HUGO gene names.
            (Parameter name contains a historical typo of "HUGO"; kept for
            backward compatibility with existing callers.)

    Returns:
        list
            List of donor IDs

    Usage:
        recordFilesOfIndividualDonors(extractPath, organName='retina')
    '''

    print('extractPath:', extractPath)

    if not os.path.isfile(os.path.join(extractPath, 'cells' + '.pklz')):
        print('Data files not found. Download and prepare them by calling function getHCAdataByURL()')
        return

    cells = read(os.path.join(extractPath, 'cells'))

    if organName is None:
        subsetOfCells = cells
    else:
        subsetOfCells = cells.iloc[np.where(cells[organColumn]==organName)[0]]

    donorIDs = np.unique(subsetOfCells[donorIDcolumn].values).tolist()

    # Both the sparse count matrix and the gene-name table are loaded lazily,
    # only if at least one donor file actually needs to be generated, and are
    # loaded exactly once (the original re-read the genes table on every donor)
    dataSparseMatrix = None
    genes = None

    for donorID in donorIDs:
        print('Donor:', donorID)

        if not os.path.isfile(os.path.join(extractPath, 'dfDonorID %s'%(donorID) + '.h5')):
            if dataSparseMatrix is None:
                print('Reading data from *.mtx file')
                dataSparseMatrix = io.mmread(os.path.join(extractPath, 'matrix.mtx')).tocsr()

            if genes is None:
                # Ensembl to HUGO conversion table (index: Ensembl IDs, values: HUGO names)
                genes = read(os.path.join(extractPath, 'genes'))['featurename']

            # Identifiers of cells of a single donor (optionally restricted to one organ)
            if organName is None:
                idxDonorID = np.where(cells[donorIDcolumn]==donorID)[0]
            else:
                idxDonorID = np.where((cells[organColumn]==organName) & (cells[donorIDcolumn]==donorID))[0]

            # Slice the selected cells out of the full matrix; rows are genes,
            # columns are a (donorID, cell barcode) MultiIndex
            dfDonorID = pd.DataFrame(data=dataSparseMatrix[:,idxDonorID].todense(),
                                     index=genes.index if not useHogoGeneNames else genes.values,
                                     columns=pd.MultiIndex.from_arrays([[donorID]*len(idxDonorID), cells.index[idxDonorID].values]))
            print(dfDonorID.shape)

            # Merge duplicates that might have appeared after gene name conversion
            # (axis=0 is the groupby default; the deprecated keyword is omitted)
            dfDonorID = dfDonorID.groupby(level=0).sum()
            print(dfDonorID.shape)

            # Drop all-zero genes
            dfDonorID = dfDonorID.loc[dfDonorID.sum(axis=1)>0.]

            # Write compressed data of this single donor
            print('Recording data of %s'%(donorID), dfDonorID.shape)
            dfDonorID.to_hdf(os.path.join(extractPath, 'dfDonorID %s'%(donorID) + '.h5'), key=donorID, mode='a', complevel=4, complib='zlib')

    return donorIDs
def getDataframeByDonorID(extractPath, donorID):

    '''Get pandas.DataFrame by Donor ID

    Parameters:
        extractPath: str
            Path of directories where HCA matrix files were downloaded and extracted.
            See function getHCAdataByURL() for detail.

        donorID: str
            Donor ID.

    Returns:
        pandas.DataFrame
            Matrix corresponding to the Donor ID

    Usage:
        getDataframeByDonorID(extractPath, donorID)
    '''

    fullFileName = os.path.join(extractPath, 'dfDonorID %s'%(donorID) + '.h5')

    # Guard clause: nothing to load if the donor file was never recorded
    if not os.path.isfile(fullFileName):
        print('File %s not found'%(fullFileName))
        return

    print('Reading compressed DataFrame at:', fullFileName)

    return pd.read_hdf(fullFileName, key=donorID, mode='r')
def PrepareDataOnePatient_PREVIEW_DATASET(filename, patient, saveFolderName,
                                          useAllData=True, cellsLimitToUse=1000, randomlySample=True, randomSeed=0):

    '''Prepare data from Human Cell Atlas (HCA) preview dataset h5 data file.
    The user can download the file ica_bone_marrow_h5.h5 from https://preview.data.humancellatlas.org/
    (Raw Counts Matrix - Bone Marrow) and place in folder data.
    The file is ~485Mb and contains all 378000 cells from 8 bone marrow donors (BM1-BM8).
    Note: this data file is no longer available at HCA data server, however, some users may have a
    copy of it and need to extract data from it.

    Parameters:
        filename: str
            Path and name of the file to store binary data in

        patient: str
            Identifier of the patient: 'BM1', 'BM2', 'BM3', 'BM4', 'BM5', 'BM6', 'BM7' or 'BM8'

        saveFolderName: str
            Path where to save prepared data file

        useAllData: boolean, Default True
            Whether to use all data or a subset

        cellsLimitToUse: int, Default 1000
            Number of cells to use if useAllData=False

        randomlySample: boolean, Default True
            Whether to sample cells randomly or pick top number

        randomSeed: int, Default 0
            Random seed

    Returns:
        None

    Usage:
        PrepareDataOnePatient(os.path.join('data', 'ica_bone_marrow_h5.h5'), 'BM1', os.path.join('data', ''), useAllData=False, cellsLimitToUse=5000)
    '''

    print('------ Reading values of %s --------------------------------------------'%(patient))

    # Open the data file inside a context manager so the HDF5 handle is always
    # released, even if an exception occurs while reading (the original left
    # the file open for the lifetime of the process)
    with h5py.File(filename, 'r') as h5File:
        barcodes = np.array(list(map(lambda s: s.decode('UTF-8'), copy.deepcopy(h5File['GRCh38/barcodes'][()]))))
        gene_names = np.array(list(map(lambda s: s.decode('UTF-8'), copy.deepcopy(h5File['GRCh38/gene_names'][()]))))
        # CSC-style column pointers: indptr[i]:indptr[i+1] spans the non-zero
        # entries of cell i in the 'indices'/'data' datasets
        indptr = copy.deepcopy(h5File['GRCh38/indptr'][()])

        # Patient ID is encoded in characters 6:9 of each barcode (e.g. 'BM1')
        patients = np.array([cell[6:9] for cell in barcodes])
        patient_cells = np.where(patients==patient)[0]

        if not useAllData:
            np.random.seed(randomSeed)
            patient_cells = np.random.choice(patient_cells, replace=False, size=cellsLimitToUse) if randomlySample else patient_cells[:cellsLimitToUse]

        columns = pd.MultiIndex.from_arrays([patients[patient_cells], barcodes[patient_cells]], names=['batch', 'cell'])
        df = pd.DataFrame(data=np.zeros((len(gene_names), len(patient_cells))), index=gene_names, columns=columns)
        print('Combining cells into DataFrame. Data size: %s genes, %s cells' % df.shape)

        # Scatter each cell's sparse non-zero counts into the dense DataFrame column
        for cell, index in enumerate(patient_cells):
            df.iloc[copy.deepcopy(h5File['GRCh38/indices'][indptr[index]:indptr[index + 1]]), cell] = copy.deepcopy(h5File['GRCh38/data'][indptr[index]:indptr[index + 1]])

    print('Compressing %s to Pandas HDF'%(patient))
    df.to_hdf(os.path.join(saveFolderName, '%s.h5'%(patient)), key=patient, mode='a', complevel=4, complib='zlib')

    print('-------------------------------------------------------------------------\n')

    return None
def prepareDemo5kData(dir):

    '''Load the 10x Genomics 5k PBMC demo dataset into a genes-by-cells DataFrame.
    On the first call, parses the raw triplet files ('matrix.mtx', 'features.tsv',
    'barcodes.tsv') in directory `dir`, caches the result to a pickle, and returns it.
    Subsequent calls load the cached pickle directly.
    Source archive:
    http://cf.10xgenomics.com/samples/cell-exp/3.1.0/manual_5k_pbmc_NGSC3_ch1/manual_5k_pbmc_NGSC3_ch1_filtered_feature_bc_matrix.tar.gz

    Parameters:
        dir: str
            Directory containing the extracted 10x matrix files.
            (Parameter name shadows the builtin `dir`; kept for backward compatibility.)

    Returns:
        pandas.DataFrame
            Counts matrix indexed by gene name, columns by cell barcode,
            or None if parsing failed.

    Usage:
        df = prepareDemo5kData(os.path.join('data', 'filtered_feature_bc_matrix'))
    '''

    targetFile = os.path.join(dir, 'manual_5k_pbmc_NGSC3_ch1.gz')

    if not os.path.isfile(targetFile):
        try:
            # Matrix Market triplets: skip the two header comments and the
            # dimensions line; remaining rows are (gene_idx, cell_idx, count)
            df_data = pd.read_csv(os.path.join(dir, 'matrix.mtx'), delimiter=' ', index_col=[0,1], header=None, skiprows=[0,1,2])[2]
            df_data.name = 'count'
            # Pivot to a dense genes-by-cells matrix; absent triplets are zero counts
            df_data = df_data.unstack(fill_value=0).astype(int)

            # Map 1-based gene indices to gene names (second column of features.tsv)
            genes = pd.read_csv(os.path.join(dir, 'features.tsv'), delimiter='\t', index_col=None, header=None)[1]
            genes.index = genes.index + 1
            df_data.index = pd.Index(genes.loc[df_data.index].values, name='gene')

            # Map 1-based cell indices to barcodes
            df_data.columns = pd.read_csv(os.path.join(dir, 'barcodes.tsv'), delimiter='\t', index_col=0, header=None).index
            df_data.columns.names = ['cell']

            # Collapse duplicate gene names introduced by the index conversion
            # (axis=0 is the groupby default; the deprecated keyword is omitted)
            df_data = df_data.groupby(level=0).sum()

            df_data.to_pickle(targetFile, protocol=4)

        except Exception as exception:
            # Best-effort behavior preserved: report and bail out. The original
            # fell through to print(df_data) with df_data unbound, raising NameError.
            print(exception)
            return None
    else:
        df_data = pd.read_pickle(targetFile)

    print(df_data)

    return df_data