Source code for singler._mock_data
import math
import numpy
import biocframe
import summarizedexperiment
[docs]
def mock_reference_data(
    num_labels: int = 5,
    num_replicates: int = 4,
    num_genes: int = 1000,
    prop_markers: float = 0.5
) -> summarizedexperiment.SummarizedExperiment:
    """
    Mock up some reference data for the various examples in the **singler** package.
    The simulated data is very simple and should not be used for performance comparisons.

    Each label receives its own contiguous block of marker genes. For a marker
    gene, a non-negative log2-fold change is drawn as the absolute value of a
    standard normal deviate; all other genes have a log2-fold change of zero.
    Counts for every replicate are then sampled from a Poisson distribution
    with mean ``10 * 2**log_fold_change``.

    Args:
        num_labels:
            Number of labels. Labels are named with single uppercase letters
            (``"A"``, ``"B"``, ...), so at most 26 labels are supported.

        num_replicates:
            Number of replicates per label.

        num_genes:
            Number of genes in the dataset.

        prop_markers:
            Proportion of genes that are markers, i.e., DE between labels.
            The markers are split evenly across labels (rounding up); if the
            rounded blocks would run past the last gene, the trailing blocks
            are truncated.

    Returns:
        A :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` containing a count matrix and per-column labels.
        The metadata holds ``"means"``, a ``num_genes`` by ``num_labels``
        matrix of the per-label log2-fold changes, and ``"labels"``, the list
        of label names in column order of ``"means"``.

    Examples:
        >>> import singler
        >>> ref = singler.mock_reference_data()
        >>> print(ref)
    """
    num_markers = num_genes * prop_markers
    markers_per_label = math.ceil(num_markers / num_labels)

    LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    means = numpy.zeros((num_genes, num_labels), dtype=numpy.dtype("double"))
    mat = numpy.zeros((num_genes, num_labels * num_replicates), dtype=numpy.dtype("double"))
    labels = []

    for i in range(num_labels):
        curmeans = numpy.zeros(num_genes, dtype=numpy.dtype("double"))

        # Rounding markers_per_label up can push the last block(s) beyond the
        # final gene (e.g., 1000 genes, 3 labels, prop_markers=1). Clamp the
        # block to the valid gene range and draw exactly as many values as the
        # slice holds, otherwise the assignment fails with a shape mismatch.
        start_marker = min(markers_per_label * i, num_genes)
        end_marker = min(start_marker + markers_per_label, num_genes)
        curmeans[start_marker:end_marker] = numpy.abs(
            numpy.random.normal(size=end_marker - start_marker)
        )
        means[:, i] = curmeans

        # Replicates of the same label occupy adjacent columns.
        for r in range(num_replicates):
            mat[:, i * num_replicates + r] = numpy.random.poisson(lam=10 * 2**curmeans, size=num_genes)
        labels.extend([LETTERS[i]] * num_replicates)

    return summarizedexperiment.SummarizedExperiment(
        {"counts": mat},
        column_data=biocframe.BiocFrame({"label": labels}),
        row_names=["GENE_" + str(i) for i in range(num_genes)],
        metadata={"means": means, "labels": list(LETTERS[:num_labels])},
    )
[docs]
def mock_test_data(
    mock_ref: summarizedexperiment.SummarizedExperiment,
    num_cells: int = 100
) -> summarizedexperiment.SummarizedExperiment:
    """
    Mock up some test data for the various examples in the **singler** package.
    The simulated data is very simple and should not be used for performance comparisons.

    Every simulated cell is assigned one of the reference labels uniformly at
    random, and its counts are sampled from a Poisson distribution whose mean
    is ``2**m``, where ``m`` is that label's column of the ``"means"`` matrix
    stored in the reference metadata.

    Args:
        mock_ref:
            Mock reference data generated by :py:func:`~mock_reference_data`.
            Its metadata must provide the ``"means"`` matrix and the
            ``"labels"`` list.

        num_cells:
            Number of cells for which to generate test data.

    Returns:
        A :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` containing a count matrix and per-cell labels.
        Row names are taken from ``mock_ref``.

    Examples:
        >>> import singler
        >>> ref = singler.mock_reference_data()
        >>> test = singler.mock_test_data(ref)
        >>> print(test)
    """
    ref_meta = mock_ref.get_metadata()
    log_means = ref_meta["means"]
    label_names = ref_meta["labels"]
    gene_count = log_means.shape[0]
    label_count = log_means.shape[1]

    counts = numpy.zeros((gene_count, num_cells), dtype=numpy.float64)
    cell_labels = []

    for cell in range(num_cells):
        # Pick the true label of this cell, then simulate its expression profile.
        pick = numpy.random.randint(low=0, high=label_count)
        cell_labels.append(label_names[pick])

        # Deliberately omit the 10x from the reference, to make life interesting.
        rate = 2 ** log_means[:, pick]
        counts[:, cell] = numpy.random.poisson(lam=rate, size=gene_count)

    return summarizedexperiment.SummarizedExperiment(
        {"counts": counts},
        column_data=biocframe.BiocFrame({"label": cell_labels}),
        row_names=mock_ref.get_row_names(),
    )