Source code for singler._mock_data

import math
import numpy
import biocframe
import summarizedexperiment


def mock_reference_data(
    num_labels: int = 5,
    num_replicates: int = 4,
    num_genes: int = 1000,
    prop_markers: float = 0.5,
) -> summarizedexperiment.SummarizedExperiment:
    """
    Mock up some reference data for the various examples in the **singler** package.
    The simulated data is very simple and should not be used for performance comparisons.

    Each label is given its own contiguous block of marker genes. Within that
    block the label's log2-fold change over the baseline is the absolute value
    of a standard normal draw; everywhere else it is zero. Counts for every
    replicate are Poisson-distributed with rate ``10 * 2**mean``.

    Args:
        num_labels:
            Number of labels. Labels are named with single uppercase letters
            (``"A"``, ``"B"``, ...), so at most 26 labels are supported.

        num_replicates:
            Number of replicates per label.

        num_genes:
            Number of genes in the dataset.

        prop_markers:
            Proportion of genes that are markers, i.e., DE between labels.

    Returns:
        A :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
        containing a count matrix (assay ``"counts"``) and per-column labels
        (column ``"label"``). Its metadata holds ``"means"``, a genes-by-labels
        matrix of the simulated log2-fold changes, and ``"labels"``, the list
        of label names in column order of ``"means"``.

    Examples:
        >>> import singler
        >>> ref = singler.mock_reference_data()
        >>> print(ref)
    """
    num_markers = num_genes * prop_markers
    markers_per_label = math.ceil(num_markers / num_labels)
    LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    means = numpy.zeros((num_genes, num_labels), dtype=numpy.dtype("double"))
    mat = numpy.zeros((num_genes, num_labels * num_replicates), dtype=numpy.dtype("double"))
    labels = []

    for i in range(num_labels):
        curmeans = numpy.zeros(num_genes, dtype=numpy.dtype("double"))

        # Rounding markers_per_label up can push the last block(s) past the
        # final gene (e.g. 10 genes, 3 labels, prop_markers=1 -> 4 per label).
        # Clamp the block to the available genes and draw exactly as many
        # values as the block holds, otherwise the slice assignment fails
        # with a shape mismatch.
        start_marker = min(markers_per_label * i, num_genes)
        end_marker = min(start_marker + markers_per_label, num_genes)
        curmeans[start_marker:end_marker] = numpy.abs(
            numpy.random.normal(size=end_marker - start_marker)
        )
        means[:, i] = curmeans

        for r in range(num_replicates):
            mat[:, i * num_replicates + r] = numpy.random.poisson(
                lam=10 * 2**curmeans, size=num_genes
            )
        labels += [LETTERS[i]] * num_replicates

    return summarizedexperiment.SummarizedExperiment(
        {"counts": mat},
        column_data=biocframe.BiocFrame({"label": labels}),
        row_names=["GENE_" + str(i) for i in range(num_genes)],
        metadata={"means": means, "labels": list(LETTERS[:num_labels])},
    )
def mock_test_data(
    mock_ref: summarizedexperiment.SummarizedExperiment,
    num_cells: int = 100,
) -> summarizedexperiment.SummarizedExperiment:
    """
    Simulate a small test dataset to accompany a mock reference, for use in
    the examples of the **singler** package. The simulation is deliberately
    simplistic and is not suitable for performance comparisons.

    Every cell is assigned one of the reference labels uniformly at random,
    and its counts are Poisson draws with rate ``2**mean`` using that label's
    column of the reference ``"means"`` metadata.

    Args:
        mock_ref:
            Mock reference data generated by :py:func:`~mock_reference_data`.
            Its metadata must provide the ``"means"`` matrix and the
            ``"labels"`` list.

        num_cells:
            Number of cells for which to generate test data.

    Returns:
        A :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
        containing a count matrix (assay ``"counts"``) and per-cell labels
        (column ``"label"``), with the same row names as ``mock_ref``.

    Examples:
        >>> import singler
        >>> ref = singler.mock_reference_data()
        >>> test = singler.mock_test_data(ref)
        >>> print(test)
    """
    ref_meta = mock_ref.get_metadata()
    label_means = ref_meta["means"]
    label_names = ref_meta["labels"]

    gene_count = label_means.shape[0]
    label_count = label_means.shape[1]

    counts = numpy.zeros((gene_count, num_cells), dtype=numpy.float64)
    cell_labels = []

    for cell in range(num_cells):
        # Pick the true label first, then simulate that cell's counts.
        pick = numpy.random.randint(low=0, high=label_count)
        cell_labels.append(label_names[pick])

        # deliberately omit the 10x from the reference, to make life interesting.
        rates = 2 ** label_means[:, pick]
        counts[:, cell] = numpy.random.poisson(lam=rates, size=gene_count)

    assays = {"counts": counts}
    per_cell = biocframe.BiocFrame({"label": cell_labels})
    return summarizedexperiment.SummarizedExperiment(
        assays,
        column_data=per_cell,
        row_names=mock_ref.get_row_names(),
    )