Cell type classification using the SingleR algorithm in C++. More...

Classes
struct	ChooseClassicMarkersOptions
	Options for `choose_classic_markers()`. More...

struct	ClassifyIntegratedBuffers
	Output buffers for `classify_single()`. More...

struct	ClassifyIntegratedOptions
	Options for `classify_integrated()`. More...

struct	ClassifyIntegratedResults
	Results of `classify_integrated()`. More...

struct	ClassifySingleBuffers
	Output buffers for `classify_single()`. More...

struct	ClassifySingleOptions
	Options for `classify_single()` and friends. More...

struct	ClassifySingleResults
	Results of `classify_single()` and `classify_single_intersect()`. More...

class	TrainedIntegrated
	Classifier that integrates multiple reference datasets. More...

class	TrainedSingle
	Classifier trained from a single reference. More...

class	TrainedSingleIntersect
	Classifier built from an intersection of genes. More...

struct	TrainIntegratedInput
	Input to `train_integrated()`. More...

struct	TrainIntegratedOptions
	Options for `train_integrated()`. More...

struct	TrainSingleOptions
	Options for `train_single()` and friends. More...

Typedefs
template<typename Index_ = DefaultIndex>
using	Markers = std::vector<std::vector<std::vector<Index_> > >

template<typename Index_ = DefaultIndex>
using	Intersection = std::vector<std::pair<Index_, Index_> >

typedef int	DefaultIndex

typedef int	DefaultLabel

typedef int	DefaultRefLabel

typedef double	DefaultFloat

typedef double	DefaultValue

Functions
template<typename Value_ , typename Index_ , typename Float_ , typename Label_ >
void	classify_single (const tatami::Matrix< Value_, Index_ > &test, const TrainedSingle< Index_, Float_ > &trained, const ClassifySingleBuffers< Label_, Float_ > &buffers, const ClassifySingleOptions< Float_ > &options)
	Implements the SingleR algorithm for automated annotation of single-cell RNA-seq data.

template<typename Value_ , typename Index_ , typename Float_ , typename Label_ >
void	classify_single_intersect (const tatami::Matrix< Value_, Index_ > &test, const TrainedSingleIntersect< Index_, Float_ > &trained, const ClassifySingleBuffers< Label_, Float_ > &buffers, const ClassifySingleOptions< Float_ > &options)

template<typename Label_ = DefaultLabel, typename Value_ , typename Index_ , typename Float_ >
ClassifySingleResults< Label_, Float_ >	classify_single (const tatami::Matrix< Value_, Index_ > &test, const TrainedSingle< Index_, Float_ > &trained, const ClassifySingleOptions< Float_ > &options)

template<typename Label_ = DefaultLabel, typename Value_ , typename Index_ , typename Float_ >
ClassifySingleResults< Label_, Float_ >	classify_single_intersect (const tatami::Matrix< Value_, Index_ > &test, const TrainedSingleIntersect< Index_, Float_ > &trained, const ClassifySingleOptions< Float_ > &options)

template<typename Value_ , typename Index_ , typename Label_ , typename Float_ , class Matrix_ >
TrainedSingle< Index_, Float_ >	train_single (const tatami::Matrix< Value_, Index_ > &ref, const Label_ *labels, Markers< Index_ > markers, const TrainSingleOptions< Index_, Float_, Matrix_ > &options)

template<typename Index_ , typename Value_ , typename Label_ , typename Float_ >
TrainedSingleIntersect< Index_, Float_ >	train_single_intersect (Index_ test_nrow, const Intersection< Index_ > &intersection, const tatami::Matrix< Value_, Index_ > &ref, const Label_ *labels, Markers< Index_ > markers, const TrainSingleOptions< Index_, Float_ > &options)

template<typename Index_ , typename Id_ , typename Value_ , typename Label_ , typename Float_ , class Matrix_ >
TrainedSingleIntersect< Index_, Float_ >	train_single_intersect (Index_ test_nrow, const Id_ test_id, const tatami::Matrix< Value_, Index_ > &ref, const Id_ ref_id, const Label_ *labels, Markers< Index_ > markers, const TrainSingleOptions< Index_, Float_, Matrix_ > &options)

template<typename Value_ , typename Index_ , typename Label_ , typename RefLabel_ , typename Float_ >
void	classify_integrated (const tatami::Matrix< Value_, Index_ > &test, const std::vector< const Label_ * > &assigned, const TrainedIntegrated< Index_ > &trained, ClassifyIntegratedBuffers< RefLabel_, Float_ > &buffers, const ClassifyIntegratedOptions< Float_ > &options)
	Integrate classifications from multiple references.

template<typename RefLabel_ = DefaultRefLabel, typename Value_ , typename Index_ , typename Label_ , typename Float_ >
ClassifyIntegratedResults< RefLabel_, Float_ >	classify_integrated (const tatami::Matrix< Value_, Index_ > &test, const std::vector< const Label_ * > &assigned, const TrainedIntegrated< Index_ > &trained, const ClassifyIntegratedOptions< Float_ > &options)

template<typename Value_ , typename Index_ , typename Label_ , typename Float_ >
TrainIntegratedInput< Value_, Index_, Label_ >	prepare_integrated_input (const tatami::Matrix< Value_, Index_ > &ref, const Label_ *labels, const TrainedSingle< Index_, Float_ > &trained)

template<typename Index_ , typename Value_ , typename Label_ , typename Float_ >
TrainIntegratedInput< Value_, Index_, Label_ >	prepare_integrated_input_intersect (Index_ test_nrow, const Intersection< Index_ > &intersection, const tatami::Matrix< Value_, Index_ > &ref, const Label_ *labels, const TrainedSingleIntersect< Index_, Float_ > &trained)

template<typename Index_ , typename Id_ , typename Value_ , typename Label_ , typename Float_ >
TrainIntegratedInput< Value_, Index_, Label_ >	prepare_integrated_input_intersect (Index_ test_nrow, const Id_ test_id, const tatami::Matrix< Value_, Index_ > &ref, const Id_ ref_id, const Label_ *labels, const TrainedSingleIntersect< Index_, Float_ > &trained)

template<typename Value_ , typename Index_ , typename Label_ >
TrainedIntegrated< Index_ >	train_integrated (const std::vector< TrainIntegratedInput< Value_, Index_, Label_ > > &inputs, const TrainIntegratedOptions &options)

template<typename Value_ , typename Index_ , typename Label_ >
TrainedIntegrated< Index_ >	train_integrated (std::vector< TrainIntegratedInput< Value_, Index_, Label_ > > &&inputs, const TrainIntegratedOptions &options)

template<typename Index_ , typename Id_ >
Intersection< Index_ >	intersect_genes (Index_ test_nrow, const Id_ test_id, Index_ ref_nrow, const Id_ ref_id)

std::size_t	number_of_classic_markers (std::size_t num_labels)

template<typename Value_ , typename Index_ , typename Label_ >
Markers< Index_ >	choose_classic_markers (const std::vector< const tatami::Matrix< Value_, Index_ > * > &representatives, const std::vector< const Label_ * > &labels, const ChooseClassicMarkersOptions &options)

template<typename Value_ , typename Index_ , typename Label_ >
Markers< Index_ >	choose_classic_markers (const tatami::Matrix< Value_, Index_ > &representative, const Label_ *labels, const ChooseClassicMarkersOptions &options)

Detailed Description

Cell type classification using the SingleR algorithm in C++.

Typedef Documentation

◆ DefaultFloat

typedef double singlepp::DefaultFloat

Default type for the Float_ template argument. This is the type for the correlations and classification scores.

◆ DefaultIndex

typedef int singlepp::DefaultIndex

Default type for the Index_ template argument. This is the type for the gene (and sample) indices, typically from the row/column indices of a tatami::Matrix.

◆ DefaultLabel

typedef int singlepp::DefaultLabel

Default type for the Label_ template argument. This is the type for the label identifiers within each reference.

◆ DefaultRefLabel

typedef int singlepp::DefaultRefLabel

Default type for the RefLabel_ template argument. This is the type for the reference identifiers during integrated classification.

◆ DefaultValue

typedef double singlepp::DefaultValue

Default type for the Value_ template argument. This is the type for input data in the tatami::Matrix.

◆ Intersection

template<typename Index_ = DefaultIndex>

using singlepp::Intersection = std::vector<std::pair<Index_, Index_> >

Intersection of genes between two datasets. Each pair represents a gene that is present in both datasets. The two elements of the pair represent the row indices of that gene in the respective matrices.

Typically, the first element of the pair contains the row index of a gene in the test dataset, while the second element of the pair contains the row index of the same gene in the reference dataset. This convention is used by intersect_genes(), train_single_intersect() and prepare_integrated_input_intersect().

A row index for a matrix should occur no more than once in the Intersection object. That is, all the first elements should be unique and all of the second elements should be unique. Pairs may be arbitrarily ordered within the object.

Template Parameters

Index_ Integer type for the gene (row) indices.

◆ Markers

template<typename Index_ = DefaultIndex>

using singlepp::Markers = std::vector<std::vector<std::vector<Index_> > >

A vector of vectors of marker lists, with one list for each pairwise comparison between labels in the reference dataset. This is used to determine which genes should be used to compute correlations in train_single() and train_single_intersect().

For a Markers object markers, let us consider the vector at markers[0][1]. This vector should contain a list of marker genes for label 0 compared to label 1. Each gene is represented as the row index of the reference expression matrix, i.e., ref in train_single() and train_single_intersect(). The vector should also be sorted by the "strength" of the markers such that the earliest entries are the strongest markers for that pairwise comparison. Typically, this vector is created by identifying the genes that are upregulated in label 0 compared to 1 and sorting by decreasing effect size. So, for example, markers[0][1][0] should contain the row index of the most upregulated gene in this comparison.

For a given reference dataset, the corresponding Markers object should have length equal to the number of labels in that reference. Each middle vector (i.e., markers[i] for non-negative i less than the number of labels) should also have length equal to the number of labels. Any innermost vector along the "diagonal" (i.e., markers[i][i]) is typically of zero length. The innermost vectors that are not on the diagonal (i.e., markers[i][j] for i != j) may be of any positive length and should contain unique row indices. Note that the length of all innermost vectors will be be capped by any non-negative TrainSingleOptions::top in train_single() and friends.

As mentioned above, the diagonal innermost vectors are typically empty, given that it makes little sense to identify upregulated markers in a label compared to itself. That said, any genes stored on the diagonal will be respected and used in all gene subsets for the corresponding label. This can be exploited by advanced users to efficiently store "universal" markers for a label, i.e., markers that are applicable in all comparisons to other labels.

Template Parameters

Index_ Integer type for the gene (row) indices.

Function Documentation

◆ choose_classic_markers() [1/2]

template<typename Value_ , typename Index_ , typename Label_ >

Markers< Index_ > singlepp::choose_classic_markers	(	const std::vector< const tatami::Matrix< Value_, Index_ > * > &	representatives,
		const std::vector< const Label_ * > &	labels,
		const ChooseClassicMarkersOptions &	options )

Overload of choose_classic_markers() that handles multiple references. When choosing markers for label \(A\) against \(B\), we only consider those references with both labels \(A\) and \(B\). For each gene, we compute the log-fold change within each reference, and then sum the log-fold changes across references; the ordering of the top genes is then performed using this sum. It is assumed that all references have the same number and ordering of features in their rows.

Template Parameters

Value_	Numeric type of matrix values.
Index_	Integer type of matrix row/column indices.
Label_	Integer type for the label identity.

Parameters

representatives	Vector of pointers to representative tatami matrices. Each matrix should contain no more than one column per label. Each column should contain a "representative" log-expression profile for that label, e.g., using the per-gene median expression across all samples assigned to that label. All matrices should have the same number of rows, corresponding to the same features.
labels	Vector of pointers of length equal to `representatives`. Each array should be of length equal to the number of columns of the corresponding entry of `representatives`. Each value of the array should specify the label for the corresponding column in its matrix. Values should lie in \([0, L)\) for \(L\) unique labels across all entries of `labels`.
options	Further options.

Returns: Top markers for each pairwise comparison between labels.

◆ choose_classic_markers() [2/2]

template<typename Value_ , typename Index_ , typename Label_ >

Markers< Index_ > singlepp::choose_classic_markers	(	const tatami::Matrix< Value_, Index_ > &	representative,
		const Label_ *	labels,
		const ChooseClassicMarkersOptions &	options )

Implements the classic SingleR method for choosing markers from (typically bulk) reference datasets. We assume that we have a matrix of representative expression profiles for each label, typically computed by averaging across all reference profiles for that label. For the comparison between labels \(A\) and \(B\), we define the marker set as the top genes with the largest positive differences in \(A\)'s profile over \(B\). This difference can be interpreted as the log-fold change if the input matrix contains log-expression values. If multiple genes have the same difference, ties are broken by favoring genes in earlier rows of the input matrix. The number of top genes can either be explicitly specified or it can be automatically determined from the number of labels.

Template Parameters

Value_	Numeric type of matrix values.
Index_	Integer type of matrix row/column indices.
Label_	Integer type for the label identity.

Parameters

representative	A representative matrix, containing one column per label. Each column should have a representative log-expression profile for that label.
labels	Pointer to an array of length equal to the number of columns in `representative`. Each value of the array should specify the label for the corresponding column. Values should lie in \([0, L)\) for \(L\) unique labels.
options	Further options.

Returns: Top markers for each pairwise comparison between labels.

◆ classify_integrated() [1/2]

template<typename Value_ , typename Index_ , typename Label_ , typename RefLabel_ , typename Float_ >

void singlepp::classify_integrated	(	const tatami::Matrix< Value_, Index_ > &	test,
		const std::vector< const Label_ * > &	assigned,
		const TrainedIntegrated< Index_ > &	trained,
		ClassifyIntegratedBuffers< RefLabel_, Float_ > &	buffers,
		const ClassifyIntegratedOptions< Float_ > &	options )

Integrate classifications from multiple references.

When multiple reference datasets are available, we would like to obtain a single prediction for each cell from all of those references. This is somewhat tricky as different references tend to have inconsistent labels, e.g., different vocabularies and cell subtype resolutions, making it difficult to define sensible groups in a combined "super-reference". Strong batch effects are also likely to exist between different references, complicating the choice of marker genes when comparing between labels of different references.

To avoid these issues, we first perform classification within each individual reference using, e.g., classify_single(). For each test cell, we collect all the marker genes for that cell's predicted label in each reference. We pool all of these collected markers to obtain a common set of interesting genes. Using this common set of genes, we compute the usual correlation-based score between the test cell's expression profile and its predicted label from each reference, along with some fine-tuning iterations to improve resolution between similar labels. The label with the highest score is considered the best representative across all references.

This method is similar to the algorithm described in classify_single(), except that we are choosing between the best labels from all references rather than between all labels from one reference. The creation of a common gene set ensures that the correlations can be reasonably compared across references. (Note that differences in the gene sets across references are tolerated by simply ignoring missing genes when computing the correlations. This reduces the comparability of the scores as the actual genes used for each reference will vary; nonetheless, it is preferred to taking the intersection, which is liable to leave us with very few genes.)

Our approach avoids any direct comparison between the expression profiles of different references, allowing us to side-step the question of how to deal with the batch effects. Similarly, we defer responsibility on solving the issue of label heterogeneity, by just passing along the existing labels and leaving it to the user's interpretation.

Template Parameters

Value_	Numeric type for the matrix values.
Index_	Integer type for the row/column indices.
Label_	Integer type for the labels within each reference.
RefLabel_	Integer type for the label to represent each reference.
Float_	Floating-point type for the correlations and scores.

Parameters

	test	Expression matrix of the test dataset, where rows are genes and columns are cells. The identity of the rows should be consistent with the reference datasets used to construct `trained`, see `prepare_integrated_input()` and `prepare_integrated_input_intersect()` for details.
[in]	assigned	Vector of pointers of length equal to the number of references. Each pointer should point to an array of length equal to the number of columns in `test`, containing the assigned label for each column in each reference.
	trained	The integrated classifier returned by `train_integrated()`.
[out]	buffers	Buffers in which to store the classification output.
	options	Further options.

◆ classify_integrated() [2/2]

template<typename RefLabel_ = DefaultRefLabel, typename Value_ , typename Index_ , typename Label_ , typename Float_ >

ClassifyIntegratedResults< RefLabel_, Float_ > singlepp::classify_integrated	(	const tatami::Matrix< Value_, Index_ > &	test,
		const std::vector< const Label_ * > &	assigned,
		const TrainedIntegrated< Index_ > &	trained,
		const ClassifyIntegratedOptions< Float_ > &	options )

Overload of classify_integrated() that allocates space for the results.

Parameters

	test	Expression matrix of the test dataset, where rows are genes and columns are cells. The identity of the rows should be consistent with the reference datasets used to construct `trained`, see `prepare_integrated_input()` and `prepare_integrated_input_intersect()` for details.
[in]	assigned	Vector of pointers of length equal to the number of references. Each pointer should point to an array of length equal to the number of columns in `mat`, containing the assigned label for each column in each reference.
	trained	A pre-built classifier produced by `train_integrated()`.
	options	Further options.

Returns: Object containing the best reference and associated scores for each cell in test.

◆ classify_single() [1/2]

template<typename Value_ , typename Index_ , typename Float_ , typename Label_ >

void singlepp::classify_single	(	const tatami::Matrix< Value_, Index_ > &	test,
		const TrainedSingle< Index_, Float_ > &	trained,
		const ClassifySingleBuffers< Label_, Float_ > &	buffers,
		const ClassifySingleOptions< Float_ > &	options )

Implements the SingleR algorithm for automated annotation of single-cell RNA-seq data.

For each cell, we compute the Spearman rank correlation between that cell and the reference expression profiles. This is done using only the subset of genes that are label-specific markers, most typically the top genes from pairwise comparisons between each label's expression profiles (see choose_classic_markers() for an example). For each label, we take the correlations involving that label's reference profiles and convert it into a score. The label with the highest score is used as an initial label for that cell.

Next, we apply fine-tuning iterations to improve the label accuracy for each cell by refining the feature space. We find the subset of labels with scores that are "close enough" to the maximum score according to some threshold. We recompute the scores using only the markers for this subset of labels, and we repeat the process until only one label is left or the subset of labels no longer changes. At the end of the iterations, the label with the highest score (or the only label, if just one is left) is used as the label for the cell. This process aims to remove noise by eliminating irrelevant genes when attempting to distinguish closely related labels.

Each label's score is defined as a user-specified quantile of the distribution of correlations across all reference profiles assigned to that label. Larger quantiles focus on similarity between the test cell and the closest profiles for a label, which is useful for broad labels with heterogeneous profiles. Smaller quantiles require the test cell to be similar to the majority of profiles for a label. The use of a quantile ensures that the score adjusts to the number of reference profiles per label; otherwise, just using the "top X correlations" would implicitly favor labels with more reference profiles.

The choice of Spearman's correlation provides some robustness against batch effects when comparing reference and test datasets. Only the relative expression within each cell needs to be comparable, not their relative expression across cells. As a result, it does not matter whether raw counts are supplied or log-transformed expression values, as the latter is a monotonic transformation of the latter (within each cell). The algorithm is also robust to differences in technologies between reference and test profiles, though it is preferable to have like-for-like comparisons.

See also: Aran D et al. (2019). Reference-based analysis of lung single-cell sequencing reveals a transitional profibrotic macrophage. Nat. Immunol. 20, 163-172

Template Parameters

Value_	Numeric type for the matrix values.
Index_	Integer type for the row/column indices.
Float_	Floating-point type for the correlations and scores.
Label_	Integer type for the reference labels.

Parameters

	test	Expression matrix of the test dataset, where rows are genes and columns are cells. This should have the same order and identity of genes as the reference matrix used to create `trained`.
	trained	Classifier returned by `train_single()`.
[out]	buffers	Buffers in which to store the classification output. Each non-`NULL` pointer should refer to an array of length equal to the number of columns in `test`.
	options	Further options.

◆ classify_single() [2/2]

template<typename Label_ = DefaultLabel, typename Value_ , typename Index_ , typename Float_ >

ClassifySingleResults< Label_, Float_ > singlepp::classify_single	(	const tatami::Matrix< Value_, Index_ > &	test,
		const TrainedSingle< Index_, Float_ > &	trained,
		const ClassifySingleOptions< Float_ > &	options )

Overload of classify_single() that allocates space for the output statistics.

Template Parameters

Label_	Integer type for the reference labels.
Value_	Numeric type for the matrix values.
Index_	Integer type for the row/column indices.
Float_	Floating-point type for the correlations and scores.

Parameters

test	Expression matrix of the test dataset, where rows are genes and columns are cells. This should have the same order and identity of genes as the reference matrix used to create `trained`.
trained	Classifier returned by `train_single()`.
options	Further options.

Returns: Results of the classification for each cell in the test dataset.

◆ classify_single_intersect() [1/2]

template<typename Value_ , typename Index_ , typename Float_ , typename Label_ >

void singlepp::classify_single_intersect	(	const tatami::Matrix< Value_, Index_ > &	test,
		const TrainedSingleIntersect< Index_, Float_ > &	trained,
		const ClassifySingleBuffers< Label_, Float_ > &	buffers,
		const ClassifySingleOptions< Float_ > &	options )

Variant of classify_single() that should be used when the test and reference datasets do not have the same number/order of genes. This is done by limiting the classification to the intersection of genes between the two datasets.

Template Parameters

Value_	Numeric type for the matrix values.
Index_	Integer type for the row/column indices.
Label_	Integer type for the reference labels.
Float_	Floating-point type for the correlations and scores.

Parameters

	test	Expression matrix of the test dataset, where rows are genes and columns are cells. This should have the same order and identity of genes as the `test_nrow` and `test_id` used to create `trained`.
	trained	Classifier returned by `train_single_intersect()`.
[out]	buffers	Buffers in which to store the classification output. Each non-`NULL` pointer should refer to an array of length equal to the number of columns in `test`.
	options	Further options.

◆ classify_single_intersect() [2/2]

template<typename Label_ = DefaultLabel, typename Value_ , typename Index_ , typename Float_ >

ClassifySingleResults< Label_, Float_ > singlepp::classify_single_intersect	(	const tatami::Matrix< Value_, Index_ > &	test,
		const TrainedSingleIntersect< Index_, Float_ > &	trained,
		const ClassifySingleOptions< Float_ > &	options )

Overload of classify_single_intersect() that allocates space for the output statistics.

Template Parameters

Label_	Integer type for the reference labels.
Value_	Numeric type for the matrix values.
Index_	Integer type for the row/column indices.
Float_	Floating-point type for the correlations and scores.

Parameters

test	Expression matrix of the test dataset, where rows are genes and columns are cells. This should have the same order and identity of genes as the `test_nrow` and `test_ids` used to create `trained`.
trained	Classifier returned by `train_single_intersect()`.
options	Further options.

Returns: Results of the classification for each cell in the test dataset.

◆ intersect_genes()

template<typename Index_ , typename Id_ >

Intersection< Index_ > singlepp::intersect_genes	(	Index_	test_nrow,
		const Id_ *	test_id,
		Index_	ref_nrow,
		const Id_ *	ref_id )

Compute the intersection of genes in the test and reference datasets.

Template Parameters

Index_	Integer type for the row indices of genes in each dataset. Also used as the type for the number of genes.
Id_	Type of the gene identifier, typically an integer or string.

Parameters

	test_nrow	Number of genes (i.e., rows) in the test dataset.
[in]	test_id	Pointer to an array of length `test_nrow`, containing the gene identifiers for each row in the test dataset.
	ref_nrow	Number of genes (i.e., rows) in the reference dataset.
[in]	ref_id	Pointer to an array of length `ref_nrow`, containing the gene identifiers for each row in the reference dataset.

Returns: Intersection of genes between the two datasets. The first entry of each pair contains the row index in the test dataset while the second entry contains the row index in the reference. If duplicated identifiers are present in either of test_id or ref_id, only the first occurrence is considered in the intersection.

◆ number_of_classic_markers()

std::size_t singlepp::number_of_classic_markers ( std::size_t num_labels )

inline

Choose the number of markers in choose_classic_markers(). The exact expression is defined as \(500 (\frac{2}{3})^{\log_2{L}}\) for \(L\) labels, which steadily decreases the markers per comparison as the number of labels increases. This aims to avoid an excessive number of features when dealing with references with many labels.

Parameters

num_labels Number of labels in the reference(s).

Returns: An appropriate number of markers for each pairwise comparison.

◆ prepare_integrated_input()

template<typename Value_ , typename Index_ , typename Label_ , typename Float_ >

TrainIntegratedInput< Value_, Index_, Label_ > singlepp::prepare_integrated_input	(	const tatami::Matrix< Value_, Index_ > &	ref,
		const Label_ *	labels,
		const TrainedSingle< Index_, Float_ > &	trained )

Prepare a reference dataset for train_integrated(). This overload assumes that the reference and test datasets have the same genes. ref and labels are expected to remain valid until train_integrated() is called.

Template Parameters

Value_	Numeric type for the matrix values.
Index_	Integer type for the row/column indices of the matrix.
Label_	Integer type for the reference labels.
Float_	Floating-point type for the correlations and scores.

Parameters

	ref	Matrix containing the reference expression values, where rows are genes and columns are reference profiles. The number and identity of genes should be identical to the test dataset to be classified in `classify_integrated()`.
[in]	labels	Pointer to an array of label assignments. Values should be integers in \([0, L)\) where \(L\) is the number of unique labels.
	trained	Classifier created by calling `train_single()` on `ref` and `labels`.

Returns: An opaque input object for train_integrated().

◆ prepare_integrated_input_intersect() [1/2]

template<typename Index_ , typename Id_ , typename Value_ , typename Label_ , typename Float_ >

TrainIntegratedInput< Value_, Index_, Label_ > singlepp::prepare_integrated_input_intersect	(	Index_	test_nrow,
		const Id_ *	test_id,
		const tatami::Matrix< Value_, Index_ > &	ref,
		const Id_ *	ref_id,
		const Label_ *	labels,
		const TrainedSingleIntersect< Index_, Float_ > &	trained )

Prepare a reference dataset for train_integrated(). This overload automatically identifies the intersection of genes between the test and reference datasets. ref and labels are expected to remain valid until train_integrated() is called.

Template Parameters

Index_	Integer type for the row/column indices of the matrix.
Id_	Type of the gene identifier.
Value_	Numeric type for the matrix values.
Label_	Integer type for the reference labels.
Float_	Floating-point type for the correlations and scores.

Parameters

	test_nrow	Number of rows (i.e., genes) in the test dataset.
[in]	test_id	Pointer to an array of length equal to `test_nrow`. This should contain a unique identifier for each row of `mat`, typically a gene name or index. If any duplicate IDs are present, only the first occurrence is used.
	ref	An expression matrix for the reference expression profiles, where rows are genes and columns are cells. This should have non-zero columns.
[in]	ref_id	Pointer to an array equal to the number of rows of `ref`. This should contain a unique identifier for each row in `ref`, comparable to those in `test_id`. If any duplicate IDs are present, only the first occurrence is used.
[in]	labels	An array of length equal to the number of columns of `ref`, containing the label for each sample. Values should be integers in \([0, L)\) where \(L\) is the number of unique labels.
	trained	Classifier created by calling `train_single_intersect()` on `test_nrow`, `test_id`, `ref`, `ref_id` and `labels`.

Returns: An opaque input object for train_integrated().

◆ prepare_integrated_input_intersect() [2/2]

template<typename Index_ , typename Value_ , typename Label_ , typename Float_ >

TrainIntegratedInput< Value_, Index_, Label_ > singlepp::prepare_integrated_input_intersect	(	Index_	test_nrow,
		const Intersection< Index_ > &	intersection,
		const tatami::Matrix< Value_, Index_ > &	ref,
		const Label_ *	labels,
		const TrainedSingleIntersect< Index_, Float_ > &	trained )

Prepare a reference dataset for train_integrated(). This overload requires an existing intersection between the test and reference datasets. intersection, ref and labels are expected to remain valid until train_integrated() is called.

Template Parameters

Index_	Integer type for the row/column indices of the matrix.
Value_	Numeric type for the matrix values.
Label_	Integer type for the reference labels.
Float_	Floating-point type for the correlations and scores.

Parameters

	test_nrow	Number of features in the test dataset.
	intersection	Vector defining the intersection of genes between the test and reference datasets. Each pair corresponds to a gene where the first and second elements represent the row indices of that gene in the test and reference matrices, respectively. The first element of each pair should be non-negative and less than `test_nrow`, while the second element should be non-negative and less than `ref->nrow()`. See `intersect_genes()` for more details.
	ref	Matrix containing the reference expression values, where rows are genes and columns are reference profiles. The number and identity of genes should be consistent with `intersection`.
[in]	labels	An array of length equal to the number of columns of `ref`, containing the label for each sample. Values should be integers in \([0, L)\) where \(L\) is the number of unique labels.
	trained	Classifier created by calling `train_single_intersect()` on `test_nrow`, `intersection`, `ref` and `labels`.

Returns: An opaque input object for train_integrated().

◆ train_integrated() [1/2]

template<typename Value_ , typename Index_ , typename Label_ >

TrainedIntegrated< Index_ > singlepp::train_integrated	(	const std::vector< TrainIntegratedInput< Value_, Index_, Label_ > > &	inputs,
		const TrainIntegratedOptions &	options )

Template Parameters

Value_	Numeric type for the matrix values.
Index_	Integer type for the row/column indices of the matrix.
Label_	Integer type for the reference labels.

Parameters

inputs	Vector of references, typically constructed with `prepare_integrated_input()` or `prepare_integrated_input_intersect()`.
options	Further options.

Returns: A pre-built classifier that integrates multiple references, for use in classify_integrated().

◆ train_integrated() [2/2]

template<typename Value_ , typename Index_ , typename Label_ >

TrainedIntegrated< Index_ > singlepp::train_integrated	(	std::vector< TrainIntegratedInput< Value_, Index_, Label_ > > &&	inputs,
		const TrainIntegratedOptions &	options )

Template Parameters

Value_	Numeric type for the matrix values.
Index_	Integer type for the row/column indices of the matrix.
Label_	Integer type for the reference labels.

Parameters

inputs	Vector of references, typically constructed with `prepare_integrated_input()` or `prepare_integrated_input_intersect()`.
options	Further options.

Returns: A pre-built classifier that integrates multiple references, for use in classify_integrated().

◆ train_single()

template<typename Value_ , typename Index_ , typename Label_ , typename Float_ , class Matrix_ >

TrainedSingle< Index_, Float_ > singlepp::train_single	(	const tatami::Matrix< Value_, Index_ > &	ref,
		const Label_ *	labels,
		Markers< Index_ >	markers,
		const TrainSingleOptions< Index_, Float_, Matrix_ > &	options )

Prepare a single labelled reference dataset for use in classify_single(). This involves pre-ranking the markers based on their expression in each reference profile, so that the Spearman correlations can be computed without repeated sorting. We also construct neighbor search indices for rapid calculation of the classification score.

The classifier returned by this function should only be used in classify_single() with a test dataset that has the same genes as the reference dataset. If the test dataset has different genes, consider using train_single_intersect() instead.

Template Parameters

Value_	Numeric type for the matrix values.
Index_	Integer type for the row/column indices of the matrix.
Label_	Integer type for the reference labels.
Float_	Floating-point type for the correlations and scores.
Matrix_	Class of the input data for the neighbor search. This should satisfy the `knncolle::Matrix` interface.

Parameters

	ref	Matrix for the reference expression profiles. Rows are genes while columns are profiles.
[in]	labels	An array of length equal to the number of columns of `ref`, containing the label for each reference profile. Labels should be integers in \([0, L)\) where \(L\) is the total number of unique labels.
	markers	A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `singlepp::Markers` for more details.
	options	Further options.

Returns: A pre-built classifier that can be used in classify_single() with a test dataset.

◆ train_single_intersect() [1/2]

template<typename Index_ , typename Id_ , typename Value_ , typename Label_ , typename Float_ , class Matrix_ >

TrainedSingleIntersect< Index_, Float_ > singlepp::train_single_intersect	(	Index_	test_nrow,
		const Id_ *	test_id,
		const tatami::Matrix< Value_, Index_ > &	ref,
		const Id_ *	ref_id,
		const Label_ *	labels,
		Markers< Index_ >	markers,
		const TrainSingleOptions< Index_, Float_, Matrix_ > &	options )

Variant of train_single() that uses the intersection of genes between the reference dataset and a (future) test dataset. This is useful when the genes are not in the same order and number across the test and reference datasets.

The classifier returned by this function should only be used in classify_single_intersect() with a test dataset that has test_nrow rows with the same order and identity of genes as in test_id.

Template Parameters

Index_	Integer type for the row/column indices of the matrix.
Id_	Type of the gene identifier for each row, typically integer or string.
Value_	Numeric type for the matrix values.
Label_	Integer type for the reference labels.
Float_	Floating-point type for the correlations and scores.
Matrix_	Class of the input data for the neighbor search. This should satisfy the `knncolle::Matrix` interface.

Parameters

	test_nrow	Number of rows (genes) in the test dataset.
[in]	test_id	Pointer to an array of length equal to `test_nrow`, containing a gene identifier for each row of the test dataset. If any duplicate IDs are present, only the first occurrence is used.
	ref	An expression matrix for the reference expression profiles, where rows are genes and columns are cells. This should have non-zero columns.
[in]	ref_id	Pointer to an array of length equal to the number of rows of `ref`, containing a gene identifier for each row of the reference dataset. Identifiers should be comparable to those in `test_id`. If any duplicate IDs are present, only the first occurrence is used.
[in]	labels	An array of length equal to the number of columns of `ref`, containing the label for each reference profile. Labels should be integers in \([0, L)\) where \(L\) is the total number of unique labels.
	markers	A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `singlepp::Markers` for more details.
	options	Further options.

Returns: A pre-built classifier that can be used in classify_single_intersect().

◆ train_single_intersect() [2/2]

template<typename Index_ , typename Value_ , typename Label_ , typename Float_ >

TrainedSingleIntersect< Index_, Float_ > singlepp::train_single_intersect	(	Index_	test_nrow,
		const Intersection< Index_ > &	intersection,
		const tatami::Matrix< Value_, Index_ > &	ref,
		const Label_ *	labels,
		Markers< Index_ >	markers,
		const TrainSingleOptions< Index_, Float_ > &	options )

Variant of train_single() that uses a pre-computed intersection of genes between the reference dataset and an as-yet-unspecified test dataset. Most users will prefer to use the other train_single_intersect() overload that accepts test_id and ref_id and computes the intersection automatically.

The classifier returned by this function should only be used in classify_single_intersect() with a test dataset that is compatible with the mappings in intersection. That is, the gene in the intersection[i].first-th row of the test dataset should correspond to the intersection[i].second-th row of the reference dataset.

Template Parameters

Index_	Integer type for the row/column indices of the matrix.
Value_	Numeric type for the matrix values.
Label_	Integer type for the reference labels.
Float_	Floating-point type for the correlations and scores.
Matrix_	Class of the input data for the neighbor search. This should satisfy the `knncolle::Matrix` interface.

Parameters

	test_nrow	Number of features in the test dataset.
	intersection	Vector defining the intersection of genes between the test and reference datasets. Each pair corresponds to a gene where the first and second elements represent the row indices of that gene in the test and reference matrices, respectively. The first element of each pair should be non-negative and less than `test_nrow`, while the second element should be non-negative and less than `ref->nrow()`. See `intersect_genes()` for more details.
	ref	An expression matrix for the reference expression profiles, where rows are genes and columns are cells. This should have non-zero columns.
[in]	labels	An array of length equal to the number of columns of `ref`, containing the label for each reference profile. Labels should be integers in \([0, L)\) where \(L\) is the total number of unique labels.
	markers	A vector of vectors of ranked marker genes for each pairwise comparison between labels, see `singlepp::Markers` for more details.
	options	Further options.

Returns: A pre-built classifier that can be used in classify_single_intersect().

Classes

Typedefs

Functions

Detailed Description

Typedef Documentation

◆ DefaultFloat

◆ DefaultIndex

◆ DefaultLabel

◆ DefaultRefLabel

◆ DefaultValue

◆ Intersection

◆ Markers

Function Documentation

◆ choose_classic_markers() [1/2]

◆ choose_classic_markers() [2/2]

◆ classify_integrated() [1/2]

◆ classify_integrated() [2/2]

◆ classify_single() [1/2]

◆ classify_single() [2/2]

◆ classify_single_intersect() [1/2]

◆ classify_single_intersect() [2/2]

◆ intersect_genes()

◆ number_of_classic_markers()

◆ prepare_integrated_input()

◆ prepare_integrated_input_intersect() [1/2]

◆ prepare_integrated_input_intersect() [2/2]

◆ train_integrated() [1/2]

◆ train_integrated() [2/2]

◆ train_single()

◆ train_single_intersect() [1/2]

◆ train_single_intersect() [2/2]