# Source code for singler._annotate_single

import warnings
from typing import Any, Optional, Sequence, Union

import biocframe
import summarizedexperiment

from ._train_single import train_single 
from ._classify_single import classify_single
from ._utils import _clean_matrix, _restrict_features



# [docs]
def annotate_single(
    test_data: Any,
    ref_data: Any,
    ref_labels: Sequence,
    test_features: Optional[Sequence] = None,
    ref_features: Optional[Sequence] = None,
    test_assay_type: Union[str, int] = 0,
    ref_assay_type: Union[str, int] = 0,
    test_check_missing: bool = False,
    ref_check_missing: bool = True,
    train_args: Optional[dict] = None,
    classify_args: Optional[dict] = None,
    num_threads: int = 1,
) -> biocframe.BiocFrame:
    """
    Annotate a single-cell expression dataset based on the correlation of each cell to profiles in a labelled reference.

    This is a convenience wrapper that cleans both datasets, restricts the test dataset to the features of the reference,
    trains a classifier with :py:func:`~singler.train_single` and applies it with :py:func:`~singler.classify_single`.

    Args:
        test_data:
            A matrix-like object representing the test dataset, where rows are features and columns are samples (usually cells).
            Entries can be expression values of any kind; only the ranking within each column will be used.

            Alternatively, a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` containing such a matrix in one of its assays.

        ref_data:
            A matrix-like object representing the reference dataset, where rows are features and columns are samples.
            Entries should be expression values, usually log-transformed (see comments for the ``ref`` argument in :py:func:`~singler.train_single`).

            Alternatively, a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` containing such a matrix in one of its assays.

        ref_labels:
            Sequence of length equal to the number of columns of ``ref_data``, containing the label associated with each column.

            Passing a string is deprecated; if given, it is treated as the name of a column in the column data of ``ref_data``,
            and a :py:class:`DeprecationWarning` is emitted.

        test_features:
            Sequence of length equal to the number of rows in ``test_data``, containing the feature identifier for each row.
            Alternatively ``None``, to use the row names of the experiment as features.

        ref_features:
            Sequence of length equal to the number of rows of ``ref_data``, containing the feature identifier for each row.
            Alternatively ``None``, to use the row names of the experiment as features.

        test_assay_type:
            Assay containing the expression matrix, if ``test_data`` is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.

        ref_assay_type:
            Assay containing the expression matrix, if ``ref_data`` is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.

        test_check_missing:
            Whether to remove rows with missing values from the test dataset.

        ref_check_missing:
            Whether to remove rows with missing values from the reference dataset.

        train_args:
            Further arguments to pass to :py:func:`~singler.train_single`.
            ``None`` is equivalent to an empty dictionary.
            This should not contain ``ref_data``, ``ref_labels``, ``ref_features``, ``test_features``, ``check_missing`` or ``num_threads``,
            as these are set explicitly by this function.

        classify_args:
            Further arguments to pass to :py:func:`~singler.classify_single`.
            ``None`` is equivalent to an empty dictionary.
            This should not contain ``ref_prebuilt`` or ``num_threads``, as these are set explicitly by this function.

        num_threads:
            Number of threads to use for the various steps.

    Returns:
        A :py:class:`~biocframe.BiocFrame.BiocFrame` of labelling results, see :py:func:`~singler.classify_single` for details.

    Examples:
        >>> # Mocking up data with log-normalized expression values:
        >>> import singler
        >>> ref = singler.mock_reference_data()
        >>> test = singler.mock_test_data(ref)
        >>> 
        >>> import scranpy
        >>> ref = scranpy.normalize_rna_counts_se(ref)
        >>> test = scranpy.normalize_rna_counts_se(test)
        >>> 
        >>> # Running the classification:
        >>> pred = singler.annotate_single(test, ref, ref_labels=ref.get_column_data()["label"])
        >>> print(pred)
        >>> import collections
        >>> print(collections.Counter(zip(pred["best"], test.get_column_data()["label"])))
    """

    # Avoid mutable default arguments; None stands in for "no extra arguments".
    if train_args is None:
        train_args = {}
    if classify_args is None:
        classify_args = {}

    # Deprecated convenience: a string is interpreted as a column name in the
    # reference's column data. This must happen before 'ref_data' is replaced
    # by the cleaned matrix below.
    if isinstance(ref_labels, str):
        warnings.warn(
            "setting 'ref_labels' to a column name of the column data is deprecated",
            category=DeprecationWarning
        )
        ref_labels = ref_data.get_column_data().column(ref_labels)

    test_data, test_features = _clean_matrix(
        test_data,
        test_features,
        assay_type=test_assay_type,
        check_missing=test_check_missing,
        num_threads=num_threads
    )

    ref_data, ref_features = _clean_matrix(
        ref_data,
        ref_features,
        assay_type=ref_assay_type,
        check_missing=ref_check_missing,
        num_threads=num_threads
    )

    # Pre-slicing the test dataset for consistency with annotate_integrated.
    test_data, test_features = _restrict_features(
        test_data,
        test_features,
        ref_features
    )

    # Missing-value handling for the reference was already delegated to
    # _clean_matrix() above, so it is not requested again during training.
    built = train_single(
        ref_data=ref_data,
        ref_labels=ref_labels,
        ref_features=ref_features,
        test_features=test_features,
        check_missing=False,
        num_threads=num_threads,
        **train_args,
    )

    return classify_single(
        test_data,
        ref_prebuilt=built,
        **classify_args,
        num_threads=num_threads,
    )