Source code for singler.annotate_single

import warnings
from typing import Any, Optional, Sequence, Union

import biocframe
import summarizedexperiment

from .train_single import train_single 
from .classify_single import classify_single
from ._utils import _clean_matrix, _restrict_features



[docs]
def annotate_single(
    test_data: Any,
    ref_data: Any,
    ref_labels: Sequence,
    test_features: Optional[Sequence] = None,
    ref_features: Optional[Sequence] = None,
    test_assay_type: Union[str, int] = 0,
    ref_assay_type: Union[str, int] = 0,
    test_check_missing: bool = False,
    ref_check_missing: bool = True,
    train_args: Optional[dict] = None,
    classify_args: Optional[dict] = None,
    num_threads: int = 1,
) -> biocframe.BiocFrame:
    """Annotate a single-cell expression dataset based on the correlation
    of each cell to profiles in a labelled reference.

    Args:
        test_data:
            A matrix-like object representing the test dataset, where rows are
            features and columns are samples (usually cells). Entries should be expression
            values; only the ranking within each column will be used.

            Alternatively, a
            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
            containing such a matrix in one of its assays. Non-default assay
            types can be specified in ``test_assay_type``.

        ref_data:
            A matrix-like object representing the reference dataset, where rows
            are features and columns are samples. Entries should be expression values,
            usually log-transformed (see comments for the ``ref`` argument in
            :py:func:`~singler.train_single.train_single`).

            Alternatively, a
            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
            containing such a matrix in one of its assays. Non-default assay
            types can be specified in ``ref_assay_type``.

        ref_labels:
            Sequence of length equal to the number of columns of ``ref_data``,
            containing the label associated with each column.

            Passing a string is deprecated; in that case it is treated as the
            name of a column in the column data of ``ref_data`` and a
            :py:class:`DeprecationWarning` is emitted.

        test_features:
            Sequence of length equal to the number of rows in ``test_data``,
            containing the feature identifier for each row. Alternatively
            ``None``, to use the row names of the experiment as features.

        ref_features:
            Sequence of length equal to the number of rows of ``ref_data``,
            containing the feature identifier for each row. Alternatively
            ``None``, to use the row names of the experiment as features.

        test_assay_type:
            Assay containing the expression matrix, if ``test_data`` is a
            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.

        ref_assay_type:
            Assay containing the expression matrix, if ``ref_data`` is a
            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.

        test_check_missing:
            Whether to remove rows with missing values from the test dataset.

        ref_check_missing:
            Whether to remove rows with missing values from the reference dataset.

        train_args:
            Further arguments to pass to
            :py:func:`~singler.train_single.train_single`, or ``None`` for no
            further arguments. Must not contain keys that this function
            already supplies (``ref_data``, ``ref_labels``, ``ref_features``,
            ``test_features``, ``check_missing``, ``num_threads``), as the
            duplicate keyword would raise a :py:class:`TypeError`.

        classify_args:
            Further arguments to pass to
            :py:func:`~singler.classify_single.classify_single`, or ``None``
            for no further arguments. Must not contain ``ref_prebuilt`` or
            ``num_threads``, which this function already supplies.

        num_threads:
            Number of threads to use for the various steps.

    Returns:
        A :py:class:`~biocframe.BiocFrame.BiocFrame` of labelling results, see
        :py:func:`~singler.classify_single.classify_single` for details.
    """
    # Use None sentinels rather than mutable ``{}`` defaults so that no dict
    # object is shared between calls.
    if train_args is None:
        train_args = {}
    if classify_args is None:
        classify_args = {}

    if isinstance(ref_labels, str):
        # stacklevel=2 attributes the warning to the caller's line, which is
        # where the deprecated usage actually needs to be fixed.
        warnings.warn(
            "setting 'ref_labels' to a column name of the column data is deprecated",
            category=DeprecationWarning,
            stacklevel=2,
        )
        ref_labels = ref_data.get_column_data().column(ref_labels)

    test_data, test_features = _clean_matrix(
        test_data,
        test_features,
        assay_type=test_assay_type,
        check_missing=test_check_missing,
        num_threads=num_threads,
    )

    ref_data, ref_features = _clean_matrix(
        ref_data,
        ref_features,
        assay_type=ref_assay_type,
        check_missing=ref_check_missing,
        num_threads=num_threads,
    )

    # Pre-slicing the test dataset for consistency with annotate_integrated.
    test_data, test_features = _restrict_features(
        test_data,
        test_features,
        ref_features,
    )

    # The reference was already passed through _clean_matrix above with
    # ``ref_check_missing``, so the check is not repeated during training.
    built = train_single(
        ref_data=ref_data,
        ref_labels=ref_labels,
        ref_features=ref_features,
        test_features=test_features,
        check_missing=False,
        num_threads=num_threads,
        **train_args,
    )

    return classify_single(
        test_data,
        ref_prebuilt=built,
        **classify_args,
        num_threads=num_threads,
    )