Source code for singler.annotate_integrated

from typing import Any, Optional, Sequence, Tuple, Union

import biocframe
import warnings

from ._utils import _clean_matrix, _restrict_features
from .train_single import train_single
from .train_integrated import train_integrated
from .classify_single import classify_single
from .classify_integrated import classify_integrated



[docs]
def annotate_integrated(
    test_data: Any,
    ref_data: Sequence,
    ref_labels: list[Sequence],
    test_features: Optional[Sequence] = None,
    ref_features: Optional[list[Optional[Sequence]]] = None,
    test_assay_type: Union[str, int] = 0,
    ref_assay_type: Union[str, int] = "logcounts",
    test_check_missing: bool = False,
    ref_check_missing: bool = True,
    train_single_args: dict = {},
    classify_single_args: dict = {},
    train_integrated_args: dict = {},
    classify_integrated_args: dict = {},
    num_threads: int = 1,
) -> Tuple[list[biocframe.BiocFrame], biocframe.BiocFrame]:
    """Annotate a single-cell expression dataset based on the correlation
    of each cell to profiles in multiple labelled references, where the
    annotation from each reference is then integrated across references.

    Args:
        test_data:
            A matrix-like object representing the test dataset, where rows are
            features and columns are samples (usually cells). Entries should be
            expression values; only the ranking within each column is used.

            Alternatively, a
            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
            containing such a matrix in one of its assays.

        ref_data:
            Sequence consisting of one or more of the following:

            - A matrix-like object representing the reference dataset, where rows
              are features and columns are samples. Entries should be expression values,
              usually log-transformed (see comments for the ``ref_data`` argument in
              :py:func:`~singler.train_single.train_single`).
            - A ``SummarizedExperiment`` object containing such a matrix in its assays.

        ref_labels:
            Sequence of the same length as ``ref_data``. The ``i``-th entry
            should be a sequence of length equal to the number of columns of
            ``ref_data[i]``, containing the label associated with each column.

        test_features:
            Sequence of length equal to the number of rows in ``test_data``,
            containing the feature identifier for each row.  Alternatively
            ``None``, to use the row names of the experiment as features.

        ref_features:
            Sequence of the same length as ``ref_data``. The ``i``-th entry
            should be a sequence of length equal to the number of rows of
            ``ref_data[i]``, containing the feature identifier associated with
            each row. It can also be set to ``None`` to use the row names of
            the experiment as features.

            This can also be ``None`` to indicate that the row names should be
            used for all references, assuming ``ref_data`` only contains
            ``SummarizedExperiment`` objects.

        test_assay_type:
            Assay of ``test_data`` containing the expression matrix, if ``test_data`` is a
            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.

        test_check_missing:
            Whether to check for and remove missing (i.e., NaN) values from the test dataset.

        ref_assay_type:
            Assay containing the expression matrix for any entry of ``ref_data`` that is a
            :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.

        ref_check_missing:
            Whether to check for and remove missing (i.e., NaN) values from the reference datasets.

        train_single_args:
            Further arguments to pass to
            :py:func:`~singler.train_single.train_single`.

        classify_single_args:
            Further arguments to pass to
            :py:func:`~singler.classify_single.classify_single`.

        train_integrated_args:
            Further arguments to pass to
            :py:func:`~singler.train_integrated.train_integrated`.

        classify_integrated_args:
            Further arguments to pass to
            :py:func:`~singler.classify_integrated.classify_integrated`.

        num_threads:
            Number of threads to use for the various steps.

    Returns:
        Tuple where the first element contains per-reference results (i.e. a
        list of BiocFrame outputs, roughly equivalent to running
        :py:func:`~singler.annotate_single.annotate_single` on each reference)
        and the second element contains integrated results across references
        (i.e., a BiocFrame from
        :py:func:`~singler.classify_integrated.classify_integrated`).
    """
    nrefs = len(ref_data)
    if nrefs != len(ref_labels):
        raise ValueError("'ref_data' and 'ref_labels' must be the same length")
    if ref_features is None:
        ref_features = [None] * nrefs
    if nrefs != len(ref_features):
        raise ValueError("'ref_data' and 'ref_features' must be the same length")
    if nrefs != len(ref_labels):
        raise ValueError("'ref_data' and 'ref_labels' must be the same length")

    test_data, test_features = _clean_matrix(
        test_data,
        test_features,
        assay_type=test_assay_type,
        check_missing=test_check_missing,
        num_threads=num_threads,
    )

    all_ref_data = []
    all_ref_labels = []
    all_ref_features = []
    for r in range(nrefs):
        curref_labels = ref_labels[r]
        if isinstance(curref_labels, str):
            warnings.warn(
                "setting 'ref_labels' to a column name of the column data is deprecated",
                category=DeprecationWarning
            )
            curref_labels = ref_data[r].get_column_data().column(curref_labels)
        all_ref_labels.append(curref_labels)

        curref_data, curref_features = _clean_matrix(
            ref_data[r],
            ref_features[r],
            assay_type=ref_assay_type,
            check_missing=ref_check_missing,
            num_threads=num_threads,
        )

        all_ref_data.append(curref_data)
        all_ref_features.append(curref_features)

        # Pre-slicing the test dataset so that we force the downstream analysis
        # to only consider genes that are present in all datasets (test and
        # reference). This avoids the warning in classify_integrated().
        test_data, test_features = _restrict_features(
            test_data,
            test_features,
            curref_features
        )

    if test_data.shape[0] == 0:
        raise ValueError("no genes in common across test and reference datasets")

    all_built = []
    all_results = []

    for r in range(nrefs):
        curbuilt = train_single(
            ref_data=all_ref_data[r],
            ref_labels=all_ref_labels[r],
            ref_features=all_ref_features[r],
            test_features=test_features,
            **train_single_args,
            check_missing=False,
            num_threads=num_threads,
        )

        res = classify_single(
            test_data,
            ref_prebuilt=curbuilt,
            **classify_single_args,
            num_threads=num_threads,
        )

        all_built.append(curbuilt)
        all_results.append(res)

    ibuilt = train_integrated(
        test_features=test_features,
        ref_prebuilt=all_built,
        **train_integrated_args,
        num_threads=num_threads,
    )

    ires = classify_integrated(
        test_data=test_data,
        results=all_results,
        integrated_prebuilt=ibuilt,
        **classify_integrated_args,
        num_threads=num_threads,
    )

    return all_results, ires