Source code for singler._classify_integrated

from typing import Any, Sequence, Union

import biocutils
import biocframe 
import mattress
import summarizedexperiment
import numpy
import warnings

from . import _lib_singler as lib
from ._train_integrated import TrainedIntegratedReferences
from ._utils import _to_NamedList



[docs]
def classify_integrated(
    test_data: Any,
    results: Union[dict, Sequence, biocutils.NamedList],
    integrated_prebuilt: TrainedIntegratedReferences,
    assay_type: Union[str, int] = 0,
    quantile: float = 0.8,
    use_fine_tune: bool = True,
    fine_tune_threshold: float = 0.05,
    num_threads: int = 1,
) -> biocframe.BiocFrame:
    """Integrate classification results across multiple references for a single test dataset.

    Args:
        test_data:
            A matrix-like object where each row is a feature and each column is a test sample (usually a cell), containing expression values.
            Normalized and/or transformed expression values are also acceptable as only the ranking is used within this function.

            Alternatively, a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment` containing such a matrix in one of its assays.

        results:
            Sequence of :py:class:`~biocframe.BiocFrame.BiocFrame` objects.
            Each ``BiocFrame`` should contain classification results generated by running :py:func:`~singler.classify_single` on ``test_data`` with one of the references.
            In particular, each ``BiocFrame`` must have a ``best`` column with one assigned label per test sample.
            References should be in the same order as that used to construct ``integrated_prebuilt``.

            Alternatively, a dictionary where each value is a ``BiocFrame`` and each key is the name of a reference.
            Names should be the same as those used to construct ``integrated_prebuilt``.

            Alternatively, a :py:class:`~biocutils.NamedList.NamedList` where each entry is a ``BiocFrame``.
            Names should be the same as those used to construct ``integrated_prebuilt``.

        integrated_prebuilt:
            Integrated reference object, constructed with :py:func:`~singler.train_integrated`.

        assay_type:
            Assay containing the expression matrix, if ``test_data`` is a :py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.

        quantile:
            Quantile of the correlation distribution for computing the score for each label.
            Larger values increase sensitivity of matches at the expense of similarity to the average behavior of each label.

        use_fine_tune:
            Whether fine-tuning should be performed.
            This improves accuracy for distinguishing between references with similar best labels but requires more computational work.

        fine_tune_threshold:
            Maximum difference from the maximum correlation to use in fine-tuning.
            All references above this threshold are used for another round of fine-tuning.

        num_threads:
            Number of threads to use during classification.

    Returns:
        A :py:class:`~biocframe.BiocFrame.BiocFrame` with one row per column in ``test_data``.
        It contains the following columns:

        - ``best_label``: list containing the assigned label in the best reference for each test sample.
        - ``best_reference``: an integer NumPy array containing the index of the best reference for each test sample.
          Each index refers to a position in ``results`` or ``ref_prebuilt`` in :py:func:`~singler.train_integrated`.
        - ``scores``: a nested ``BiocFrame`` where each column corresponds to a reference.
          Columns are named after the position of the reference in ``results``, as a string (``"0"``, ``"1"``, ...).
          Each column is another nested ``BiocFrame`` with a ``label`` column, containing the ``best`` label from the corresponding entry of ``results``;
          and a ``score`` column, containing the score computed for that label during integration.
        - ``delta``: a double-precision NumPy array containing the difference in scores between the best and second-best reference.

    Raises:
        ValueError:
            If the length of ``results`` differs from the number of references in ``integrated_prebuilt``;
            if any entry of ``results`` does not have one row per column of ``test_data``;
            or if any ``best`` label in ``results`` is absent from the labels of the corresponding reference.

    Warns:
        UserWarning:
            If the names of ``results`` do not match the reference names stored in ``integrated_prebuilt``.

    Examples:
        >>> # Mocking up data.
        >>> import singler
        >>> ref = singler.mock_reference_data(num_replicates=8)
        >>> ref1 = ref[:,[True, False] * int(ref.shape[1]/2)]
        >>> ref2 = ref[:,[False, True] * int(ref.shape[1]/2)]
        >>>
        >>> cd2 = ref2.get_column_data()
        >>> label2 = [l.lower() for l in cd2["label"]] # converting to lower-case for some variety.
        >>> cd2.set_column("label", label2, in_place=True)
        >>> ref2.set_column_data(cd2, in_place=True)
        >>>
        >>> import scranpy
        >>> ref1 = scranpy.normalize_rna_counts_se(ref1)
        >>> ref2 = scranpy.normalize_rna_counts_se(ref2)
        >>>
        >>> # Performing classification within each reference.
        >>> test = singler.mock_test_data(ref)
        >>>
        >>> built1 = singler.train_single(ref1, ref1.get_column_data()["label"], ref1.get_row_names())
        >>> res1 = singler.classify_single(test, built1)
        >>> built2 = singler.train_single(ref2, ref2.get_column_data()["label"], ref2.get_row_names())
        >>> res2 = singler.classify_single(test, built2)
        >>>
        >>> # Combining results across references.
        >>> in_built = singler.train_integrated(test.get_row_names(), [built1, built2])
        >>> in_res = singler.classify_integrated(test, [res1, res2], in_built)
        >>> print(in_res)
    """

    if isinstance(test_data, summarizedexperiment.SummarizedExperiment):
        test_data = test_data.assay(assay_type)
    num_cells = test_data.shape[1]

    results = _to_NamedList(results)
    ref_labs = integrated_prebuilt.reference_labels
    if len(results) != len(ref_labs):
        raise ValueError("length of 'results' should equal the number of references")
    if results.get_names() != integrated_prebuilt.reference_names:
        # A mismatch is only a warning: references are matched up by position,
        # so the call can still proceed, but the caller has probably mixed
        # up the ordering or naming of the references.
        warnings.warn(
            "names of 'results' should be the same as the reference names in 'integrated_prebuilt'",
            stacklevel=2,
        )

    # For each reference, keep its per-cell best labels (reused for the output
    # below) and convert them to integer positions within that reference's
    # label set, which is the form passed to the compiled routine.
    best_by_ref = []
    collated = []
    for i, curres in enumerate(results):
        if num_cells != curres.shape[0]:
            raise ValueError("numbers of cells in 'results' are not identical")
        curbest = curres.column("best")
        available = set(ref_labs[i])
        # Check membership up front so that every label has a valid match
        # position before calling biocutils.match().
        if any(lab not in available for lab in curbest):
            raise ValueError("not all labels in 'results' are present in the corresponding reference")
        best_by_ref.append(curbest)
        collated.append(biocutils.match(curbest, ref_labs[i], dtype=numpy.uint32))

    test_ptr = mattress.initialize(test_data)
    best_ref, raw_scores, delta = lib.classify_integrated(
        test_ptr.ptr,
        collated,
        integrated_prebuilt._ptr,
        quantile,
        use_fine_tune,
        fine_tune_threshold,
        num_threads,
    )

    # The label reported for each cell is the one assigned by its winning reference.
    best_label = [None] * num_cells
    for cell, ref in enumerate(best_ref):
        best_label[cell] = best_by_ref[ref][cell]

    # One nested frame per reference, keyed by the reference's position as a string.
    all_refs = [str(i) for i in range(len(raw_scores))]
    scores = {
        name: biocframe.BiocFrame({"label": best_by_ref[i], "score": raw_scores[i]})
        for i, name in enumerate(all_refs)
    }
    scores_df = biocframe.BiocFrame(scores, number_of_rows=num_cells, column_names=all_refs)

    return biocframe.BiocFrame({
        "best_label": best_label,
        "best_reference": best_ref,
        "scores": scores_df,
        "delta": delta,
    })