Source code for singler._train_integrated

from typing import Sequence, Optional, Union

import numpy 
import biocutils
import warnings
import mattress

from ._train_single import TrainedSingleReference
from . import _lib_singler as lib
from ._utils import _stable_union, _stable_intersect, _to_NamedList


class TrainedIntegratedReferences:
    """
    Container for a set of integrated references, normally produced by
    :py:meth:`~singler.train_integrated` rather than constructed directly.
    Aimed at advanced users only; instances should not be serialized.
    """

    def __init__(self, ptr: int, ref_labels: list, ref_names: Optional[biocutils.Names]):
        """
        Arguments:
            ptr:
                Integer handle for the integrated references, stored as-is.
                (Presumably the address of a native object - confirm against
                the extension module.)

            ref_labels:
                List with one entry per reference, each holding the label
                names of that reference.

            ref_names:
                Names of the references, or ``None`` if they are unnamed.
        """
        # All three values are kept by reference; nothing is copied or validated.
        self._ptr, self._labels, self._names = ptr, ref_labels, ref_names

    @property
    def reference_names(self) -> Optional[biocutils.Names]:
        """
        Names of the references, or ``None`` if they were unnamed.
        """
        return self._names

    @property
    def reference_labels(self) -> list:
        """
        List of lists containing the names of the labels for each reference.
        """
        return self._labels
def train_integrated(
    test_features: Sequence,
    ref_prebuilt: Union[dict, Sequence, biocutils.NamedList],
    warn_lost: bool = True,
    num_threads: int = 1,
) -> TrainedIntegratedReferences:
    """
    Build a set of integrated references for classification of a test dataset.

    Arguments:
        test_features:
            Sequence of features for the test dataset.

        ref_prebuilt:
            List of prebuilt references, typically created by calling
            :py:meth:`~singler.train_single`. A dictionary or
            :py:class:`~biocutils.NamedList` may also be supplied, in which
            case the names are reported by
            :py:attr:`~TrainedIntegratedReferences.reference_names`.

        warn_lost:
            Whether to emit a warning if the markers for each reference are
            not all present in all references. At most one warning is emitted
            per call.

        num_threads:
            Number of threads.

    Returns:
        An integrated reference object, for classification with
        :py:meth:`~singler.classify_integrated`.

    Examples:
        >>> # Mocking up data.
        >>> import singler
        >>> ref = singler.mock_reference_data(num_replicates=8)
        >>> ref1 = ref[:,[True, False] * int(ref.shape[1]/2)]
        >>> ref2 = ref[:,[False, True] * int(ref.shape[1]/2)]
        >>>
        >>> cd2 = ref2.get_column_data()
        >>> label2 = [l.lower() for l in cd2["label"]] # converting to lower case for some variety.
        >>> cd2.set_column("label", label2, in_place=True)
        >>> ref2.set_column_data(cd2, in_place=True)
        >>>
        >>> import scranpy
        >>> ref1 = scranpy.normalize_rna_counts_se(ref1)
        >>> ref2 = scranpy.normalize_rna_counts_se(ref2)
        >>>
        >>> # Building a classifier for each reference.
        >>> test = singler.mock_test_data(ref)
        >>> built1 = singler.train_single(ref1, ref1.get_column_data()["label"], ref1.get_row_names())
        >>> built2 = singler.train_single(ref2, ref2.get_column_data()["label"], ref2.get_row_names())
        >>>
        >>> # Creating an integrated classifier across references.
        >>> in_built = singler.train_integrated(test.get_row_names(), {"first": built1, "second": built2})
        >>> in_built.reference_labels
        >>> in_built.reference_names
    """
    ref_prebuilt = _to_NamedList(ref_prebuilt)

    # Checking the genes: every marker of every reference should be a feature
    # of all references. One warning is enough, so stop at the first reference
    # with a missing marker instead of warning once per missing gene.
    if warn_lost:
        all_refnames = [x.features for x in ref_prebuilt]
        intersected = set(_stable_intersect(*all_refnames))
        for trained in ref_prebuilt:
            if any(g not in intersected for g in trained.marker_subset()):
                warnings.warn("not all markers in 'ref_prebuilt' are available in each reference")
                break

    # For each reference, match the features it shares with the test dataset
    # back to both feature lists. (biocutils.match presumably yields the
    # position of each shared feature in its second argument - confirm.)
    all_inter_test = []
    all_inter_ref = []
    for trained in ref_prebuilt:
        common = _stable_intersect(test_features, trained.features)
        all_inter_test.append(biocutils.match(common, test_features, dtype=numpy.uint32))
        all_inter_ref.append(biocutils.match(common, trained.features, dtype=numpy.uint32))

    # Held in a local list so the initialized objects stay referenced for the
    # duration of the native call below, which only receives their `.ptr`.
    all_data = [mattress.initialize(x._full_data) for x in ref_prebuilt]

    # Applying the integration.
    ibuilt = lib.train_integrated(
        all_inter_test,
        [x.ptr for x in all_data],
        all_inter_ref,
        [x._full_label_codes for x in ref_prebuilt],
        [x._ptr for x in ref_prebuilt],
        num_threads,
    )

    return TrainedIntegratedReferences(
        ptr=ibuilt,
        ref_labels=[x.labels for x in ref_prebuilt],
        ref_names=ref_prebuilt.get_names(),
    )