Source code for singler._train_integrated
from typing import Sequence, Optional, Union
import numpy
import biocutils
import warnings
import mattress
from ._train_single import TrainedSingleReference
from . import _lib_singler as lib
from ._utils import _stable_union, _stable_intersect, _to_NamedList
[docs]
class TrainedIntegratedReferences:
"""
Integrated references, typically constructed by :py:meth:`~singler.train_integrated`.
This is intended for advanced users only and should not be serialized.
"""
def __init__(self, ptr: int, ref_labels: list, ref_names: Optional[biocutils.Names]):
self._ptr = ptr
self._labels = ref_labels
self._names = ref_names
@property
def reference_labels(self) -> list:
"""
List of lists containing the names of the labels for each reference.
"""
return self._labels
@property
def reference_names(self) -> Optional[biocutils.Names]:
"""
Names of the references, or ``None`` if they were unnamed.
"""
return self._names
[docs]
def train_integrated(
test_features: Sequence,
ref_prebuilt: Union[dict, Sequence, biocutils.NamedList],
warn_lost: bool = True,
num_threads: int = 1,
) -> TrainedIntegratedReferences:
"""
Build a set of integrated references for classification of a test dataset.
Arguments:
test_features:
Sequence of features for the test dataset.
ref_prebuilt:
List of prebuilt references, typically created by calling :py:meth:`~singler.train_single`.
warn_lost:
Whether to emit a warning if the markers for each reference are not all present in all references.
num_threads:
Number of threads.
Returns:
An integrated reference object, for classification with :py:meth:`~singler.classify_integrated`.
Examples:
>>> # Mocking up data.
>>> import singler
>>> ref = singler.mock_reference_data(num_replicates=8)
>>> ref1 = ref[:,[True, False] * int(ref.shape[1]/2)]
>>> ref2 = ref[:,[False, True] * int(ref.shape[1]/2)]
>>>
>>> cd2 = ref2.get_column_data()
>>> label2 = [l.lower() for l in cd2["label"]] # converting to lower case for some variety.
>>> cd2.set_column("label", label2, in_place=True)
>>> ref2.set_column_data(cd2, in_place=True)
>>>
>>> import scranpy
>>> ref1 = scranpy.normalize_rna_counts_se(ref1)
>>> ref2 = scranpy.normalize_rna_counts_se(ref2)
>>>
>>> # Building a classifier for each reference.
>>> test = singler.mock_test_data(ref)
>>> built1 = singler.train_single(ref1, ref1.get_column_data()["label"], ref1.get_row_names())
>>> built2 = singler.train_single(ref2, ref2.get_column_data()["label"], ref2.get_row_names())
>>>
>>> # Creating an integrated classifier across references.
>>> in_built = singler.train_integrated(test.get_row_names(), {"first": built1, "second": built2})
>>> in_built.reference_labels
>>> in_built.reference_names
"""
ref_prebuilt = _to_NamedList(ref_prebuilt)
# Checking the genes.
if warn_lost:
all_refnames = [x.features for x in ref_prebuilt]
intersected = set(_stable_intersect(*all_refnames))
for trained in ref_prebuilt:
for g in trained.marker_subset():
if g not in intersected:
warnings.warn("not all markers in 'ref_prebuilt' are available in each reference")
all_inter_test = []
all_inter_ref = []
for i, trained in enumerate(ref_prebuilt):
common = _stable_intersect(test_features, trained.features)
all_inter_test.append(biocutils.match(common, test_features, dtype=numpy.uint32))
all_inter_ref.append(biocutils.match(common, trained.features, dtype=numpy.uint32))
all_data = [mattress.initialize(x._full_data) for x in ref_prebuilt]
# Applying the integration.
ibuilt = lib.train_integrated(
all_inter_test,
[x.ptr for x in all_data],
all_inter_ref,
[x._full_label_codes for x in ref_prebuilt],
[x._ptr for x in ref_prebuilt],
num_threads
)
return TrainedIntegratedReferences(
ptr=ibuilt,
ref_labels=[x.labels for x in ref_prebuilt],
ref_names=ref_prebuilt.get_names()
)