from typing import Any, Literal, Optional, Sequence, Union
import biocutils
import numpy
import knncolle
import mattress
import delayedarray
import warnings
from . import lib_singler as lib
from ._utils import _clean_matrix, _restrict_features, _stable_intersect
from .get_classic_markers import get_classic_markers
from .aggregate_reference import aggregate_reference
class TrainedSingleReference:
"""A prebuilt reference object, typically created by
:py:func:`~singler.train_single.train_single`. This is intended for
advanced users only and should not be serialized.
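
Example:
    A minimal sketch of inspecting a trained reference; ``trained`` is assumed
    to be the output of :py:func:`~singler.train_single.train_single`::

        trained.num_labels()  # number of unique labels
        trained.labels  # the labels themselves
        trained.marker_subset()  # identifiers of all features chosen as markers
        trained.marker_subset(indices_only=True)  # positions within 'features'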
"""
def __init__(
self,
ptr: int,
full_data: Any,
full_label_codes: numpy.ndarray,
labels: Sequence,
features: Sequence,
markers: dict[Any, dict[Any, Sequence]],
):
self._ptr = ptr
self._full_data = full_data
self._full_label_codes = full_label_codes
self._features = features
self._labels = labels
self._markers = markers
def num_markers(self) -> int:
"""
Returns:
Number of markers to be used for classification. This is the
same as the size of the array from :py:meth:`~marker_subset`.
"""
return lib.get_num_markers_from_single_reference(self._ptr)
def num_labels(self) -> int:
"""
Returns:
Number of unique labels in this reference.
"""
return lib.get_num_labels_from_single_reference(self._ptr)
@property
def features(self) -> list:
"""The universe of features known to this reference."""
return self._features
@property
def labels(self) -> Sequence:
"""Unique labels in this reference."""
return self._labels
@property
def markers(self) -> dict[Any, dict[Any, list]]:
"""Markers for every pairwise comparison between labels."""
return self._markers
def marker_subset(self, indices_only: bool = False) -> Union[numpy.ndarray, list]:
"""
Args:
indices_only:
Whether to return the markers as indices
into :py:attr:`~features`, or as a list of feature identifiers.
Returns:
If ``indices_only = False``, a list of feature identifiers for the markers.
If ``indices_only = True``, a NumPy array containing the integer indices of
features in ``features`` that were chosen as markers.
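
Example:
    A small sketch, assuming ``trained`` is a ``TrainedSingleReference``::

        names = trained.marker_subset()  # feature identifiers of the markers
        idx = trained.marker_subset(indices_only=True)  # positions within 'features'
        # each names[k] corresponds to trained.features at position idx[k]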
"""
buffer = lib.get_markers_from_single_reference(self._ptr)
if indices_only:
return buffer
else:
return biocutils.StringList(self._features[i] for i in buffer)
def _markers_from_dict(markers: dict[Any, dict[Any, Sequence]], labels: Sequence, available_features: Sequence):
fmapping = {}
for i, x in enumerate(available_features):
    fmapping[x] = i
# Convert the nested dictionary of marker names into a nested list of integer
# index arrays, ordered by 'labels' at both levels.
outer_instance = []
for outer_k in labels:
    outer_instance.append([numpy.array([fmapping[g] for g in markers[outer_k][inner_k]], dtype=numpy.uint32) for inner_k in labels])
return outer_instance
def train_single(
ref_data: Any,
ref_labels: Sequence,
ref_features: Sequence,
test_features: Optional[Sequence] = None,
assay_type: Union[str, int] = "logcounts",
restrict_to: Optional[Union[set, dict]] = None,
check_missing: bool = True,
markers: Optional[dict[Any, dict[Any, Sequence]]] = None,
marker_method: Literal["classic", "auc", "cohens_d"] = "classic",
num_de: Optional[int] = None,
marker_args: dict = {},
aggregate: bool = False,
aggregate_args: dict = {},
nn_parameters: Optional[knncolle.Parameters] = knncolle.VptreeParameters(),
num_threads: int = 1,
) -> TrainedSingleReference:
"""Build a single reference dataset in preparation for classification.
Args:
ref_data:
A matrix-like object where rows are features, columns are
reference profiles, and each entry is the expression value.
If ``markers`` is not provided, expression should be normalized
and log-transformed in preparation for marker prioritization via
differential expression analyses. Otherwise, any expression values
are acceptable as only the ranking within each column is used.
Alternatively, a
:py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`
containing such a matrix in one of its assays.
ref_labels:
Sequence of labels for each reference profile, i.e., column in ``ref_data``.
ref_features:
Sequence of identifiers for each feature, i.e., row in ``ref_data``.
test_features:
Sequence of identifiers for each feature in the test dataset.
assay_type:
Assay containing the expression matrix,
if ``ref_data`` is a
:py:class:`~summarizedexperiment.SummarizedExperiment.SummarizedExperiment`.
restrict_to:
Subset of available features to restrict to. Only features in
``restrict_to`` will be used when building the reference. If
``None``, no restriction is performed.
check_missing:
Whether to check for and remove rows with missing (NaN) values
from ``ref_data``.
markers:
Upregulated markers for each pairwise comparison between labels.
Specifically, ``markers[a][b]`` should be a sequence of features
that are upregulated in ``a`` compared to ``b``. All such features
should be present in ``ref_features``, and all labels in ``ref_labels``
should have keys in the inner and outer dictionaries.
See the example at the end of this docstring for the expected structure.
marker_method:
Method to identify markers from each pairwise comparison between labels in ``ref_data``.
If ``classic``, we call :py:func:`~singler.get_classic_markers.get_classic_markers`.
If ``auc`` or ``cohens_d``, we call :py:func:`~scranpy.score_markers.score_markers`.
Only used if ``markers`` is not supplied.
num_de:
Number of differentially expressed genes to use as markers for each pairwise comparison between labels.
If ``None`` and ``marker_method = "classic"``, an appropriate number of genes is determined by :py:func:`~singler.get_classic_markers.get_classic_markers`.
For ``auc`` or ``cohens_d``, it defaults to 10.
Only used if ``markers`` is not supplied.
marker_args:
Further arguments to pass to the chosen marker detection method.
If ``marker_method = "classic"``, this is :py:func:`~singler.get_classic_markers.get_classic_markers`, otherwise it is :py:func:`~scranpy.score_markers.score_markers`.
Only used if ``markers`` is not supplied.
aggregate:
Whether the reference dataset should be aggregated to pseudo-bulk samples for speed, see :py:func:`~singler.aggregate_reference.aggregate_reference` for details.
aggregate_args:
Further arguments to pass to :py:func:`~singler.aggregate_reference.aggregate_reference` when ``aggregate = True``.
nn_parameters:
Algorithm for constructing the neighbor search index, used to
compute scores during classification.
num_threads:
Number of threads to use for reference building.
Returns:
The pre-built reference, ready for use in downstream methods like
:py:func:`~singler.classify_single_reference.classify_single`.
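
Example:
    A minimal sketch of typical usage; the reference object ``ref_se`` (a
    ``SummarizedExperiment`` with a ``"logcounts"`` assay and a ``"label"``
    column) and ``test_features`` are hypothetical placeholders::

        import singler
        trained = singler.train_single(
            ref_data=ref_se,
            ref_labels=ref_se.get_column_data().column("label"),
            ref_features=ref_se.get_row_names(),
            test_features=test_features,
        )
        trained.num_labels()

    A user-supplied ``markers`` dictionary should be structured as
    ``markers[a][b]``, i.e., features upregulated in label ``a`` relative to
    label ``b``::

        markers = {
            "A": {"A": [], "B": ["gene1", "gene2"]},
            "B": {"A": ["gene3"], "B": []},
        }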
"""
if isinstance(ref_labels, str):
warnings.warn(
"setting 'ref_labels' to a column name of the column data is deprecated",
category=DeprecationWarning
)
ref_labels = ref_data.get_column_data().column(ref_labels)
ref_data, ref_features = _clean_matrix(
ref_data,
ref_features,
assay_type=assay_type,
check_missing=check_missing,
num_threads=num_threads,
)
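# Remove duplicate feature identifiers, keeping the first occurrence of each.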
if len(set(ref_features)) != len(ref_features):
encountered = set()
keep = []
for i, rg in enumerate(ref_features):
if rg not in encountered:
encountered.add(rg)
keep.append(i)
if len(keep) < len(ref_features):
ref_features = biocutils.subset_sequence(ref_features, keep)
ref_data = delayedarray.DelayedArray(ref_data)[keep,:]
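# Remove reference profiles with missing (None) labels.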
if None in ref_labels:
keep = []
for i, f in enumerate(ref_labels):
if f is not None:
keep.append(i)
ref_data = delayedarray.DelayedArray(ref_data)[:,keep]
ref_labels = biocutils.subset_sequence(ref_labels, keep)
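# Encode the labels as integer codes against the sorted set of unique labels.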
unique_labels, label_idx = biocutils.factorize(ref_labels, sort_levels=True, dtype=numpy.uint32, fail_missing=True)
markers = _identify_genes(
ref_data=ref_data,
ref_features=ref_features,
ref_labels=ref_labels,
unique_labels=unique_labels,
markers=markers,
marker_method=marker_method,
num_de=num_de,
test_features=test_features,
restrict_to=restrict_to,
marker_args=marker_args,
num_threads=num_threads
)
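# Convert each marker set from feature identifiers into row indices of the
# reference matrix, ordered by the unique labels at both levels.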
markers_idx = [None] * len(unique_labels)
for outer_i, outer_k in enumerate(unique_labels):
inner_instance = [None] * len(unique_labels)
for inner_i, inner_k in enumerate(unique_labels):
current = markers[outer_k][inner_k]
inner_instance[inner_i] = biocutils.match(current, ref_features, dtype=numpy.uint32)
markers_idx[outer_i] = inner_instance
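# Compute the indices of features shared between the test and reference
# datasets; if no test features are supplied, all reference features are used.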
if test_features is None:
test_features_idx = numpy.arange(len(ref_features), dtype=numpy.uint32)
ref_features_idx = numpy.arange(len(ref_features), dtype=numpy.uint32)
else:
common_features = _stable_intersect(test_features, ref_features)
test_features_idx = biocutils.match(common_features, test_features, dtype=numpy.uint32)
ref_features_idx = biocutils.match(common_features, ref_features, dtype=numpy.uint32)
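# Optionally collapse the reference into pseudo-bulk samples; the label codes
# are then recomputed for the aggregated columns.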
if aggregate:
aggr = aggregate_reference(ref_data, ref_labels, ref_features, **aggregate_args)
ref_data = aggr.assay(0)
label_idx = biocutils.match(aggr.get_column_data().get_column("label"), unique_labels, dtype=numpy.uint32)
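# Wrap the reference matrix for the C++ layer and construct the neighbor-search
# builder, both of which are passed to the training routine below.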
ref_ptr = mattress.initialize(ref_data)
builder, _ = knncolle.define_builder(nn_parameters)
return TrainedSingleReference(
lib.train_single(
test_features_idx,
ref_ptr.ptr,
ref_features_idx,
label_idx,
markers_idx,
builder.ptr,
num_threads,
),
full_data = ref_data,
full_label_codes = label_idx,
labels = unique_labels,
features = ref_features,
markers = markers,
)
def _identify_genes(ref_data, ref_features, ref_labels, unique_labels, markers, marker_method, test_features, restrict_to, num_de, marker_args, num_threads):
ref_data, ref_features = _restrict_features(ref_data, ref_features, test_features)
ref_data, ref_features = _restrict_features(ref_data, ref_features, restrict_to)
# Deriving the markers from expression data.
if markers is None:
if marker_method == "classic":
markers = get_classic_markers(
ref_data=[ref_data],
ref_labels=[ref_labels],
ref_features=[ref_features],
num_threads=num_threads,
num_de=num_de,
**marker_args,
)
else:
if marker_method == "auc":
compute_auc = True
compute_cohens_d = False
effect_size = "auc"
boundary = 0.5
else:
compute_auc = False
compute_cohens_d = True
effect_size = "cohens_d"
boundary = 0
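# Effects above the boundary indicate upregulation in the first group of each
# comparison: AUCs above 0.5, or Cohen's d above zero.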
import scranpy
stats = scranpy.score_markers(
ref_data,
groups=ref_labels,
num_threads=num_threads,
all_pairwise=True,
compute_delta_detected=False,
compute_delta_mean=False,
compute_auc=compute_auc,
compute_cohens_d=compute_cohens_d,
**marker_args
)
pairwise = getattr(stats, effect_size)
if num_de is None:
num_de = 10
markers = {}
for g1, group1 in enumerate(stats.groups):
group_markers = {}
for g2, group2 in enumerate(stats.groups):
if g1 == g2:
group_markers[group2] = biocutils.StringList()
continue
cureffects = pairwise[g2, g1, :] # remember, second dimension is the first group in the comparison.
keep = biocutils.which(cureffects > boundary)
o = numpy.argsort(-cureffects[keep], stable=True)
group_markers[group2] = biocutils.StringList(biocutils.subset_sequence(ref_features, keep[o[:min(len(o), num_de)]]))
markers[group1] = group_markers
return markers
# Validating a user-supplied list of markers.
if not isinstance(markers, dict):
raise ValueError("'markers' should be a list where the labels are keys")
if len(unique_labels) != len(markers):
raise ValueError("'markers' should have length equal to the number of unique labels")
available_features = set(ref_features)
new_markers = {}
for x, y in markers.items():
if x not in unique_labels:
raise ValueError("unknown label '" + x + "' in 'markers'")
if isinstance(y, list):
collected = []
for g in y:
if g in available_features:
collected.append(g)
output = {}
for l in unique_labels:
output[l] = biocutils.StringList()
output[x] = collected
elif isinstance(y, dict):
if len(unique_labels) != len(y):
raise ValueError("each value of 'markers' should have length equal to the number of unique labels")
output = {}
for x_inner, y_inner in y.items():
collected = biocutils.StringList()
for g in y_inner:
if g in available_features:
collected.append(g)
output[x_inner] = collected
else:
raise ValueError("values of 'markers' should be dictionaries")
new_markers[x] = output
return new_markers