singlepp
A C++ library for cell type classification
Loading...
Searching...
No Matches
train_integrated.hpp
Go to the documentation of this file.
1#ifndef SINGLEPP_TRAIN_INTEGRATED_HPP
2#define SINGLEPP_TRAIN_INTEGRATED_HPP
3
4#include "defs.hpp"
5
6#include "tatami/tatami.hpp"
7
8#include "build_reference.hpp"
9#include "Markers.hpp"
10#include "Intersection.hpp"
11#include "utils.hpp"
12
13#include <vector>
14#include <algorithm>
15#include <cstdint>
16#include <memory>
17#include <optional>
18#include <cassert>
19
25namespace singlepp {
26
36template<typename Value_, typename Index_, typename Label_>
41 std::shared_ptr<const tatami::Matrix<Value_, Index_> > ref;
42 const Label_* labels;
44 Index_ test_nrow;
45 std::optional<Intersection<Index_> > intersection;
49};
50
71template<typename Value_, typename Index_, typename Label_>
73 std::shared_ptr<const tatami::Matrix<Value_, Index_> > ref,
74 const Label_* labels,
76) {
78 output.ref = std::move(ref);
79 output.labels = labels;
80 output.markers = std::move(markers);
81 output.test_nrow = output.ref->nrow(); // remember, test and ref are assumed to have the same features.
82 return output;
83}
84
110template<typename Index_, typename Value_, typename Label_>
112 Index_ test_nrow,
113 Intersection<Index_> intersection,
114 std::shared_ptr<const tatami::Matrix<Value_, Index_> > ref,
115 const Label_* labels,
117) {
119 output.ref = std::move(ref);
120 output.labels = labels;
121 output.markers = std::move(markers);
122 output.test_nrow = test_nrow;
123 output.intersection = std::move(intersection);
124 return output;
125}
126
155template<typename Index_, typename Id_, typename Value_, typename Label_>
157 Index_ test_nrow,
158 const Id_* test_id,
159 std::shared_ptr<const tatami::Matrix<Value_, Index_> > ref,
160 const Id_* ref_id,
161 const Label_* labels,
163) {
165 output.ref = std::move(ref);
166 output.labels = labels;
167 output.markers = std::move(markers);
168 output.test_nrow = test_nrow;
169 output.intersection = intersect_genes(test_nrow, test_id, output.ref->nrow(), ref_id);
170 return output;
171}
172
176template<typename Index_>
177struct IntegratedReference {
178 struct DensePerLabel {
179 Index_ num_samples;
180 std::vector<Index_> markers; // indices to 'universe'
181 RankedVector<Index_, Index_> all_ranked; // .second contains indices to 'universe'
182 };
183
184 struct SparsePerLabel {
185 Index_ num_samples;
186 std::vector<Index_> markers; // indices to 'universe'
187 RankedVector<Index_, Index_> negative_ranked, positive_ranked; // .second contains indices to 'universe'
188 std::vector<std::size_t> negative_indptrs, positive_indptrs;
189 };
190
191 std::optional<std::vector<DensePerLabel> > dense;
192 std::optional<std::vector<SparsePerLabel> > sparse;
193};
202template<typename Index_>
204public:
208 TrainedIntegrated(Index_ test_nrow, std::vector<Index_> universe, std::vector<IntegratedReference<Index_> > references) :
209 my_test_nrow(test_nrow),
210 my_universe(std::move(universe)),
211 my_references(std::move(references))
212 {
213 assert(is_sorted_unique(my_universe.size(), my_universe.data()));
214 }
215
216 const auto& references() const {
217 return my_references;
218 }
223private:
224 Index_ my_test_nrow;
225 std::vector<Index_> my_universe;
226 std::vector<IntegratedReference<Index_> > my_references;
227
228public:
232 std::size_t num_references() const {
233 return my_references.size();
234 }
235
239 Index_ test_nrow() const {
240 return my_test_nrow;
241 }
242
248 const std::vector<Index_>& subset() const {
249 return my_universe;
250 }
251
256 std::size_t num_labels(std::size_t r) const {
257 const auto& ref = my_references[r];
258 if (ref.dense.has_value()) {
259 return ref.dense->size();
260 } else {
261 return ref.sparse->size();
262 }
263 }
264
269 std::size_t num_profiles(std::size_t r) const {
270 std::size_t num_prof = 0;
271 const auto& ref = my_references[r];
272 if (ref.dense.has_value()) {
273 for (const auto& lab : *(ref.dense)) {
274 num_prof += sanisizer::sum<std::size_t>(num_prof, lab.num_samples);
275 }
276 } else {
277 for (const auto& lab : *(ref.sparse)) {
278 num_prof += sanisizer::sum<std::size_t>(num_prof, lab.num_samples);
279 }
280 }
281 return num_prof;
282 }
283};
284
295
299template<bool ref_sparse_, typename Value_, typename Index_, typename Label_>
300void train_integrated_per_reference_simple(
302 const std::vector<Index_>& universe,
303 const std::vector<Index_>& remap_test_to_universe,
304 const TrainIntegratedOptions& options,
305 const std::vector<Index_>& positions,
306 std::vector<std::vector<RankedVector<Index_, Index_> > >& out_ranked,
307 typename std::conditional<ref_sparse_, std::vector<std::vector<RankedVector<Index_, Index_> > >&, bool>::type other_ranked
308) {
309 const auto& ref = *(input.ref);
310 const auto NC = ref.ncol();
311 const auto num_universe = universe.size();
312
313 tatami::parallelize([&](int, Index_ start, Index_ len) {
314 auto vbuffer = sanisizer::create<std::vector<Value_> >(num_universe);
315 auto ibuffer = [&](){
316 if constexpr(ref_sparse_) {
317 return sanisizer::create<std::vector<Index_> >(num_universe);
318 } else {
319 return false;
320 }
321 }();
322
323 RankedVector<Value_, Index_> tmp_ranked;
324 tmp_ranked.reserve(num_universe);
325
326 // 'universe' technically refers to the row indices of the test matrix,
327 // but in simple mode, the rows of the test and reference are the same, so we can use it directly here.
328 tatami::VectorPtr<Index_> universe_ptr(tatami::VectorPtr<Index_>{}, &universe);
329 auto ext = tatami::consecutive_extractor<ref_sparse_>(ref, false, start, len, std::move(universe_ptr));
330
331 for (Index_ c = start, end = start + len; c < end; ++c) {
332 tmp_ranked.clear();
333
334 if constexpr(ref_sparse_) {
335 auto info = ext->fetch(vbuffer.data(), ibuffer.data());
336 for (I<decltype(info.number)> i = 0; i < info.number; ++i) {
337 const auto remapped = remap_test_to_universe[info.index[i]];
338 assert(sanisizer::is_less_than(remapped, num_universe));
339 tmp_ranked.emplace_back(info.value[i], remapped);
340 }
341 } else {
342 auto ptr = ext->fetch(vbuffer.data());
343 for (I<decltype(num_universe)> i = 0; i < num_universe; ++i) {
344 tmp_ranked.emplace_back(ptr[i], i); // a.k.a. remap_test_to_universe[universe[i]].
345 }
346 }
347
348 std::sort(tmp_ranked.begin(), tmp_ranked.end());
349
350 if constexpr(ref_sparse_) {
351 const auto tStart = tmp_ranked.begin(), tEnd = tmp_ranked.end();
352 auto zero_ranges = find_zero_ranges<Value_, Index_>(tStart, tEnd);
353 simplify_ranks<Value_, Index_>(tStart, zero_ranges.first, out_ranked[input.labels[c]][positions[c]]);
354 simplify_ranks<Value_, Index_>(zero_ranges.second, tEnd, other_ranked[input.labels[c]][positions[c]]);
355 } else {
356 simplify_ranks(tmp_ranked, out_ranked[input.labels[c]][positions[c]]);
357 }
358 }
359 }, NC, options.num_threads);
360}
361
362template<bool ref_sparse_, typename Value_, typename Index_, typename Label_>
363void train_integrated_per_reference_intersect(
364 const TrainIntegratedInput<Value_, Label_, Index_>& input,
365 const std::vector<Index_>& remap_test_to_universe,
366 const Index_ test_nrow,
367 const TrainIntegratedOptions& options,
368 const std::vector<Index_>& positions,
369 std::vector<std::vector<RankedVector<Index_, Index_> > >& out_ranked,
370 typename std::conditional<ref_sparse_, std::vector<std::vector<RankedVector<Index_, Index_> > >&, bool>::type other_ranked
371) {
372 const auto& ref = *(input.ref);
373 const auto NC = ref.ncol();
374
375 std::vector<Index_> ref_subset;
376 sanisizer::reserve(ref_subset, input.intersection->size());
377 auto remap_ref_subset_to_universe = sanisizer::create<std::vector<Index_> >(ref.nrow(), test_nrow); // all entries of remap_test_to_universe are less than test_nrow.
378 for (const auto& pair : *(input.intersection)) {
379 const auto rdex = remap_test_to_universe[pair.first];
380 if (rdex != test_nrow) {
381 ref_subset.push_back(pair.second);
382 remap_ref_subset_to_universe[pair.second] = rdex;
383 }
384 }
385 std::sort(ref_subset.begin(), ref_subset.end());
386
387 typename std::conditional<ref_sparse_, bool, std::vector<Index_> >::type remap_dense_to_universe;
388 if constexpr(!ref_sparse_) {
389 remap_dense_to_universe.reserve(ref_subset.size());
390 for (auto r : ref_subset) {
391 remap_dense_to_universe.push_back(remap_ref_subset_to_universe[r]);
392 }
393 }
394
395 tatami::parallelize([&](int, Index_ start, Index_ len) {
396 const auto ref_subset_size = ref_subset.size();
397 auto vbuffer = sanisizer::create<std::vector<Value_> >(ref_subset_size);
398 auto ibuffer = [&]() {
399 if constexpr(ref_sparse_) {
400 return sanisizer::create<std::vector<Index_> >(ref_subset_size);
401 } else {
402 return false;
403 }
404 }();
405
406 RankedVector<Value_, Index_> tmp_ranked;
407 tmp_ranked.reserve(ref_subset_size);
408 tatami::VectorPtr<Index_> to_extract_ptr(tatami::VectorPtr<Index_>{}, &ref_subset);
409 auto ext = tatami::consecutive_extractor<ref_sparse_>(ref, false, start, len, std::move(to_extract_ptr));
410
411 for (Index_ c = start, end = start + len; c < end; ++c) {
412 tmp_ranked.clear();
413
414 if constexpr(ref_sparse_) {
415 auto info = ext->fetch(vbuffer.data(), ibuffer.data());
416 for (I<decltype(info.number)> i = 0; i < info.number; ++i) {
417 tmp_ranked.emplace_back(info.value[i], remap_ref_subset_to_universe[info.index[i]]);
418 }
419 } else {
420 auto ptr = ext->fetch(vbuffer.data());
421 for (I<decltype(ref_subset_size)> i = 0; i < ref_subset_size; ++i) {
422 tmp_ranked.emplace_back(ptr[i], remap_dense_to_universe[i]);
423 }
424 }
425
426 std::sort(tmp_ranked.begin(), tmp_ranked.end());
427
428 if constexpr(ref_sparse_) {
429 const auto tStart = tmp_ranked.begin(), tEnd = tmp_ranked.end();
430 auto zero_ranges = find_zero_ranges<Value_, Index_>(tStart, tEnd);
431 simplify_ranks<Value_, Index_>(tStart, zero_ranges.first, out_ranked[input.labels[c]][positions[c]]);
432 simplify_ranks<Value_, Index_>(zero_ranges.second, tEnd, other_ranked[input.labels[c]][positions[c]]);
433 } else {
434 simplify_ranks(tmp_ranked, out_ranked[input.labels[c]][positions[c]]);
435 }
436 }
437 }, NC, options.num_threads);
438}
453template<typename Value_, typename Index_, typename Label_>
455
456 // Checking that the number of genes in the test dataset are consistent.
457 Index_ test_nrow = 0;
458 if (inputs.size()) {
459 test_nrow = inputs.front().test_nrow;
460 for (const auto& in : inputs) {
461 if (!sanisizer::is_equal(in.test_nrow, test_nrow)) {
462 throw std::runtime_error("inconsistent number of rows in the test dataset across entries of 'inputs'");
463 }
464 }
465 }
466
467 // For references with intersections, we need to map the marker indices to the test rows, for comparability across references.
468 const auto nrefs = inputs.size();
469 std::vector<std::vector<Index_> > remap_intersection_to_test_index;
470 for (I<decltype(nrefs)> r = 0; r < nrefs; ++r) {
471 if (inputs[r].intersection.has_value()) {
472 sanisizer::resize(remap_intersection_to_test_index, nrefs);
473 break;
474 }
475 }
476
477 // Identify the union of all marker genes as the universe, but excluding those markers that are not present in all intersections.
478 // Specifically, 'universe' contains sorted and unique row indices for the test matrix, where 'remap_test_to_universe[universe[i]] == i'.
479 // 'remap_test_to_universe[k]' defaults to the max number of rows in the test if 'k' is not present in all intersections.
480 std::vector<Index_> universe;
481 auto remap_test_to_universe = sanisizer::create<std::vector<Index_> >(test_nrow, test_nrow);
482 {
483 auto present = sanisizer::create<std::vector<char> >(test_nrow);
484 auto count_refs = sanisizer::create<std::vector<I<decltype(nrefs)> > >(test_nrow);
485 universe.reserve(test_nrow);
486
487 for (I<decltype(nrefs)> r = 0; r < nrefs; ++r) {
488 const auto& markers = inputs[r].markers;
489 const auto& inter = inputs[r].intersection;
490
491 if (inter.has_value()) {
492 auto& cur_test_remap = remap_intersection_to_test_index[r];
493 sanisizer::resize(cur_test_remap, inputs[r].ref->nrow(), test_nrow); // again, default to the max number of rows if not present in the current intersection.
494 for (const auto& pp : *inter) {
495 cur_test_remap[pp.second] = pp.first;
496 count_refs[pp.first] += 1;
497 }
498
499 for (const auto& labmrk : markers) {
500 for (const auto y : labmrk) {
501 const auto ty = cur_test_remap[y];
502 if (ty != test_nrow && !present[ty]) {
503 present[ty] = true;
504 universe.push_back(ty);
505 }
506 }
507 }
508
509 } else {
510 for (const auto& labmrk : markers) {
511 for (const auto y : labmrk) {
512 if (!present[y]) {
513 present[y] = true;
514 universe.push_back(y);
515 }
516 }
517 }
518
519 for (auto& x : count_refs) {
520 x += 1;
521 }
522 }
523 }
524
525 std::sort(universe.begin(), universe.end());
526 const auto num_universe = universe.size();
527 I<decltype(num_universe)> keep = 0;
528 for (I<decltype(num_universe)> u = 0; u < num_universe; ++u) {
529 const auto marker = universe[u];
530 if (count_refs[marker] == nrefs) {
531 universe[keep] = marker;
532 remap_test_to_universe[marker] = keep;
533 ++keep;
534 }
535 }
536 universe.resize(keep);
537 universe.shrink_to_fit();
538 }
539
540 // Remapping the per-label markers for this reference to refer to our universe.
541 auto references = sanisizer::create<std::vector<IntegratedReference<Index_> > >(nrefs);
542 for (I<decltype(nrefs)> r = 0; r < nrefs; ++r) {
543 const auto& curinput = inputs[r];
544 const auto& currefmarkers = curinput.markers;
545 const auto nlabels = currefmarkers.size();
546 auto& currefout = references[r];
547
548 const bool is_sparse = curinput.ref->is_sparse();
549 if (is_sparse) {
550 currefout.sparse.emplace(sanisizer::as_size_type<I<decltype(*(currefout.sparse))> >(nlabels));
551 } else {
552 currefout.dense.emplace(sanisizer::as_size_type<I<decltype(*(currefout.dense))> >(nlabels));
553 }
554
555 auto get_markers = [&](I<decltype(nlabels)> l) -> std::vector<Index_>& {
556 if (is_sparse) {
557 return (*(currefout.sparse))[l].markers;
558 } else {
559 return (*(currefout.dense))[l].markers;
560 }
561 };
562
563 if (curinput.intersection.has_value()) {
564 auto& cur_test_remap = remap_intersection_to_test_index[r];
565 for (I<decltype(nlabels)> l = 0; l < nlabels; ++l) {
566 const auto& curlabmarkers = currefmarkers[l];
567 auto& markers = get_markers(l);
568 markers.reserve(curlabmarkers.size());
569 for (const auto y : curlabmarkers) {
570 const auto ty = cur_test_remap[y];
571 if (ty != test_nrow) { // ignoring marker that can't be mapped via the current intersection.
572 const auto universe_index = remap_test_to_universe[ty];
573 if (universe_index != test_nrow) { // ignoring genes not present in all intersections.
574 markers.push_back(universe_index);
575 }
576 }
577 }
578 }
579
580 } else {
581 for (I<decltype(nlabels)> l = 0; l < nlabels; ++l) {
582 const auto& curlabmarkers = currefmarkers[l];
583 auto& markers = get_markers(l);
584 markers.reserve(curlabmarkers.size());
585 for (const auto y : curlabmarkers) {
586 const auto universe_index = remap_test_to_universe[y];
587 if (universe_index != test_nrow) { // ignoring genes not present in all intersections.
588 markers.push_back(universe_index);
589 }
590 }
591 }
592 }
593 }
594
595 // Not needed after this so we try to free its allocation for recycling in the next malloc call.
596 // Specifically, I want to give a chance for this memory to be re-used in train_integrated_per_reference_intersect.
597 remap_intersection_to_test_index.clear();
598
599 // Now, we create ranked vectors for each profile in the reference.
600 for (I<decltype(nrefs)> r = 0; r < nrefs; ++r) {
601 const auto& curinput = inputs[r];
602 auto& currefout = references[r];
603
604 const Index_ NC = curinput.ref->ncol();
605 if (NC == 0) {
606 throw std::runtime_error("reference dataset must have at least one column");
607 }
608 std::vector<Index_> positions;
609 sanisizer::reserve(positions, NC);
610
611 const auto nlabels = sanisizer::sum<std::size_t>(*std::max_element(curinput.labels, curinput.labels + NC), 1);
612 auto samples_per_label = sanisizer::create<std::vector<Index_> >(nlabels);
613 for (Index_ c = 0; c < NC; ++c) {
614 auto& pos = samples_per_label[curinput.labels[c]];
615 positions.push_back(pos);
616 ++pos;
617 }
618
619 for (I<decltype(nlabels)> l = 0; l < nlabels; ++l) {
620 if (samples_per_label[l] == 0) {
621 throw std::runtime_error("no profiles available for label " + std::to_string(l) + " in reference " + std::to_string(r));
622 }
623 }
624
625 if (!sanisizer::is_equal(curinput.markers.size(), nlabels)) {
626 throw std::runtime_error("'markers' length should be equal to the number of unique labels");
627 }
628
629 if (curinput.ref->is_sparse()) {
630 auto negative_ranked = sanisizer::create<std::vector<std::vector<RankedVector<Index_, Index_> > > >(nlabels);
631 auto positive_ranked = sanisizer::create<std::vector<std::vector<RankedVector<Index_, Index_> > > >(nlabels);
632 for (I<decltype(nlabels)> l = 0; l < nlabels; ++l) {
633 const auto num_samples = samples_per_label[l];
634 sanisizer::resize(negative_ranked[l], num_samples);
635 sanisizer::resize(positive_ranked[l], num_samples);
636 }
637
638 if (curinput.intersection) {
639 train_integrated_per_reference_intersect<true>(curinput, remap_test_to_universe, test_nrow, options, positions, negative_ranked, positive_ranked);
640 } else {
641 train_integrated_per_reference_simple<true, Value_>(curinput, universe, remap_test_to_universe, options, positions, negative_ranked, positive_ranked);
642 }
643
644 for (I<decltype(nlabels)> l = 0; l < nlabels; ++l) {
645 auto& curlabout = (*(currefout.sparse))[l];
646 const auto num_samples = samples_per_label[l];
647 curlabout.num_samples = num_samples;
648
649 I<decltype(curlabout.negative_ranked.size())> num_neg = 0;
650 for (const auto& x : negative_ranked[l]) {
651 num_neg = sanisizer::sum<I<decltype(num_neg)> >(num_neg, x.size());
652 }
653
654 I<decltype(curlabout.positive_ranked.size())> num_pos = 0;
655 for (const auto& x : positive_ranked[l]) {
656 num_pos = sanisizer::sum<I<decltype(num_pos)> >(num_pos, x.size());
657 }
658
659 curlabout.negative_ranked.reserve(num_neg);
660 curlabout.negative_indptrs.reserve(sanisizer::sum<I<decltype(curlabout.negative_indptrs.size())> >(num_samples, 1));
661 curlabout.negative_indptrs.push_back(0);
662 for (const auto& x : negative_ranked[l]) {
663 curlabout.negative_ranked.insert(curlabout.negative_ranked.end(), x.begin(), x.end());
664 curlabout.negative_indptrs.push_back(curlabout.negative_ranked.size());
665 }
666
667 curlabout.positive_ranked.reserve(num_pos);
668 curlabout.positive_indptrs.reserve(sanisizer::sum<I<decltype(curlabout.positive_indptrs.size())> >(num_samples, 1));
669 curlabout.positive_indptrs.push_back(0);
670 for (const auto& x : positive_ranked[l]) {
671 curlabout.positive_ranked.insert(curlabout.positive_ranked.end(), x.begin(), x.end());
672 curlabout.positive_indptrs.push_back(curlabout.positive_ranked.size());
673 }
674 }
675
676 } else {
677 auto out_ranked = sanisizer::create<std::vector<std::vector<RankedVector<Index_, Index_> > > >(nlabels);
678 for (I<decltype(nlabels)> l = 0; l < nlabels; ++l) {
679 const auto num_samples = samples_per_label[l];
680 sanisizer::resize(out_ranked[l], num_samples);
681 }
682
683 if (curinput.intersection) {
684 train_integrated_per_reference_intersect<false>(curinput, remap_test_to_universe, test_nrow, options, positions, out_ranked, true);
685 } else {
686 train_integrated_per_reference_simple<false, Value_>(curinput, universe, remap_test_to_universe, options, positions, out_ranked, true);
687 }
688
689 for (I<decltype(nlabels)> l = 0; l < nlabels; ++l) {
690 auto& curlabout = (*(currefout.dense))[l];
691 curlabout.num_samples = samples_per_label[l];
692 curlabout.all_ranked.reserve(sanisizer::product<I<decltype(curlabout.all_ranked.size())> >(universe.size(), curlabout.num_samples));
693 for (const auto& x : out_ranked[l]) {
694 curlabout.all_ranked.insert(curlabout.all_ranked.end(), x.begin(), x.end());
695 }
696 }
697 }
698 }
699
700 return TrainedIntegrated<Index_>(test_nrow, std::move(universe), std::move(references));
701}
702
703}
704
705#endif
Create an intersection of genes.
Define type aliases for marker lists.
Classifier that integrates multiple reference datasets.
Definition train_integrated.hpp:203
Index_ test_nrow() const
Definition train_integrated.hpp:239
std::size_t num_references() const
Definition train_integrated.hpp:232
const std::vector< Index_ > & subset() const
Definition train_integrated.hpp:248
std::size_t num_profiles(std::size_t r) const
Definition train_integrated.hpp:269
std::size_t num_labels(std::size_t r) const
Definition train_integrated.hpp:256
Common definitions for singlepp.
Cell type classification using the SingleR algorithm in C++.
Definition classify_single.hpp:20
Intersection< Index_ > intersect_genes(Index_ test_nrow, const Id_ *test_id, Index_ ref_nrow, const Id_ *ref_id)
Definition Intersection.hpp:54
std::vector< std::vector< Index_ > > PerLabelMarkers
Definition Markers.hpp:56
TrainIntegratedInput< Value_, Index_, Label_ > prepare_integrated_input(std::shared_ptr< const tatami::Matrix< Value_, Index_ > > ref, const Label_ *labels, PerLabelMarkers< Index_ > markers)
Definition train_integrated.hpp:72
std::vector< std::pair< Index_, Index_ > > Intersection
Definition Intersection.hpp:35
TrainedIntegrated< Index_ > train_integrated(const std::vector< TrainIntegratedInput< Value_, Index_, Label_ > > &inputs, const TrainIntegratedOptions &options)
Definition train_integrated.hpp:454
Input to train_integrated().
Definition train_integrated.hpp:37
Options for train_integrated().
Definition train_integrated.hpp:288
int num_threads
Definition train_integrated.hpp:293