singlepp
A C++ library for cell type classification
Loading...
Searching...
No Matches
train_integrated.hpp
Go to the documentation of this file.
1#ifndef SINGLEPP_TRAIN_INTEGRATED_HPP
2#define SINGLEPP_TRAIN_INTEGRATED_HPP
3
4#include "defs.hpp"
5
6#include "scaled_ranks.hpp"
7#include "train_single.hpp"
8#include "Intersection.hpp"
9
10#include <vector>
11#include <unordered_set>
12#include <unordered_map>
13#include <algorithm>
14#include <cstdint>
15#include <memory>
16
22namespace singlepp {
23
33template<typename Value_ = DefaultValue, typename Index_ = DefaultIndex, typename Label_ = DefaultLabel>
39
40 const tatami::Matrix<Value_, Index_>* ref;
41
42 const Label_* labels;
43
44 std::vector<std::vector<Index_> > markers;
45
46 bool with_intersection = false;
47
49
54};
55
74template<typename Value_, typename Index_, typename Label_, typename Float_>
76 const tatami::Matrix<Value_, Index_>& ref,
77 const Label_* labels,
79{
81 output.test_nrow = ref.nrow(); // remember, test and ref are assumed to have the same features.
82 output.ref = &ref;
83 output.labels = labels;
84
85 const auto& subset = trained.get_subset();
86 const auto& old_markers = trained.get_markers();
87 size_t nlabels = old_markers.size();
88
89 // Adding the markers for each label, indexed according to their
90 // position in the test matrix. This assumes that 'mat_subset' is
91 // appropriately specified to contain the test's row indices.
92 auto& new_markers = output.markers;
93 new_markers.reserve(nlabels);
94 std::unordered_set<Index_> unified;
95
96 for (size_t i = 0; i < nlabels; ++i) {
97 unified.clear();
98 for (const auto& x : old_markers[i]) {
99 unified.insert(x.begin(), x.end());
100 }
101 new_markers.emplace_back(unified.begin(), unified.end());
102 auto& cur_new_markers = new_markers.back();
103 for (auto& y : cur_new_markers) {
104 y = subset[y];
105 }
106 }
107
108 return output;
109}
110
134template<typename Index_, typename Value_, typename Label_, typename Float_>
138 const tatami::Matrix<Value_, Index_>& ref,
139 const Label_* labels,
141{
143 output.test_nrow = test_nrow;
144 output.ref = &ref;
145 output.labels = labels;
146
147 // Updating the markers so that they point to rows of the test matrix.
148 const auto& old_markers = trained.get_markers();
149 size_t nlabels = old_markers.size();
150 auto& new_markers = output.markers;
151 new_markers.resize(nlabels);
152
153 const auto& test_subset = trained.get_test_subset();
154 std::unordered_set<Index_> unified;
155
156 for (size_t i = 0; i < nlabels; ++i) {
157 const auto& cur_old_markers = old_markers[i];
158
159 unified.clear();
160 for (const auto& x : cur_old_markers) {
161 unified.insert(x.begin(), x.end());
162 }
163
165 cur_new_markers.reserve(unified.size());
166 for (auto y : unified) {
167 cur_new_markers.push_back(test_subset[y]);
168 }
169 }
170
171 output.with_intersection = true;
172 output.user_intersection = &intersection;
173 return output;
174}
175
179// For back-compatibility only.
180template<typename Index_, typename Value_, typename Label_, typename Float_>
183 const tatami::Matrix<Value_, Index_>& ref,
184 const Label_* labels,
186{
188}
219template<typename Index_, typename Id_, typename Value_, typename Label_, typename Float_>
222 const Id_* test_id,
223 const tatami::Matrix<Value_, Index_>& ref,
224 const Id_* ref_id,
225 const Label_* labels,
227{
230 output.user_intersection = NULL;
231 output.auto_intersection.swap(intersection);
232 return output;
233}
234
239template<typename Index_>
241public:
245 size_t num_references() const {
246 return markers.size();
247 }
248
253 size_t num_labels(size_t r) const {
254 return markers[r].size();
255 }
256
261 size_t num_profiles(size_t r) const {
262 size_t n = 0;
263 for (const auto& ref : ranked[r]) {
264 n += ref.size();
265 }
266 return n;
267 }
268
269public:
273 // Technically this should be private, but it's a pain to add
274 // templated friend functions, so I can't be bothered.
276 std::vector<Index_> universe; // To be used by classify_integrated() for indexed extraction.
277
278 std::vector<uint8_t> check_availability;
279 std::vector<std::unordered_set<Index_> > available; // indices to 'universe'
280 std::vector<std::vector<std::vector<Index_> > > markers; // indices to 'universe'
281 std::vector<std::vector<std::vector<internal::RankedVector<Index_, Index_> > > > ranked; // .second contains indices to 'universe'
285};
286
297
301namespace internal {
302
303template<typename Value_, typename Index_, typename Input_>
305 size_t ref_i,
308 const std::unordered_map<Index_, Index_> remap_to_universe,
310{
311 auto curlab = curinput.labels;
312 const auto& ref = *(curinput.ref);
313
314 // Reindexing the markers so that they contain indices into to the universe.
315 auto& curmarkers = output.markers[ref_i];
316 if constexpr(std::is_const<Input_>::value) {
317 curmarkers.swap(curinput.markers);
318 } else {
319 curmarkers = curinput.markers;
320 }
321 for (auto& outer : curmarkers) {
322 for (auto& x : outer) {
323 x = remap_to_universe.find(x)->second;
324 }
325 }
326
327 // Pre-allocating the vectors of pre-ranked expression.
328 auto& cur_ranked = output.ranked[ref_i];
329 std::vector<Index_> positions;
330 {
331 size_t nlabels = curmarkers.size();
332 Index_ NC = ref.ncol();
333 positions.reserve(NC);
334
335 std::vector<Index_> samples_per_label(nlabels);
336 for (Index_ c = 0; c < NC; ++c) {
337 auto& pos = samples_per_label[curlab[c]];
338 positions.push_back(pos);
339 ++pos;
340 }
341
342 cur_ranked.resize(nlabels);
343 for (size_t l = 0; l < nlabels; ++l) {
345 }
346 }
347
348 if (!curinput.with_intersection) {
349 // The universe is guaranteed to be sorted and unique, see its derivation
350 // in internal::train_integrated() below. This means we can directly use it
351 // for indexed extraction from a tatami::Matrix.
352 tatami::VectorPtr<Index_> universe_ptr(tatami::VectorPtr<Index_>{}, &(output.universe));
353
354 tatami::parallelize([&](int, Index_ start, Index_ len) {
355 std::vector<Value_> buffer(output.universe.size());
356 internal::RankedVector<Value_, Index_> tmp_ranked;
357 tmp_ranked.reserve(output.universe.size());
358 auto ext = tatami::consecutive_extractor<false>(&ref, false, start, len, universe_ptr);
359
360 for (Index_ c = start, end = start + len; c < end; ++c) {
361 auto ptr = ext->fetch(buffer.data());
362
363 tmp_ranked.clear();
364 for (int i = 0, end = output.universe.size(); i < end; ++i, ++ptr) {
365 tmp_ranked.emplace_back(*ptr, i);
366 }
367 std::sort(tmp_ranked.begin(), tmp_ranked.end());
368
371 }
372 }, ref.ncol(), options.num_threads);
373
374 } else {
375 output.check_availability[ref_i] = 1;
376
377 // Need to remap from indices on the test matrix to those in the current reference matrix
378 // so that we can form an appropriate vector for indexed tatami extraction.
379 const auto& intersection = (curinput.user_intersection == NULL ? curinput.auto_intersection : *(curinput.user_intersection));
380 std::unordered_map<Index_, Index_> intersection_map;
381 intersection_map.reserve(intersection.size());
382 for (const auto& in : intersection) {
383 intersection_map[in.first] = in.second;
384 }
385
386 std::vector<std::pair<Index_, Index_> > intersection_in_universe;
387 intersection_in_universe.reserve(output.universe.size());
388 auto& cur_available = output.available[ref_i];
389 cur_available.reserve(output.universe.size());
390
391 for (Index_ i = 0, end = output.universe.size(); i < end; ++i) {
392 auto it = intersection_map.find(output.universe[i]);
393 if (it != intersection_map.end()) {
394 intersection_in_universe.emplace_back(it->second, i); // using 'i' as we want to work with indices into 'universe', not the indices of the universe itself.
395 cur_available.insert(i);
396 }
397 }
398 std::sort(intersection_in_universe.begin(), intersection_in_universe.end());
399
400 std::vector<Index_> to_extract;
401 to_extract.reserve(intersection_in_universe.size());
402 for (const auto& p : intersection_in_universe) {
403 to_extract.push_back(p.first);
404 }
405 tatami::VectorPtr<Index_> to_extract_ptr(tatami::VectorPtr<Index_>{}, &to_extract);
406
407 tatami::parallelize([&](int, Index_ start, Index_ len) {
408 std::vector<Value_> buffer(to_extract.size());
409 internal::RankedVector<Value_, Index_> tmp_ranked;
410 tmp_ranked.reserve(to_extract.size());
411 auto ext = tatami::consecutive_extractor<false>(&ref, false, start, len, to_extract_ptr);
412
413 for (size_t c = start, end = start + len; c < end; ++c) {
414 auto ptr = ext->fetch(buffer.data());
415
416 tmp_ranked.clear();
417 for (const auto& p : intersection_in_universe) {
418 tmp_ranked.emplace_back(*ptr, p.second); // remember, 'p.second' corresponds to indices into the universe.
419 ++ptr;
420 }
421 std::sort(tmp_ranked.begin(), tmp_ranked.end());
422
425 }
426 }, ref.ncol(), options.num_threads);
427 }
428}
429
430template<typename Value_, typename Index_, typename Inputs_>
431TrainedIntegrated<Index_> train_integrated(Inputs_& inputs, const TrainIntegratedOptions& options) {
433 size_t nrefs = inputs.size();
434 output.check_availability.resize(nrefs);
435 output.available.resize(nrefs);
436 output.markers.resize(nrefs);
437 output.ranked.resize(nrefs);
438
439 // Checking that the number of genes in the test dataset are consistent.
440 output.test_nrow = -1;
441 for (const auto& in : inputs) {
442 if (output.test_nrow == static_cast<Index_>(-1)) {
443 output.test_nrow = in.test_nrow;
444 } else if (in.test_nrow != static_cast<Index_>(-1) && in.test_nrow != output.test_nrow) {
445 throw std::runtime_error("inconsistent number of rows in the test dataset across entries of 'inputs'");
446 }
447 }
448
449 // Identify the union of all marker genes.
450 std::unordered_map<Index_, Index_> remap_to_universe;
451 std::unordered_set<Index_> subset_tmp;
452 for (const auto& in : inputs) {
453 for (const auto& mrk : in.markers) {
454 subset_tmp.insert(mrk.begin(), mrk.end());
455 }
456 }
457
458 output.universe.insert(output.universe.end(), subset_tmp.begin(), subset_tmp.end());
459 std::sort(output.universe.begin(), output.universe.end());
460 remap_to_universe.reserve(output.universe.size());
461 for (Index_ i = 0, end = output.universe.size(); i < end; ++i) {
462 remap_to_universe[output.universe[i]] = i;
463 }
464
465 for (size_t r = 0; r < nrefs; ++r) {
467 }
468
469 return output;
470}
471
472}
487template<typename Value_, typename Index_, typename Label_>
489 return internal::train_integrated<Value_, Index_>(inputs, options);
490}
491
502template<typename Value_, typename Index_, typename Label_>
504 return internal::train_integrated<Value_, Index_>(inputs, options);
505}
506
507}
508
509#endif
Create an intersection of genes.
Classifier that integrates multiple reference datasets.
Definition train_integrated.hpp:240
size_t num_labels(size_t r) const
Definition train_integrated.hpp:253
size_t num_profiles(size_t r) const
Definition train_integrated.hpp:261
size_t num_references() const
Definition train_integrated.hpp:245
Common definitions for singlepp.
Cell type classification using the SingleR algorithm in C++.
Definition classify_single.hpp:19
Intersection< Index_ > intersect_genes(Index_ test_nrow, const Id_ *test_id, Index_ ref_nrow, const Id_ *ref_id)
Definition Intersection.hpp:54
TrainIntegratedInput< Value_, Index_, Label_ > prepare_integrated_input(const tatami::Matrix< Value_, Index_ > &ref, const Label_ *labels, const TrainedSingle< Index_, Float_ > &trained)
Definition train_integrated.hpp:75
TrainIntegratedInput< Value_, Index_, Label_ > prepare_integrated_input_intersect(Index_ test_nrow, const Intersection< Index_ > &intersection, const tatami::Matrix< Value_, Index_ > &ref, const Label_ *labels, const TrainedSingleIntersect< Index_, Float_ > &trained)
Definition train_integrated.hpp:135
std::vector< std::vector< std::vector< Index_ > > > Markers
Definition Markers.hpp:40
TrainedIntegrated< Index_ > train_integrated(const std::vector< TrainIntegratedInput< Value_, Index_, Label_ > > &inputs, const TrainIntegratedOptions &options)
Definition train_integrated.hpp:488
Input to train_integrated().
Definition train_integrated.hpp:34
Options for train_integrated().
Definition train_integrated.hpp:290
int num_threads
Definition train_integrated.hpp:295
Train a classifier from a single reference.