singlepp
A C++ library for cell type classification
Loading...
Searching...
No Matches
train_integrated.hpp
Go to the documentation of this file.
1#ifndef SINGLEPP_TRAIN_INTEGRATED_HPP
2#define SINGLEPP_TRAIN_INTEGRATED_HPP
3
4#include "defs.hpp"
5
6#include "scaled_ranks.hpp"
7#include "train_single.hpp"
8#include "Intersection.hpp"
9
10#include <vector>
11#include <unordered_set>
12#include <unordered_map>
13#include <algorithm>
14#include <cstdint>
15#include <memory>
16
22namespace singlepp {
23
33template<typename Value_ = DefaultValue, typename Index_ = DefaultIndex, typename Label_ = DefaultLabel>
38 const tatami::Matrix<Value_, Index_>* ref;
39
40 const Label_* labels;
41
42 std::vector<std::vector<Index_> > markers;
43
44 bool with_intersection = false;
45
47
52};
53
72template<typename Value_, typename Index_, typename Label_, typename Float_>
74 const tatami::Matrix<Value_, Index_>& ref,
75 const Label_* labels,
77{
79 output.ref = &ref;
80 output.labels = labels;
81
82 const auto& subset = trained.get_subset();
83 const auto& old_markers = trained.get_markers();
84 size_t nlabels = old_markers.size();
85
86 // Adding the markers for each label, indexed according to their
87 // position in the test matrix. This assumes that 'mat_subset' is
88 // appropriately specified to contain the test's row indices.
89 auto& new_markers = output.markers;
90 new_markers.reserve(nlabels);
91 std::unordered_set<Index_> unified;
92
93 for (size_t i = 0; i < nlabels; ++i) {
94 unified.clear();
95 for (const auto& x : old_markers[i]) {
96 unified.insert(x.begin(), x.end());
97 }
98 new_markers.emplace_back(unified.begin(), unified.end());
99 auto& cur_new_markers = new_markers.back();
100 for (auto& y : cur_new_markers) {
101 y = subset[y];
102 }
103 }
104
105 return output;
106}
107
129template<typename Index_, typename Value_, typename Label_, typename Float_>
132 const tatami::Matrix<Value_, Index_>& ref,
133 const Label_* labels,
135{
137 output.ref = &ref;
138 output.labels = labels;
139
140 // Updating the markers so that they point to rows of the test matrix.
141 const auto& old_markers = trained.get_markers();
142 size_t nlabels = old_markers.size();
143 auto& new_markers = output.markers;
144 new_markers.resize(nlabels);
145
146 const auto& test_subset = trained.get_test_subset();
147 std::unordered_set<Index_> unified;
148
149 for (size_t i = 0; i < nlabels; ++i) {
150 const auto& cur_old_markers = old_markers[i];
151
152 unified.clear();
153 for (const auto& x : cur_old_markers) {
154 unified.insert(x.begin(), x.end());
155 }
156
158 cur_new_markers.reserve(unified.size());
159 for (auto y : unified) {
160 cur_new_markers.push_back(test_subset[y]);
161 }
162 }
163
164 output.with_intersection = true;
165 output.user_intersection = &intersection;
166 return output;
167}
168
195template<typename Index_, typename Id_, typename Value_, typename Label_, typename Float_>
198 const Id_* test_id,
199 const tatami::Matrix<Value_, Index_>& ref,
200 const Id_* ref_id,
201 const Label_* labels,
203{
206 output.user_intersection = NULL;
207 output.auto_intersection.swap(intersection);
208 return output;
209}
210
215template<typename Index_>
217public:
221 size_t num_references() const {
222 return markers.size();
223 }
224
229 size_t num_labels(size_t r) const {
230 return markers[r].size();
231 }
232
237 size_t num_profiles(size_t r) const {
238 size_t n = 0;
239 for (const auto& ref : ranked[r]) {
240 n += ref.size();
241 }
242 return n;
243 }
244
245public:
249 // Technically this should be private, but it's a pain to add
250 // templated friend functions, so I can't be bothered.
251 std::vector<Index_> universe; // To be used by classify_integrated() for indexed extraction.
252
253 std::vector<uint8_t> check_availability;
254 std::vector<std::unordered_set<Index_> > available; // indices to 'universe'
255 std::vector<std::vector<std::vector<Index_> > > markers; // indices to 'universe'
256 std::vector<std::vector<std::vector<internal::RankedVector<Index_, Index_> > > > ranked; // .second contains indices to 'universe'
260};
261
272
276namespace internal {
277
278template<typename Value_, typename Index_, typename Input_>
280 size_t ref_i,
283 const std::unordered_map<Index_, Index_> remap_to_universe,
285{
286 auto curlab = curinput.labels;
287 const auto& ref = *(curinput.ref);
288
289 // Reindexing the markers so that they contain indices into to the universe.
290 auto& curmarkers = output.markers[ref_i];
291 if constexpr(std::is_const<Input_>::value) {
292 curmarkers.swap(curinput.markers);
293 } else {
294 curmarkers = curinput.markers;
295 }
296 for (auto& outer : curmarkers) {
297 for (auto& x : outer) {
298 x = remap_to_universe.find(x)->second;
299 }
300 }
301
302 // Pre-allocating the vectors of pre-ranked expression.
303 auto& cur_ranked = output.ranked[ref_i];
304 std::vector<Index_> positions;
305 {
306 size_t nlabels = curmarkers.size();
307 Index_ NC = ref.ncol();
308 positions.reserve(NC);
309
310 std::vector<Index_> samples_per_label(nlabels);
311 for (Index_ c = 0; c < NC; ++c) {
312 auto& pos = samples_per_label[curlab[c]];
313 positions.push_back(pos);
314 ++pos;
315 }
316
317 cur_ranked.resize(nlabels);
318 for (size_t l = 0; l < nlabels; ++l) {
320 }
321 }
322
323 if (!curinput.with_intersection) {
324 // The universe is guaranteed to be sorted and unique, see its derivation
325 // in internal::train_integrated() below. This means we can directly use it
326 // for indexed extraction from a tatami::Matrix.
327 tatami::VectorPtr<Index_> universe_ptr(tatami::VectorPtr<Index_>{}, &(output.universe));
328
329 tatami::parallelize([&](int, Index_ start, Index_ len) {
330 std::vector<Value_> buffer(output.universe.size());
331 internal::RankedVector<Value_, Index_> tmp_ranked;
332 tmp_ranked.reserve(output.universe.size());
333 auto ext = tatami::consecutive_extractor<false>(&ref, false, start, len, universe_ptr);
334
335 for (Index_ c = start, end = start + len; c < end; ++c) {
336 auto ptr = ext->fetch(buffer.data());
337
338 tmp_ranked.clear();
339 for (int i = 0, end = output.universe.size(); i < end; ++i, ++ptr) {
340 tmp_ranked.emplace_back(*ptr, i);
341 }
342 std::sort(tmp_ranked.begin(), tmp_ranked.end());
343
346 }
347 }, ref.ncol(), options.num_threads);
348
349 } else {
350 output.check_availability[ref_i] = 1;
351
352 // Need to remap from indices on the test matrix to those in the current reference matrix
353 // so that we can form an appropriate vector for indexed tatami extraction.
354 const auto& intersection = (curinput.user_intersection == NULL ? curinput.auto_intersection : *(curinput.user_intersection));
355 std::unordered_map<Index_, Index_> intersection_map;
356 intersection_map.reserve(intersection.size());
357 for (const auto& in : intersection) {
358 intersection_map[in.first] = in.second;
359 }
360
361 std::vector<std::pair<Index_, Index_> > intersection_in_universe;
362 intersection_in_universe.reserve(output.universe.size());
363 auto& cur_available = output.available[ref_i];
364 cur_available.reserve(output.universe.size());
365
366 for (Index_ i = 0, end = output.universe.size(); i < end; ++i) {
367 auto it = intersection_map.find(output.universe[i]);
368 if (it != intersection_map.end()) {
369 intersection_in_universe.emplace_back(it->second, i); // using 'i' as we want to work with indices into 'universe', not the indices of the universe itself.
370 cur_available.insert(i);
371 }
372 }
373 std::sort(intersection_in_universe.begin(), intersection_in_universe.end());
374
375 std::vector<Index_> to_extract;
376 to_extract.reserve(intersection_in_universe.size());
377 for (const auto& p : intersection_in_universe) {
378 to_extract.push_back(p.first);
379 }
380 tatami::VectorPtr<Index_> to_extract_ptr(tatami::VectorPtr<Index_>{}, &to_extract);
381
382 tatami::parallelize([&](int, Index_ start, Index_ len) {
383 std::vector<Value_> buffer(to_extract.size());
384 internal::RankedVector<Value_, Index_> tmp_ranked;
385 tmp_ranked.reserve(to_extract.size());
386 auto ext = tatami::consecutive_extractor<false>(&ref, false, start, len, to_extract_ptr);
387
388 for (size_t c = start, end = start + len; c < end; ++c) {
389 auto ptr = ext->fetch(buffer.data());
390
391 tmp_ranked.clear();
392 for (const auto& p : intersection_in_universe) {
393 tmp_ranked.emplace_back(*ptr, p.second); // remember, 'p.second' corresponds to indices into the universe.
394 ++ptr;
395 }
396 std::sort(tmp_ranked.begin(), tmp_ranked.end());
397
400 }
401 }, ref.ncol(), options.num_threads);
402 }
403}
404
405template<typename Value_, typename Index_, typename Inputs_>
406TrainedIntegrated<Index_> train_integrated(Inputs_& inputs, const TrainIntegratedOptions& options) {
408 size_t nrefs = inputs.size();
409 output.check_availability.resize(nrefs);
410 output.available.resize(nrefs);
411 output.markers.resize(nrefs);
412 output.ranked.resize(nrefs);
413
414 // Identify the union of all marker genes.
415 std::unordered_map<Index_, Index_> remap_to_universe;
416 std::unordered_set<Index_> subset_tmp;
417 for (const auto& in : inputs) {
418 for (const auto& mrk : in.markers) {
419 subset_tmp.insert(mrk.begin(), mrk.end());
420 }
421 }
422
423 output.universe.insert(output.universe.end(), subset_tmp.begin(), subset_tmp.end());
424 std::sort(output.universe.begin(), output.universe.end());
425 remap_to_universe.reserve(output.universe.size());
426 for (Index_ i = 0, end = output.universe.size(); i < end; ++i) {
427 remap_to_universe[output.universe[i]] = i;
428 }
429
430 for (size_t r = 0; r < nrefs; ++r) {
432 }
433
434 return output;
435}
436
437}
452template<typename Value_, typename Index_, typename Label_>
454 return internal::train_integrated<Value_, Index_>(inputs, options);
455}
456
467template<typename Value_, typename Index_, typename Label_>
469 return internal::train_integrated<Value_, Index_>(inputs, options);
470}
471
472}
473
474#endif
Create an intersection of genes.
Classifier that integrates multiple reference datasets.
Definition train_integrated.hpp:216
size_t num_labels(size_t r) const
Definition train_integrated.hpp:229
size_t num_profiles(size_t r) const
Definition train_integrated.hpp:237
size_t num_references() const
Definition train_integrated.hpp:221
Common definitions for singlepp.
Cell type classification using the SingleR algorithm in C++.
Definition classify_single.hpp:19
TrainIntegratedInput< Value_, Index_, Label_ > prepare_integrated_input_intersect(const Intersection< Index_ > &intersection, const tatami::Matrix< Value_, Index_ > &ref, const Label_ *labels, const TrainedSingleIntersect< Index_, Float_ > &trained)
Definition train_integrated.hpp:130
Intersection< Index_ > intersect_genes(Index_ test_nrow, const Id_ *test_id, Index_ ref_nrow, const Id_ *ref_id)
Definition Intersection.hpp:54
TrainIntegratedInput< Value_, Index_, Label_ > prepare_integrated_input(const tatami::Matrix< Value_, Index_ > &ref, const Label_ *labels, const TrainedSingle< Index_, Float_ > &trained)
Definition train_integrated.hpp:73
std::vector< std::vector< std::vector< Index_ > > > Markers
Definition Markers.hpp:40
TrainedIntegrated< Index_ > train_integrated(const std::vector< TrainIntegratedInput< Value_, Index_, Label_ > > &inputs, const TrainIntegratedOptions &options)
Definition train_integrated.hpp:453
Input to train_integrated().
Definition train_integrated.hpp:34
Options for train_integrated().
Definition train_integrated.hpp:265
int num_threads
Definition train_integrated.hpp:270
Train a classifier from a single reference.