singlepp/train__integrated_8hpp_source.html

#ifndef SINGLEPP_TRAIN_INTEGRATED_HPP

#define SINGLEPP_TRAIN_INTEGRATED_HPP


#include "defs.hpp"
#include "scaled_ranks.hpp"
#include "train_single.hpp"
#include "Intersection.hpp"

#include <vector>
#include <unordered_set>
#include <unordered_map>
#include <algorithm>
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <type_traits>


namespace singlepp {


/**
 * @brief Input to `train_integrated()` for a single reference dataset.
 *
 * Instances are normally created by `prepare_integrated_input()` or
 * `prepare_integrated_input_intersect()` rather than filled in by hand.
 * The struct does not own the reference matrix or the label array; both
 * pointers must remain valid for as long as the struct is being used.
 *
 * @tparam Value_ Numeric type of the values in the reference matrix.
 * @tparam Index_ Integer type of the row/column indices.
 * @tparam Label_ Integer type of the labels.
 */
template<typename Value_ = DefaultValue, typename Index_ = DefaultIndex, typename Label_ = DefaultLabel>
struct TrainIntegratedInput {
    // Number of rows in the test dataset. A value of -1 (converted to Index_)
    // means "not specified"; such entries are skipped by the consistency
    // check in internal::train_integrated().
    Index_ test_nrow = static_cast<Index_>(-1);

    // Non-owning pointer to the reference expression matrix.
    const tatami::Matrix<Value_, Index_>* ref = nullptr;

    // Non-owning pointer to the per-column labels of 'ref'.
    const Label_* labels = nullptr;

    // For each label, the marker genes expressed as row indices of the test matrix.
    std::vector<std::vector<Index_> > markers;

    // Whether the rows of 'ref' have to be matched to the test rows via an intersection.
    bool with_intersection = false;

    // Caller-supplied intersection; if null, 'auto_intersection' is used instead.
    // Only consulted when 'with_intersection' is true.
    const Intersection<Index_>* user_intersection = nullptr;

    // Intersection owned by this object, used when 'user_intersection' is null.
    Intersection<Index_> auto_intersection;
};


/**
 * Prepare a reference dataset for `train_integrated()` in the case where the
 * reference and test datasets share the same rows.
 *
 * @tparam Value_ Numeric type of the values in the reference matrix.
 * @tparam Index_ Integer type of the row/column indices.
 * @tparam Label_ Integer type of the labels.
 * @tparam Float_ Floating-point type used by the single-reference classifier.
 *
 * @param ref Reference expression matrix. Only its address is stored, so it
 * must outlive the returned object.
 * @param[in] labels Pointer to the labels for the columns of `ref`. Only the
 * pointer is stored.
 * @param trained Classifier previously trained on `ref`.
 *
 * @return Input object whose `markers` contain, for each label, the union of
 * all marker lists stored for that label in `trained`, translated through
 * `trained.get_subset()`. The order of markers within a label is unspecified
 * as it follows the iteration order of an unordered set.
 */
template<typename Value_, typename Index_, typename Label_, typename Float_>
TrainIntegratedInput<Value_, Index_, Label_> prepare_integrated_input(
    const tatami::Matrix<Value_, Index_>& ref,
    const Label_* labels,
    const TrainedSingle<Index_, Float_>& trained)
{
    TrainIntegratedInput<Value_, Index_, Label_> prepared;
    prepared.ref = &ref;
    prepared.labels = labels;
    prepared.test_nrow = ref.nrow(); // test and reference are assumed to have identical rows.

    const auto& row_subset = trained.get_subset();
    const auto& per_label_markers = trained.get_markers();
    auto num_labels = per_label_markers.size();

    auto& remapped = prepared.markers;
    remapped.reserve(num_labels);

    // Scratch set that is reused across labels to pool their markers.
    std::unordered_set<Index_> pooled;

    for (decltype(num_labels) l = 0; l < num_labels; ++l) {
        pooled.clear();
        for (const auto& versus : per_label_markers[l]) {
            pooled.insert(versus.begin(), versus.end());
        }

        // The pooled markers are positions in 'row_subset'; looking them up
        // there converts them into row indices of the matrix.
        remapped.emplace_back();
        auto& current = remapped.back();
        current.reserve(pooled.size());
        for (auto m : pooled) {
            current.push_back(row_subset[m]);
        }
    }

    return prepared;
}


/**
 * Prepare a reference dataset for `train_integrated()` in the case where the
 * reference and test datasets have different rows that are matched through a
 * caller-supplied intersection.
 *
 * @tparam Index_ Integer type of the row/column indices.
 * @tparam Value_ Numeric type of the values in the reference matrix.
 * @tparam Label_ Integer type of the labels.
 * @tparam Float_ Floating-point type used by the single-reference classifier.
 *
 * @param test_nrow Number of rows in the test dataset.
 * @param intersection Pairs of matching rows between test and reference.
 * Only its address is stored, so it must outlive the returned object.
 * @param ref Reference expression matrix. Only its address is stored.
 * @param[in] labels Pointer to the labels for the columns of `ref`.
 * @param trained Classifier previously trained on the intersection.
 *
 * @return Input object with `with_intersection` set, `user_intersection`
 * pointing at `intersection`, and `markers` holding, for each label, the
 * union of that label's marker lists translated through
 * `trained.get_test_subset()`.
 */
template<typename Index_, typename Value_, typename Label_, typename Float_>
TrainIntegratedInput<Value_, Index_, Label_> prepare_integrated_input_intersect(
    Index_ test_nrow,
    const Intersection<Index_>& intersection,
    const tatami::Matrix<Value_, Index_>& ref,
    const Label_* labels,
    const TrainedSingleIntersect<Index_, Float_>& trained)
{
    TrainIntegratedInput<Value_, Index_, Label_> prepared;
    prepared.test_nrow = test_nrow;
    prepared.ref = &ref;
    prepared.labels = labels;
    prepared.with_intersection = true;
    prepared.user_intersection = &intersection;

    const auto& test_rows = trained.get_test_subset();
    const auto& per_label_markers = trained.get_markers();
    auto num_labels = per_label_markers.size();

    auto& remapped = prepared.markers;
    remapped.reserve(num_labels);

    // Scratch set that is reused across labels to pool their markers.
    std::unordered_set<Index_> pooled;

    for (decltype(num_labels) l = 0; l < num_labels; ++l) {
        pooled.clear();
        for (const auto& versus : per_label_markers[l]) {
            pooled.insert(versus.begin(), versus.end());
        }

        // Translate each pooled marker so that it refers to a row of the test matrix.
        remapped.emplace_back();
        auto& current = remapped.back();
        current.reserve(pooled.size());
        for (auto m : pooled) {
            current.push_back(test_rows[m]);
        }
    }

    return prepared;
}


// Retained for back-compatibility only. This overload has no 'test_nrow'
// argument, so it forwards -1 (converted to Index_) in its place.
template<typename Index_, typename Value_, typename Label_, typename Float_>
TrainIntegratedInput<Value_, Index_, Label_> prepare_integrated_input_intersect(
    const Intersection<Index_>& intersection,
    const tatami::Matrix<Value_, Index_>& ref,
    const Label_* labels,
    const TrainedSingleIntersect<Index_, Float_>& trained)
{
    const Index_ unspecified_test_nrow = -1;
    return prepare_integrated_input_intersect<Index_, Value_, Label_, Float_>(unspecified_test_nrow, intersection, ref, labels, trained);
}

/**
 * Prepare a reference dataset for `train_integrated()` in the case where the
 * reference and test datasets have different rows that are matched by
 * comparing row identifiers.
 *
 * @tparam Index_ Integer type of the row/column indices.
 * @tparam Id_ Type of the row identifiers.
 * @tparam Value_ Numeric type of the values in the reference matrix.
 * @tparam Label_ Integer type of the labels.
 * @tparam Float_ Floating-point type used by the single-reference classifier.
 *
 * @param test_nrow Number of rows in the test dataset.
 * @param[in] test_id Pointer to the identifiers of the test rows.
 * @param ref Reference expression matrix. Only its address is stored.
 * @param[in] ref_id Pointer to the identifiers of the rows of `ref`.
 * @param[in] labels Pointer to the labels for the columns of `ref`.
 * @param trained Classifier previously trained on the intersection.
 *
 * @return Input object that owns the computed intersection in its
 * `auto_intersection` member and has `user_intersection` set to null.
 */
template<typename Index_, typename Id_, typename Value_, typename Label_, typename Float_>
TrainIntegratedInput<Value_, Index_, Label_> prepare_integrated_input_intersect(
    Index_ test_nrow,
    const Id_* test_id,
    const tatami::Matrix<Value_, Index_>& ref,
    const Id_* ref_id,
    const Label_* labels,
    const TrainedSingleIntersect<Index_, Float_>& trained)
{
    auto shared_rows = intersect_genes(test_nrow, test_id, ref.nrow(), ref_id);
    auto prepared = prepare_integrated_input_intersect(test_nrow, shared_rows, ref, labels, trained);

    // The call above recorded the address of our local 'shared_rows', which
    // would dangle after returning. Hand the contents over to the output
    // object and clear the pointer so that 'auto_intersection' is used.
    prepared.auto_intersection.swap(shared_rows);
    prepared.user_intersection = nullptr;

    return prepared;
}


/**
 * @brief Classifier that integrates multiple reference datasets.
 *
 * Instances are populated by `train_integrated()`.
 *
 * @tparam Index_ Integer type of the row/column indices.
 */
template<typename Index_>
class TrainedIntegrated {
public:
    /**
     * @return Number of reference datasets.
     */
    std::size_t num_references() const {
        return markers.size();
    }

    /**
     * @param r Index of the reference dataset of interest.
     * @return Number of labels in reference `r`.
     */
    std::size_t num_labels(std::size_t r) const {
        return markers[r].size();
    }

    /**
     * @param r Index of the reference dataset of interest.
     * @return Number of ranked profiles in reference `r`, summed over all of its labels.
     */
    std::size_t num_profiles(std::size_t r) const {
        const auto& per_label = ranked[r];
        std::size_t total = 0;
        for (auto it = per_label.begin(); it != per_label.end(); ++it) {
            total += it->size();
        }
        return total;
    }

public:
    // The members below are conceptually private; they are left public
    // to avoid having to declare templated friend functions.

    // Number of rows in the test dataset, or -1 (converted to Index_) if no input specified it.
    Index_ test_nrow;

    // Sorted test-row indices of all markers across all references.
    // To be used by classify_integrated() for indexed extraction.
    std::vector<Index_> universe;

    // Per reference: set to 1 if the reference was trained with an intersection,
    // in which case 'available' lists the usable entries of 'universe'.
    std::vector<uint8_t> check_availability;

    // Per reference: indices into 'universe' that are present in the reference.
    // Only filled for references trained with an intersection.
    std::vector<std::unordered_set<Index_> > available;

    // Per reference and label: markers as indices into 'universe'.
    std::vector<std::vector<std::vector<Index_> > > markers;

    // Per reference, label and sample: ranked expression, where .second contains indices into 'universe'.
    std::vector<std::vector<std::vector<internal::RankedVector<Index_, Index_> > > > ranked;
};


/**
 * @brief Options for `train_integrated()`.
 */
struct TrainIntegratedOptions {
    // Number of threads requested for the per-column work on each reference.
    // Defaults to a single thread.
    int num_threads{1};
};


namespace internal {


/**
 * Fill in the parts of a `TrainedIntegrated` object that belong to a single
 * reference: its markers (re-expressed as indices into the universe), its
 * ranked expression profiles and, for intersected references, the set of
 * universe entries that are actually available.
 *
 * @tparam Value_ Numeric type of the values in the reference matrix.
 * @tparam RefLabel_ Integer type used to index the references.
 * @tparam Input_ Type of the per-reference input, possibly const-qualified.
 * @tparam Index_ Integer type of the row/column indices.
 *
 * @param ref_i Index of the current reference within the per-reference
 * vectors of `output`.
 * @param curinput Input for the current reference. If `Input_` is const, its
 * markers are copied; otherwise they are swapped into `output`, leaving
 * `curinput.markers` with the previous contents of `output.markers[ref_i]`.
 * @param[in,out] output Object to be filled. Its `universe` must already be
 * complete, and its per-reference vectors must already have an entry for
 * `ref_i`.
 * @param remap_to_universe Map from a test-row index to its position in
 * `output.universe`. Every marker in `curinput` must be present as a key.
 * @param options Further options, namely the number of threads.
 */
template<typename Value_, typename RefLabel_, typename Input_, typename Index_>
void train_integrated_per_reference(
    RefLabel_ ref_i,
    Input_& curinput,
    TrainedIntegrated<Index_>& output,
    const std::unordered_map<Index_, Index_>& remap_to_universe, // by reference, to avoid copying the map for every reference.
    const TrainIntegratedOptions& options)
{
    auto curlab = curinput.labels;
    const auto& ref = *(curinput.ref);

    // Reindexing the markers so that they contain indices into the universe.
    // A const input cannot be modified so its markers have to be copied;
    // otherwise we can steal them via a swap.
    auto& curmarkers = output.markers[ref_i];
    if constexpr(std::is_const_v<Input_>) {
        curmarkers = curinput.markers;
    } else {
        curmarkers.swap(curinput.markers);
    }
    for (auto& outer : curmarkers) {
        for (auto& x : outer) {
            x = remap_to_universe.find(x)->second;
        }
    }

    // Pre-allocating the vectors of pre-ranked expression. 'positions[c]'
    // is the slot of column 'c' among the columns that share its label, so
    // that each column can be written independently of the others.
    auto& cur_ranked = output.ranked[ref_i];
    std::vector<Index_> positions;
    {
        auto nlabels = curmarkers.size();
        Index_ NC = ref.ncol();
        positions.reserve(NC);

        std::vector<Index_> samples_per_label(nlabels);
        for (Index_ c = 0; c < NC; ++c) {
            auto& pos = samples_per_label[curlab[c]];
            positions.push_back(pos);
            ++pos;
        }

        cur_ranked.resize(nlabels);
        for (decltype(nlabels) l = 0; l < nlabels; ++l) {
            cur_ranked[l].resize(samples_per_label[l]);
        }
    }

    if (!curinput.with_intersection) {
        // The universe is guaranteed to be sorted and unique, see its derivation
        // in internal::train_integrated() below. This means we can directly use it
        // for indexed extraction from a tatami::Matrix.
        tatami::VectorPtr<Index_> universe_ptr(tatami::VectorPtr<Index_>{}, &(output.universe));
        const Index_ num_universe = output.universe.size();

        tatami::parallelize([&](int, Index_ start, Index_ len) {
            std::vector<Value_> buffer(output.universe.size());
            internal::RankedVector<Value_, Index_> tmp_ranked;
            tmp_ranked.reserve(output.universe.size());
            auto ext = tatami::consecutive_extractor<false>(&ref, false, start, len, universe_ptr);

            for (Index_ c = start, end = start + len; c < end; ++c) {
                auto ptr = ext->fetch(buffer.data());

                // Pair each extracted value with its index into the universe.
                tmp_ranked.clear();
                for (Index_ i = 0; i < num_universe; ++i, ++ptr) {
                    tmp_ranked.emplace_back(*ptr, i);
                }
                std::sort(tmp_ranked.begin(), tmp_ranked.end());

                auto& final_ranked = cur_ranked[curlab[c]][positions[c]];
                simplify_ranks(tmp_ranked, final_ranked);
            }
        }, ref.ncol(), options.num_threads);

    } else {
        output.check_availability[ref_i] = 1;

        // Need to remap from indices on the test matrix to those in the current reference matrix
        // so that we can form an appropriate vector for indexed tatami extraction.
        const auto& intersection = (curinput.user_intersection == NULL ? curinput.auto_intersection : *(curinput.user_intersection));
        std::unordered_map<Index_, Index_> intersection_map;
        intersection_map.reserve(intersection.size());
        for (const auto& in : intersection) {
            intersection_map[in.first] = in.second;
        }

        std::vector<std::pair<Index_, Index_> > intersection_in_universe;
        intersection_in_universe.reserve(output.universe.size());
        auto& cur_available = output.available[ref_i];
        cur_available.reserve(output.universe.size());

        for (Index_ i = 0, end = output.universe.size(); i < end; ++i) {
            auto it = intersection_map.find(output.universe[i]);
            if (it != intersection_map.end()) {
                intersection_in_universe.emplace_back(it->second, i); // using 'i' as we want to work with indices into 'universe', not the indices of the universe itself.
                cur_available.insert(i);
            }
        }

        // Sorting by the reference row so that 'to_extract' is in increasing order.
        std::sort(intersection_in_universe.begin(), intersection_in_universe.end());

        std::vector<Index_> to_extract;
        to_extract.reserve(intersection_in_universe.size());
        for (const auto& p : intersection_in_universe) {
            to_extract.push_back(p.first);
        }
        tatami::VectorPtr<Index_> to_extract_ptr(tatami::VectorPtr<Index_>{}, &to_extract);

        tatami::parallelize([&](int, Index_ start, Index_ len) {
            std::vector<Value_> buffer(to_extract.size());
            internal::RankedVector<Value_, Index_> tmp_ranked;
            tmp_ranked.reserve(to_extract.size());
            auto ext = tatami::consecutive_extractor<false>(&ref, false, start, len, to_extract_ptr);

            for (Index_ c = start, end = start + len; c < end; ++c) {
                auto ptr = ext->fetch(buffer.data());

                tmp_ranked.clear();
                for (const auto& p : intersection_in_universe) {
                    tmp_ranked.emplace_back(*ptr, p.second); // remember, 'p.second' corresponds to indices into the universe.
                    ++ptr;
                }
                std::sort(tmp_ranked.begin(), tmp_ranked.end());

                auto& final_ranked = cur_ranked[curlab[c]][positions[c]];
                simplify_ranks(tmp_ranked, final_ranked);
            }
        }, ref.ncol(), options.num_threads);
    }
}


/**
 * Shared implementation behind the public `train_integrated()` overloads.
 *
 * @tparam Value_ Numeric type of the values in the reference matrices.
 * @tparam Index_ Integer type of the row/column indices.
 * @tparam Inputs_ Container of per-reference inputs, possibly const-qualified.
 *
 * @param inputs Per-reference inputs, one entry per reference.
 * @param options Further options.
 *
 * @return Classifier covering all references in `inputs`.
 *
 * @throws std::runtime_error if two entries of `inputs` specify different
 * numbers of rows for the test dataset.
 */
template<typename Value_, typename Index_, typename Inputs_>
TrainedIntegrated<Index_> train_integrated(Inputs_& inputs, const TrainIntegratedOptions& options) {
    TrainedIntegrated<Index_> trained;

    // One slot per reference in each of the per-reference containers.
    auto nrefs = inputs.size();
    trained.check_availability.resize(nrefs);
    trained.available.resize(nrefs);
    trained.markers.resize(nrefs);
    trained.ranked.resize(nrefs);

    // Checking that the number of genes in the test dataset is consistent.
    // Entries that report -1 did not specify a number and are ignored.
    const Index_ unspecified = static_cast<Index_>(-1);
    trained.test_nrow = unspecified;
    for (const auto& in : inputs) {
        if (trained.test_nrow == unspecified) {
            trained.test_nrow = in.test_nrow;
            continue;
        }
        const bool consistent = (in.test_nrow == unspecified || in.test_nrow == trained.test_nrow);
        if (!consistent) {
            throw std::runtime_error("inconsistent number of rows in the test dataset across entries of 'inputs'");
        }
    }

    // Identify the union of all marker genes, which is then sorted to
    // define the universe.
    std::unordered_set<Index_> all_markers;
    for (const auto& in : inputs) {
        for (const auto& mrk : in.markers) {
            all_markers.insert(mrk.begin(), mrk.end());
        }
    }

    auto& universe = trained.universe;
    universe.assign(all_markers.begin(), all_markers.end());
    std::sort(universe.begin(), universe.end());

    // Reverse lookup from a test row to its position in the universe.
    std::unordered_map<Index_, Index_> remap_to_universe;
    remap_to_universe.reserve(universe.size());
    for (Index_ i = 0, end = universe.size(); i < end; ++i) {
        remap_to_universe[universe[i]] = i;
    }

    for (decltype(nrefs) r = 0; r < nrefs; ++r) {
        train_integrated_per_reference<Value_>(r, inputs[r], trained, remap_to_universe, options);
    }

    return trained;
}


}

/**
 * Train an integrated classifier from multiple references.
 *
 * @tparam Value_ Numeric type of the values in the reference matrices.
 * @tparam Index_ Integer type of the row/column indices.
 * @tparam Label_ Integer type of the labels.
 *
 * @param inputs Per-reference inputs, taken by const reference and forwarded
 * as such to `internal::train_integrated()`.
 * @param options Further options.
 *
 * @return Classifier covering all references in `inputs`.
 */
template<typename Value_, typename Index_, typename Label_>
TrainedIntegrated<Index_> train_integrated(const std::vector<TrainIntegratedInput<Value_, Index_, Label_> >& inputs, const TrainIntegratedOptions& options) {
    auto trained = internal::train_integrated<Value_, Index_>(inputs, options);
    return trained;
}


/**
 * Train an integrated classifier from multiple references, for callers that
 * hand over `inputs` as an rvalue.
 *
 * @tparam Value_ Numeric type of the values in the reference matrices.
 * @tparam Index_ Integer type of the row/column indices.
 * @tparam Label_ Integer type of the labels.
 *
 * @param inputs Per-reference inputs. These are forwarded to
 * `internal::train_integrated()` as a non-const lvalue, so their contents
 * should not be relied upon after this call.
 * @param options Further options.
 *
 * @return Classifier covering all references in `inputs`.
 */
template<typename Value_, typename Index_, typename Label_>
TrainedIntegrated<Index_> train_integrated(std::vector<TrainIntegratedInput<Value_, Index_, Label_> >&& inputs, const TrainIntegratedOptions& options) {
    auto trained = internal::train_integrated<Value_, Index_>(inputs, options);
    return trained;
}


}


#endif

Intersection.hpp
Create an intersection of genes.

singlepp::TrainedIntegrated
Classifier that integrates multiple reference datasets.
Definition train_integrated.hpp:240

singlepp::TrainedIntegrated::num_references
std::size_t num_references() const
Definition train_integrated.hpp:245

singlepp::TrainedIntegrated::num_profiles
std::size_t num_profiles(std::size_t r) const
Definition train_integrated.hpp:261

singlepp::TrainedIntegrated::num_labels
std::size_t num_labels(std::size_t r) const
Definition train_integrated.hpp:253

singlepp::TrainedSingleIntersect
Classifier built from an intersection of genes.
Definition train_single.hpp:218

singlepp::TrainedSingleIntersect::get_markers
const Markers< Index_ > & get_markers() const
Definition train_single.hpp:260

singlepp::TrainedSingleIntersect::get_test_subset
const std::vector< Index_ > & get_test_subset() const
Definition train_single.hpp:269

singlepp::TrainedSingle
Classifier trained from a single reference.
Definition train_single.hpp:89

singlepp::TrainedSingle::get_subset
const std::vector< Index_ > & get_subset() const
Definition train_single.hpp:136

singlepp::TrainedSingle::get_markers
const Markers< Index_ > & get_markers() const
Definition train_single.hpp:128

defs.hpp
Common definitions for singlepp.

singlepp
Cell type classification using the SingleR algorithm in C++.
Definition classify_single.hpp:20

singlepp::intersect_genes
Intersection< Index_ > intersect_genes(Index_ test_nrow, const Id_ *test_id, Index_ ref_nrow, const Id_ *ref_id)
Definition Intersection.hpp:54

singlepp::prepare_integrated_input
TrainIntegratedInput< Value_, Index_, Label_ > prepare_integrated_input(const tatami::Matrix< Value_, Index_ > &ref, const Label_ *labels, const TrainedSingle< Index_, Float_ > &trained)
Definition train_integrated.hpp:75

singlepp::prepare_integrated_input_intersect
TrainIntegratedInput< Value_, Index_, Label_ > prepare_integrated_input_intersect(Index_ test_nrow, const Intersection< Index_ > &intersection, const tatami::Matrix< Value_, Index_ > &ref, const Label_ *labels, const TrainedSingleIntersect< Index_, Float_ > &trained)
Definition train_integrated.hpp:135

singlepp::Intersection
std::vector< std::pair< Index_, Index_ > > Intersection
Definition Intersection.hpp:35

singlepp::train_integrated
TrainedIntegrated< Index_ > train_integrated(const std::vector< TrainIntegratedInput< Value_, Index_, Label_ > > &inputs, const TrainIntegratedOptions &options)
Definition train_integrated.hpp:488

singlepp::TrainIntegratedInput
Input to train_integrated().
Definition train_integrated.hpp:34

singlepp::TrainIntegratedOptions
Options for train_integrated().
Definition train_integrated.hpp:290

singlepp::TrainIntegratedOptions::num_threads
int num_threads
Definition train_integrated.hpp:295

train_single.hpp
Train a classifier from a single reference.