Minimac4
Loading...
Searching...
No Matches
dosage_writer.hpp
Go to the documentation of this file.
1#ifndef MINIMAC4_DOSAGE_WRITER_HPP
2#define MINIMAC4_DOSAGE_WRITER_HPP
3
4#include "variant.hpp"
6#include "hidden_markov_model.hpp" // TODO: change imputed data class
7
8#include <savvy/reader.hpp>
9#include <savvy/writer.hpp>
10
11#include <memory>
12
25{
26private:
/// Accumulator pairing a running sum with a count.
/// NOTE(review): presumably one entry per group in accuracy_stats_, with
/// the mean ER2 reported as er2_sum / n_var by print_mean_er2() — confirm
/// against dosage_writer.cpp.
31 struct accuracy_statistics
32 {
/// Running sum; starts at zero.
33 double er2_sum = 0.;
/// Number of contributions counted so far; starts at zero.
34 std::size_t n_var = 0;
35 };
36
37 savvy::writer out_file_;
38 std::unique_ptr<savvy::writer> emp_out_file_;
39 std::unique_ptr<savvy::writer> sites_out_file_;
40 std::unordered_set<std::string> fmt_field_set_;
41 std::vector<accuracy_statistics> accuracy_stats_;
42 std::size_t n_samples_ = 0;
43 float min_r2_ = -1.f;
44 bool is_temp_file_;
45
46 // buffers
47 savvy::compressed_vector<std::int8_t> sparse_gt_;
48 std::vector<float> dense_float_vec_;
49 std::vector<float> dense_zero_vec_;
50
/// Bundle of per-variant working buffers: sparse genotypes, sparse dosages
/// and a dense float vector.
/// NOTE(review): presumably scratch space reused while updating a variant
/// (gp_vec likely holds genotype probabilities) — confirm against the
/// implementation; no use of this struct is visible in this header.
55 struct variant_update_ctx
56 {
57 savvy::compressed_vector<std::int8_t> sparse_gt;
58 savvy::compressed_vector<float> sparse_dosages;
59 std::vector<float> gp_vec;
60 };
61public:
/// Construct a new dosage_writer.
///
/// NOTE(review): the parameter roles below are inferred from their names and
/// from the member fields of this class; confirm against dosage_writer.cpp.
/// @param file_path        Path of the main output file (presumably backs out_file_).
/// @param emp_file_path    Path of the empirical-dosage output (presumably backs emp_out_file_).
/// @param sites_file_path  Path of the sites-only output (presumably backs sites_out_file_).
/// @param file_format      savvy output file format.
/// @param out_compression  Output compression level.
/// @param sample_ids       Sample identifiers for the output.
/// @param fmt_fields       Names of the FORMAT fields to emit (presumably stored in fmt_field_set_).
/// @param chromosome       Chromosome name (also passed to the header generators).
/// @param min_r2           R2 threshold (presumably stored in min_r2_).
/// @param is_temp          Whether this writer produces a temporary file (presumably stored in is_temp_file_).
76 dosage_writer(const std::string& file_path, const std::string& emp_file_path, const std::string& sites_file_path,
77 savvy::file::format file_format,
78 std::uint8_t out_compression,
79 const std::vector<std::string>& sample_ids,
80 const std::vector<std::string>& fmt_fields,
81 const std::string& chromosome,
82 float min_r2, bool is_temp);
83
/// Merge temporary HDS and empirical dosage files into the final output.
/// This overload takes already-opened savvy readers.
/// NOTE(review): the meaning of the bool result (presumably success) is not
/// visible in this header — confirm.
116 bool merge_temp_files(std::list<savvy::reader>& temp_files, std::list<savvy::reader>& temp_emp_files);
/// Overload taking file paths instead of open readers.
/// NOTE(review): presumably opens each path and forwards to the reader
/// overload — confirm against dosage_writer.cpp.
117 bool merge_temp_files(std::list<std::string>& temp_file_paths, std::list<std::string>& temp_emp_file_paths);
118
211
/// Write imputed dosages and observed genotypes to output files.
/// NOTE(review): parameter semantics (e.g. whether observed_range is a
/// half-open index range into tar_variants) and the meaning of the bool
/// result are not visible in this header — confirm against dosage_writer.cpp.
212 bool write_dosages(const full_dosages_results& hmm_results, const std::vector<target_variant>& tar_variants, const std::vector<target_variant>& tar_only_variants, std::pair<std::size_t, std::size_t> observed_range, const reduced_haplotypes& full_reference_data, const savvy::region& impute_region);
213
/// Print the mean empirical R² (ER2) values to an output stream.
/// Const member: does not modify the writer.
/// @param os  Stream that receives the report.
244 void print_mean_er2(std::ostream& os) const;
245
246private:
247
/// Static helper operating on a sparse dosage vector taken by non-const
/// reference, so it may modify the vector in place.
/// NOTE(review): the name suggests missing dosages are replaced with the
/// mean; the meaning of the bool result is not visible here — confirm.
281 static bool mean_impute(savvy::compressed_vector<float>& sparse_dosages);
282
/// Build a list of (string, string) header pairs from the requested FORMAT
/// fields, the chromosome name and the temp-file flag.
/// NOTE(review): presumably (key, value) header lines for the main output
/// file — confirm against dosage_writer.cpp.
368 static std::vector<std::pair<std::string, std::string>> gen_headers(const std::vector<std::string>& fmt_fields, const std::string& chromosome, bool is_temp);
369
/// Build a list of (string, string) header pairs from the chromosome name.
/// NOTE(review): presumably the headers for the empirical-dosage output
/// file — confirm against dosage_writer.cpp.
435 static std::vector<std::pair<std::string, std::string>> gen_emp_headers(const std::string& chromosome);
436
/// Derive a savvy file format from a file name.
/// NOTE(review): presumably inspects the file extension and returns
/// default_format when nothing is recognised — confirm.
487 static savvy::file::format format_from_filename(const std::string& filename, savvy::file::format default_format);
488
/// Derive an integer compression level from a file name.
/// NOTE(review): presumably returns default_clevel when the file name does
/// not imply a level — confirm.
540 static int clevel_from_filename(const std::string& filename, int default_clevel);
541
/// Compare a target variant with a reference site.
/// NOTE(review): which fields are compared (position / alleles?) is not
/// visible in this header — confirm against dosage_writer.cpp.
574 static bool sites_match(const target_variant& t, const reference_site_info& r);
575
/// Predicate on a site record.
/// NOTE(review): presumably compares the site's R2 against min_r2_; the site
/// is taken by non-const reference, and whether it is modified is not
/// visible here — confirm.
612 bool has_good_r2(savvy::site_info& site);
613
/// Compute an ER2 value from five double accumulators and a count.
/// NOTE(review): s_x / s_xx / s_y / s_yy / s_xy look like the sums, sums of
/// squares and cross-product sum of two series over n observations (i.e. a
/// squared correlation) — confirm against dosage_writer.cpp.
653 static float calc_er2(double s_x, double s_xx, double s_y, double s_yy, double s_xy, std::size_t n);
654
/// Compute an R2 value from two double accumulators and a count.
/// NOTE(review): s_x / s_xx look like the sum and sum of squares of one
/// series over n observations — confirm the exact formula in
/// dosage_writer.cpp.
689 static float calc_r2(double s_x, double s_xx, std::size_t n);
690
/// Populate fields on an output variant from its sparse dosages,
/// leave-one-out dosages and observed genotypes.
/// NOTE(review): non-const member, so it may update writer state
/// (presumably accuracy_stats_); which INFO keys are written is not visible
/// here — confirm.
746 void set_info_fields(savvy::variant& out_var, const savvy::compressed_vector<float>& sparse_dosages, const std::vector<float>& loo_dosages, const std::vector<std::int8_t>& observed);
747
/// Populate fields on an output variant from its sparse dosages.
/// NOTE(review): sparse_dosages is taken by non-const reference and may be
/// modified; presumably emits the FORMAT fields selected in fmt_field_set_
/// — confirm.
798 void set_format_fields(savvy::variant& out_var, savvy::compressed_vector<float>& sparse_dosages);
799
/// Binary addition functor that skips missing values instead of letting them
/// propagate through a sum.
///
/// Two overloads are provided:
///  - float:        a NaN operand is treated as missing;
///  - std::int32_t: a negative operand is treated as missing.
///
/// If exactly one operand is missing, the other operand is returned
/// unchanged. If both are missing, the left operand is returned, so the
/// result is still a missing value.
///
/// The functor is stateless, so both call operators are const-qualified;
/// this lets it be invoked through const objects and const references.
/// Requires <cmath> for std::isnan.
struct plus_ignore_missing
{
  /// Sum of two floats where NaN marks a missing value.
  float operator()(const float& l, const float& r) const
  {
    if (std::isnan(r))
      return l; // right side missing: keep left (which may itself be NaN)
    if (std::isnan(l))
      return r; // only left side missing
    return l + r;
  }

  /// Sum of two 32-bit integers where any negative value marks a missing value.
  std::int32_t operator()(const std::int32_t& l, const std::int32_t& r) const
  {
    if (r < 0)
      return l; // right side missing: keep left (which may itself be negative)
    if (l < 0)
      return r; // only left side missing
    return l + r;
  }
};
846};
847
848#endif // MINIMAC4_DOSAGE_WRITER_HPP
void print_mean_er2(std::ostream &os) const
Print the mean empirical R² (ER2) values to an output stream.
Definition dosage_writer.cpp:576
bool merge_temp_files(std::list< std::string > &temp_file_paths, std::list< std::string > &temp_emp_file_paths)
dosage_writer(const std::string &file_path, const std::string &emp_file_path, const std::string &sites_file_path, savvy::file::format file_format, std::uint8_t out_compression, const std::vector< std::string > &sample_ids, const std::vector< std::string > &fmt_fields, const std::string &chromosome, float min_r2, bool is_temp)
Construct a new dosage_writer.
Definition dosage_writer.cpp:5
bool merge_temp_files(std::list< savvy::reader > &temp_files, std::list< savvy::reader > &temp_emp_files)
Merge temporary HDS and empirical dosage files into the final output.
Definition dosage_writer.cpp:191
bool write_dosages(const full_dosages_results &hmm_results, const std::vector< target_variant > &tar_variants, const std::vector< target_variant > &tar_only_variants, std::pair< std::size_t, std::size_t > observed_range, const reduced_haplotypes &full_reference_data, const savvy::region &impute_region)
Write imputed dosages and observed genotypes to output files.
Definition dosage_writer.cpp:413
full_dosages_results: Stores full and leave-one-out (LOO) dosages for imputed variants.
Definition hidden_markov_model.hpp:20
reduced_haplotypes: Represents a collection of haplotype blocks with reduced storage.
Definition unique_haplotype.hpp:339
reference_site_info: Stores information about a site in the reference dataset.
Definition variant.hpp:40
target_variant: Represents a variant in the target dataset.
Definition variant.hpp:18