Minimac4
Loading...
Searching...
No Matches
unique_haplotype.hpp
Go to the documentation of this file.
1#ifndef MINIMAC4_UNIQUE_HAPLOTYPE_HPP
2#define MINIMAC4_UNIQUE_HAPLOTYPE_HPP
3
4#include "variant.hpp"
5#include "recombination.hpp"
6
7#include <savvy/reader.hpp>
8#include <savvy/writer.hpp>
9
10#include <cstdint>
11#include <string>
12#include <vector>
13#include <deque>
14#include <limits>
15#include <cassert>
16
25{
26private:
34 std::vector<std::int64_t> unique_map_;
35
42 std::vector<std::size_t> cardinalities_;
43
51 std::vector<reference_variant> variants_;
52public:
104 bool compress_variant(const reference_site_info& site_info, const std::vector<std::int8_t>& alleles);
105
110 const std::vector<reference_variant>& variants() const { return variants_; }
111
120 const std::vector<std::int64_t>& unique_map() const { return unique_map_; }
121
129 std::size_t expanded_haplotype_size() const { return unique_map_.size(); }
130
139 std::size_t unique_haplotype_size() const { return variants_.empty() ? 0 : variants_[0].gt.size(); }
140
145 std::size_t variant_size() const { return variants_.size(); }
146
155 const std::vector<std::size_t>& cardinalities() const { return cardinalities_; }
156
171 void clear();
172
191 void trim(std::size_t min_pos, std::size_t max_pos);
192
202 void pop_variant();
203
213 void fill_cm(genetic_map_file& map_file);
214
226 void fill_cm_from_recom(double& start_cm);
227
273 bool deserialize(std::istream& is, int m3vcf_version, std::size_t n_haplotypes);
274
294 int deserialize(savvy::reader& input_file, savvy::variant& var); // return <0 is error, 0 is EOF, >0 good
295
312 bool serialize(savvy::writer& output_file);
313
324 void remove_eov();
325};
326
327
339{
340private:
341 std::vector<std::size_t> block_offsets_;
342 std::deque<unique_haplotype_block> blocks_;
343 std::size_t variant_count_ = 0;
344 std::size_t min_block_size_ = 1;
345 std::size_t max_block_size_ = std::numeric_limits<std::size_t>::max();
346 bool flush_block_ = true;
347public:
348
359 {
360 private:
361 const reduced_haplotypes* parent_;
362 std::size_t block_idx_;
363 std::size_t variant_idx_;
364 public:
365 iterator(const reduced_haplotypes& parent, std::size_t block_idx, std::size_t variant_idx) :
366 parent_(&parent),
367 block_idx_(block_idx),
368 variant_idx_(variant_idx)
369 {
370 }
371
373 {
374 ++variant_idx_;
375 assert(block_idx_ < parent_->blocks_.size());
376 if (variant_idx_ >= parent_->blocks_[block_idx_].variant_size())
377 {
378 ++block_idx_;
379 variant_idx_ = 0;
380 }
381 return *this;
382 }
383
385 {
386 iterator ret(*this);
387 ++(*this);
388 return ret;
389 }
390
392 {
393 if (variant_idx_ == 0)
394 {
395 --block_idx_;
396 if (block_idx_ < parent_->blocks_.size())
397 {
398 (*this) = iterator(*parent_, block_idx_, parent_->blocks_[block_idx_].variant_size());
399 --(*this);
400 }
401 }
402 else
403 {
404 --variant_idx_;
405 }
406
407 return *this;
408 }
409
411 {
412 iterator ret(*this);
413 --(*this);
414 return ret;
415 }
416
418 {
419 return parent_->blocks_[block_idx_].variants()[variant_idx_];
420 }
421
423 {
424 return &(parent_->blocks_[block_idx_].variants()[variant_idx_]);
425 }
426
427 std::size_t block_idx() const { return block_idx_; }
428 std::size_t block_local_idx() const { return variant_idx_; }
429 std::size_t global_idx() const { return parent_->block_offsets_[block_idx_] + variant_idx_; }
430 const std::vector<std::int64_t>& unique_map() const { return parent_->blocks_[block_idx_].unique_map(); }
431 const std::vector<std::size_t>& cardinalities() const { return parent_->blocks_[block_idx_].cardinalities(); }
432 };
433
434 iterator begin() const { return iterator(*this, 0, 0); }
435 iterator end() const { return iterator(*this, blocks_.size(), 0); }
436
438
448 reduced_haplotypes(std::size_t min_block_size, std::size_t max_block_size);
449
461 bool compress_variant(const reference_site_info& site_info, const std::vector<std::int8_t>& alleles, bool flush_block = false);
462
472 void append_block(const unique_haplotype_block& block);
473
480 void fill_cm(genetic_map_file& map_file);
481
491 float compression_ratio() const;
492
498 const std::deque<unique_haplotype_block>& blocks() const { return blocks_; }
499
505 std::size_t variant_size() const { return variant_count_; }
506};
507
519{
520 return lhs.block_idx() == rhs.block_idx() && lhs.block_local_idx() == rhs.block_local_idx();
521}
522
533{
534 return !(lhs == rhs);
535}
536
537
538
539#endif // MINIMAC4_UNIQUE_HAPLOTYPE_HPP
A reader and interpolator for genetic map files.
Definition recombination.hpp:211
Iterator for traversing variants within reduced_haplotypes.
Definition unique_haplotype.hpp:359
iterator operator++(int)
Definition unique_haplotype.hpp:384
iterator & operator--()
Definition unique_haplotype.hpp:391
const std::vector< std::size_t > & cardinalities() const
Definition unique_haplotype.hpp:431
iterator & operator++()
Definition unique_haplotype.hpp:372
iterator operator--(int)
Definition unique_haplotype.hpp:410
const reference_variant * operator->() const
Definition unique_haplotype.hpp:422
std::size_t global_idx() const
Definition unique_haplotype.hpp:429
iterator(const reduced_haplotypes &parent, std::size_t block_idx, std::size_t variant_idx)
Definition unique_haplotype.hpp:365
const reference_variant & operator*() const
Definition unique_haplotype.hpp:417
const std::vector< std::int64_t > & unique_map() const
Definition unique_haplotype.hpp:430
std::size_t block_idx() const
Definition unique_haplotype.hpp:427
std::size_t block_local_idx() const
Definition unique_haplotype.hpp:428
reduced_haplotypes()
Definition unique_haplotype.hpp:437
float compression_ratio() const
Calculates the overall compression ratio of all haplotype blocks.
Definition unique_haplotype.cpp:546
void fill_cm(genetic_map_file &map_file)
Fills the centimorgan (cM) values for all variants in all blocks using the provided genetic map.
Definition unique_haplotype.cpp:540
bool compress_variant(const reference_site_info &site_info, const std::vector< std::int8_t > &alleles, bool flush_block=false)
Compresses a variant into the current reduced haplotype block.
Definition unique_haplotype.cpp:482
std::size_t variant_size() const
Returns the total number of variants across all blocks.
Definition unique_haplotype.hpp:505
iterator begin() const
Definition unique_haplotype.hpp:434
iterator end() const
Definition unique_haplotype.hpp:435
void append_block(const unique_haplotype_block &block)
Appends a new haplotype block to the collection of reduced haplotype blocks.
Definition unique_haplotype.cpp:515
const std::deque< unique_haplotype_block > & blocks() const
Accesses the stored haplotype blocks.
Definition unique_haplotype.hpp:498
Represents a block of unique haplotypes and their variants.
Definition unique_haplotype.hpp:25
const std::vector< std::int64_t > & unique_map() const
Get the mapping from haplotypes to unique columns.
Definition unique_haplotype.hpp:120
std::size_t expanded_haplotype_size() const
Get the number of original haplotypes after expansion.
Definition unique_haplotype.hpp:129
void fill_cm(genetic_map_file &map_file)
Fills the centimorgan (cM) values for all variants in the haplotype block.
Definition unique_haplotype.cpp:162
void pop_variant()
Removes the last variant from the haplotype block.
Definition unique_haplotype.cpp:157
const std::vector< std::size_t > & cardinalities() const
Get the haplotype cardinalities across unique columns.
Definition unique_haplotype.hpp:155
void trim(std::size_t min_pos, std::size_t max_pos)
Trims variants outside a specified genomic range.
Definition unique_haplotype.cpp:130
bool compress_variant(const reference_site_info &site_info, const std::vector< std::int8_t > &alleles)
Compress and map haplotype alleles for a new variant into the block.
Definition unique_haplotype.cpp:10
std::size_t unique_haplotype_size() const
Get the number of unique haplotypes represented in the block.
Definition unique_haplotype.hpp:139
std::size_t variant_size() const
Get the number of variants compressed in this block.
Definition unique_haplotype.hpp:145
const std::vector< reference_variant > & variants() const
Get the list of compressed reference variants in this block.
Definition unique_haplotype.hpp:110
void remove_eov()
Removes "end-of-vector" markers from the unique haplotype map.
Definition unique_haplotype.cpp:283
void clear()
Clears all data stored in the haplotype block.
Definition unique_haplotype.cpp:123
void fill_cm_from_recom(double &start_cm)
Fills missing centimorgan (cM) values for variants using recombination probabilities.
Definition unique_haplotype.cpp:168
bool deserialize(std::istream &is, int m3vcf_version, std::size_t n_haplotypes)
Deserialize a unique haplotype block from an input stream (m3vcf format).
Definition unique_haplotype.cpp:300
bool serialize(savvy::writer &output_file)
Serializes the unique haplotype block to a SAVVY output file.
Definition unique_haplotype.cpp:181
Stores information about a site in the reference dataset.
Definition variant.hpp:40
Extends reference_site_info to include genotype data and allele counts.
Definition variant.hpp:76
bool operator==(const reduced_haplotypes::iterator &lhs, const reduced_haplotypes::iterator &rhs)
Compares two iterators for equality.
Definition unique_haplotype.hpp:518
bool operator!=(const reduced_haplotypes::iterator &lhs, const reduced_haplotypes::iterator &rhs)
Compares two iterators for inequality.
Definition unique_haplotype.hpp:532