GLnexus
Scalable datastore for population genome sequencing, with on-demand joint genotyping
 All Classes Functions Variables
types.h
1 #ifndef GLNEXUS_TYPES_H
2 #define GLNEXUS_TYPES_H
3 
4 #include <string>
5 #include <vector>
6 #include <map>
7 #include <set>
8 #include <memory>
9 #include <stdexcept>
10 #include <sstream>
11 #include <iomanip>
12 #include <iostream>
13 #include <assert.h>
14 #include <vcf.h>
15 #include <mutex>
16 #include "yaml-cpp/yaml.h"
17 
18 #define UNPAIR(p,nm1,nm2) auto nm1 = (p).first; auto nm2 = (p).second;
19 template<typename T> inline void ignore_retval(T) {}
20 
21 namespace GLnexus {
22 
23 enum class StatusCode { OK, FAILURE, INVALID, NOT_FOUND, EXISTS, IO_ERROR, NOT_IMPLEMENTED };
24 
26 
30 class Status {
31  StatusCode code_;
32  const char *msg_;
33  std::unique_ptr<std::string> noun_;
34 
35 public:
36 
38 
40  Status(StatusCode code = StatusCode::OK, const char* msg = nullptr) noexcept
41  : code_(code), msg_(msg) {}
42 
43 
45 
48  Status(StatusCode code, const char* msg, const std::string& noun)
49  : code_(code), msg_(msg) {
50  noun_ = std::make_unique<std::string>(noun);
51  }
52 
53  bool ok() const noexcept { return code_ == StatusCode::OK; }
54  bool bad() const noexcept { return !ok(); }
55  operator StatusCode() const noexcept { return code_; }
56  operator int() const noexcept { return static_cast<int>(code_); }
57 
58  static Status OK() noexcept { return Status(StatusCode::OK); }
59 
60  #define STATUS_SUGAR(nm,code) \
61  static Status nm(const char* msg = nullptr) noexcept { return Status(code, msg); } \
62  static Status nm(const char *msg, const std::string& noun) { return Status(code, msg, noun); }
63 
64  STATUS_SUGAR(Failure,StatusCode::FAILURE)
65  STATUS_SUGAR(Invalid,StatusCode::INVALID)
66  STATUS_SUGAR(NotFound,StatusCode::NOT_FOUND)
67  STATUS_SUGAR(Exists,StatusCode::EXISTS)
68  STATUS_SUGAR(IOError,StatusCode::IO_ERROR)
69  STATUS_SUGAR(NotImplemented,StatusCode::NOT_IMPLEMENTED)
70 
71  std::string str() const {
72  std::ostringstream ans;
73  switch (code_) {
74  case StatusCode::OK: ans << "OK"; break;
75  case StatusCode::INVALID: ans << "Invalid"; break;
76  case StatusCode::NOT_FOUND: ans << "NotFound"; break;
77  case StatusCode::EXISTS: ans << "Exists"; break;
78  case StatusCode::IO_ERROR: ans << "IOError"; break;
79  case StatusCode::NOT_IMPLEMENTED: ans << "NotImplemented"; break;
80  default: ans << "Failure";
81  }
82  if (msg_) {
83  ans << ": " << msg_;
84  if (noun_) {
85  ans << " (" << *noun_ << ")";
86  }
87  }
88  return ans.str();
89  }
90 };
91 
92 // Convenience macro for re-raising a bad status when no recovery/cleanup is needed
93 #define S(st) s = st; if (s.bad()) return s;
94 
96 bool is_dna(const std::string&);
97 
99 struct range {
100  int rid=-1, beg=-1, end=-1;
101 
102  range(int rid_, int beg_, int end_) noexcept : rid(rid_), beg(beg_), end(end_) {
103  if (beg_ > end_) {
104  throw std::invalid_argument("invalid range (beginning > end)");
105  }
106  }
107 
112  range(const bcf1_t* bcf) noexcept {
113  assert(bcf != nullptr);
114  rid = bcf->rid;
115  beg = bcf->pos;
116  end = bcf->pos+bcf->rlen;
117  }
118 
119  range(const std::shared_ptr<const bcf1_t>& bcf) noexcept : range(bcf.get()) {}
120 
121  size_t size() const noexcept { return end-beg; }
122 
123  bool operator==(const range& r) const noexcept { return rid == r.rid && beg == r.beg && end == r.end; }
124  bool operator!=(const range& r) const noexcept { return !(*this == r); }
125  bool operator<(const range& r) const noexcept {
126  return rid < r.rid || (rid == r.rid && (beg < r.beg || (beg == r.beg && end < r.end)));
127  }
128  bool operator<=(const range& r) const noexcept { return *this < r || *this == r; }
129 
130  bool overlaps(const range& r) const noexcept { return rid == r.rid && end > r.beg && beg < r.end; }
131  bool within(const range& r) const noexcept { return rid == r.rid && beg >= r.beg && end <= r.end; }
132  bool contains(const range& r) const noexcept { return r.within(*this); }
133 
134  std::unique_ptr<range> intersect(const range& r) const {
135  if (!overlaps(r)) return nullptr;
136  return std::make_unique<range>(rid, std::max(beg,r.beg), std::min(end,r.end));
137  }
138 
139  std::string str(const std::vector<std::pair<std::string,size_t> >& contigs) const {
140  std::ostringstream os;
141  if (rid >= 0 && rid < (int)contigs.size()) {
142  os << std::get<0>(contigs[rid]);
143  } else {
144  os << '<' << rid << '>';
145  }
146  os << ':' << (beg+1) << '-' << end;
147  return os.str();
148  }
149  std::string str() const {
150  return str({});
151  }
152 };
153 
154 struct allele {
155  range pos;
156  std::string dna;
157 
158  allele(const range& pos_, const std::string& dna_) : pos(pos_), dna(dna_) {
159  // Note; dna.size() may not equal pos.size(), for indel alleles
160  if (!is_dna(dna)) throw std::invalid_argument("allele(): invalid DNA " + dna);
161  }
162 
164  bool operator==(const allele& rhs) const noexcept { return pos == rhs.pos && dna == rhs.dna; }
165 
167  bool operator<(const allele& rhs) const noexcept { return pos < rhs.pos || (pos == rhs.pos && dna < rhs.dna); }
168  bool operator<=(const allele& rhs) const noexcept { return *this < rhs || *this == rhs; }
169 
170  std::string str() const {
171  std::ostringstream os;
172  os << pos.str() << "(" << dna << ")";
173  return os.str();
174  }
175 };
176 
177 struct discovered_allele_info {
178  bool is_ref;
179  float observation_count;
180 
181  bool operator==(const discovered_allele_info& rhs) const noexcept { return is_ref == rhs.is_ref && observation_count == rhs.observation_count; }
182 
183  std::string str() const {
184  std::ostringstream os;
185  os << "[ is_ref: " << std::boolalpha << is_ref << " observation count: " << std::setprecision(1) << observation_count << "]";
186  return os.str();
187  }
188 };
189 using discovered_alleles = std::map<allele,discovered_allele_info>;
190 Status merge_discovered_alleles(const discovered_alleles& src, discovered_alleles& dest);
191 Status yaml_of_discovered_alleles(const discovered_alleles&,
192  const std::vector<std::pair<std::string,size_t> >&,
193  YAML::Emitter&);
194 Status discovered_alleles_of_yaml(const YAML::Node&,
195  const std::vector<std::pair<std::string,size_t> >&,
196  discovered_alleles&);
197 
198 struct unified_site {
199  range pos;
200 
201 
203 
206  std::vector<std::string> alleles;
207 
208 
211  std::map<allele,int> unification;
212 
213  std::vector<float> observation_count;
214  //std::vector<float> genotype_prior;
215 
216  bool operator==(const unified_site& rhs) const noexcept {
217  return pos == rhs.pos && alleles == rhs.alleles && unification == rhs.unification
218  && observation_count == rhs.observation_count;
219  }
220  bool operator<(const unified_site& rhs) const noexcept{
221  if (pos != rhs.pos) return pos < rhs.pos;
222  if (alleles != rhs.alleles) return alleles < rhs.alleles;
223  if (unification != rhs.unification) return unification < rhs.unification;
224  return observation_count < rhs.observation_count;
225  }
226 
227  unified_site(const range& pos_) noexcept : pos(pos_) {}
228 
229  Status yaml(const std::vector<std::pair<std::string,size_t> >& contigs,
230  YAML::Emitter& out) const;
231  static Status of_yaml(const YAML::Node& yaml,
232  const std::vector<std::pair<std::string,size_t> >& contigs,
233  unified_site& ans);
234 };
235 
236 struct loss_stats {
237  int n_calls_total=0, n_bp_total=0;
238  int n_gvcf_calls_total=0, n_gvcf_bp_total=0;
239  int n_no_calls_total = 0;
240 
241  // captures loss of both vcf entries and gvcf entries
242  int n_calls_lost=0, n_bp_lost=0;
243 
244  // loss specific to gvcf entries
245  int n_gvcf_calls_lost=0, n_gvcf_bp_lost=0;
246 
247  // Merges another loss_stats and increment the count
248  // variables accordingly
249  void operator+=(const loss_stats& loss) {
250  n_calls_total += loss.n_calls_total;
251  n_bp_total += loss.n_bp_total;
252 
253  n_gvcf_calls_total += loss.n_gvcf_calls_total;
254  n_gvcf_bp_total += loss.n_gvcf_bp_total;
255 
256  n_calls_lost += loss.n_calls_lost;
257  n_bp_lost += loss.n_bp_lost;
258 
259  n_gvcf_calls_lost += loss.n_gvcf_calls_lost;
260  n_gvcf_bp_lost += loss.n_gvcf_bp_lost;
261 
262  n_no_calls_total += loss.n_no_calls_total;
263  }
264 
265  std::string str() const noexcept {
266  std::ostringstream ans;
267 
268  ans << n_no_calls_total << " no call(s).\n";
269 
270  // stop here if no no calls
271  if (!n_no_calls_total)
272  return ans.str();
273 
274  ans << "This is made up of a loss of " << n_calls_lost << " original call(s) which cover " << n_bp_lost << " bp.\n";
275  ans << "The loss is " << std::setprecision(3) << prop_calls_lost() << "% of " << n_calls_total << " calls; or " << prop_bp_lost() << "% of " << n_bp_total << " bp processed from the original dataset(s).\n";
276 
277  ans << "Looking at gvcf entries, there is a loss of " << n_gvcf_calls_lost << " call(s) which cover " << n_gvcf_bp_lost << " bp.\n";
278  ans << "The loss is " << prop_gvcf_calls_lost() << "% of " << n_gvcf_calls_total << " calls; or " << prop_gvcf_bp_lost() << "% of " << n_gvcf_bp_total << " bp of gvcf entries processed.\n";
279 
280 
281  return ans.str();
282  }
283 
284  // Returns proportion of calls lost as a percentage
285  float prop_calls_lost() const noexcept {
286  if (!n_calls_total) return 0;
287  return n_calls_lost / (float) n_calls_total * 100;
288  }
289 
290  // Returns proportion of bp coverage lost as a percentage
291  float prop_bp_lost() const noexcept {
292  if (!n_bp_total) return 0;
293  return n_bp_lost / (float) n_bp_total * 100;
294  }
295 
296  // Returns proportion of gvcf bp coverage lost as a percentage
297  float prop_gvcf_bp_lost() const noexcept {
298  if (!n_gvcf_bp_total) return 0;
299  return n_gvcf_bp_lost / (float) n_gvcf_bp_total * 100;
300  }
301 
302  // Returns proportion of gvcf calls lost as a percentage
303  float prop_gvcf_calls_lost() const noexcept {
304  if (!n_gvcf_calls_total) return 0;
305  return n_gvcf_calls_lost / (float) n_gvcf_calls_total * 100;
306  }
307 };
308 
309 // per-sample loss_stats
310 using consolidated_loss = std::map<std::string, loss_stats>;
311 Status merge_loss_stats(const consolidated_loss& src, consolidated_loss& dest);
312 
313 // Statistics collected during range queries
314 struct StatsRangeQuery {
315  int64_t nBCFRecordsRead; // how many BCF records were read from the DB
316  int64_t nBCFRecordsInRange; // how many were in the requested range
317 
318  // constructor
319  StatsRangeQuery() {
320  nBCFRecordsRead = 0;
321  nBCFRecordsInRange = 0;
322  }
323 
324  // copy constructor
325  StatsRangeQuery(const StatsRangeQuery &srq) {
326  nBCFRecordsRead = srq.nBCFRecordsRead;
327  nBCFRecordsInRange = srq.nBCFRecordsInRange;
328  }
329 
330  // Addition
331  StatsRangeQuery& operator+=(const StatsRangeQuery& srq) {
332  nBCFRecordsRead += srq.nBCFRecordsRead;
333  nBCFRecordsInRange += srq.nBCFRecordsInRange;
334  return *this;
335  }
336 
337  // return a human readable string
338  std::string str() {
339  std::ostringstream os;
340  os << "Num BCF records read " << std::to_string(nBCFRecordsRead)
341  << " query hits " << std::to_string(nBCFRecordsInRange);
342  return os.str();
343  }
344 };
345 
346 } //namespace GLnexus
347 
348 #endif
Genomic range (chromosome id, begin coordinate, end coordinate)
Definition: types.h:99
Status(StatusCode code=StatusCode::OK, const char *msg=nullptr) noexcept
Default Status constructor.
Definition: types.h:40
Function status (return) codes.
Definition: types.h:30
range(const bcf1_t *bcf) noexcept
Definition: types.h:112
Status(StatusCode code, const char *msg, const std::string &noun)
Extended Status constructor.
Definition: types.h:48