Ymir  .9
Fast\C++toolforcomputationofassemblingprobabilities,statisticalinferenceofassemblingstatisticalmodelandgenerationofartificialsequencesofT-cellreceptorsdata.
insertionmodel.h
1 //
2 // Created by Vadim N. on 21/06/2015.
3 //
4 
5 #ifndef YMIR_INSERTIONMODEL_H
6 #define YMIR_INSERTIONMODEL_H
7 
8 #include <random>
9 #include <string>
10 
11 #include "types.h"
12 #include "tools.h"
13 
14 
15 namespace ymir {
16 
17  class AbstractInsertionModel;
18  class MonoNucInsertionModel;
19  class DiNucInsertionModel;
20 
21 
28 
29  public:
30 
31 
32  typedef unique_ptr<prob_t[]> prob_array_t;
33 
34 
36  {
37  }
38 
39 
40  AbstractInsertionModel(uint8_t size, prob_t err_prob = 0)
41  : _size(size),
42  _err_prob(err_prob),
43  _arr(new prob_t[size])
44  {
45  }
46 
47 
48  AbstractInsertionModel(uint8_t size, std::vector<prob_t>::const_iterator start, prob_t err_prob = 0)
49  : AbstractInsertionModel(size, err_prob)
50  {
51  this->updateProbabilities(start);
52  }
53 
54 
56  : _size(other._size),
57  _arr(new prob_t[other._size]),
58  _err_prob(other._err_prob)
59  {
60  this->updateProbabilities(other._arr.get());
61  }
62 
63 
64  AbstractInsertionModel& operator=(const AbstractInsertionModel &other)
65  {
66  _err_prob = other._err_prob;
67  _size = other._size;
68  if (_arr.get() != other._arr.get()) {
69  _arr.reset(new prob_t[_size]);
70  this->updateProbabilities(other._arr.get());
71  }
72  return *this;
73  }
74 
75 
76  virtual ~AbstractInsertionModel()
77  {
78  }
79 
80 
84  virtual prob_t nucProbability(const std::string& sequence,
86  char first_char = NULL_CHAR,
87  bool with_errors = false) const = 0;
88 
89  virtual prob_t nucProbability(std::string::const_iterator start,
90  seq_len_t sequence_len,
91  char first_char = NULL_CHAR,
92  bool with_errors = false) const = 0;
93 
94  virtual prob_t nucProbability(std::string::const_reverse_iterator start,
95  seq_len_t sequence_len,
96  char first_char = NULL_CHAR,
97  bool with_errors = false) const = 0;
99 
100 
104  virtual prob_t aaProbability(const std::string& sequence,
106  seq_len_t first_nuc_pos, // or an index of the codon?
107  seq_len_t last_nuc_pos,
108  char first_char = NULL_CHAR) const = 0;
109 
110  virtual prob_t aaProbability(std::string::const_iterator start,
111  seq_len_t first_nuc_pos,
112  seq_len_t last_nuc_pos,
113  char first_char = NULL_CHAR) const = 0;
114 
115  virtual prob_t aaProbability(std::string::const_reverse_iterator start,
116  seq_len_t first_nuc_pos,
117  seq_len_t last_nuc_pos,
118  char first_char = NULL_CHAR) const = 0;
120 
121 
125  virtual sequence_t generate(seq_len_t len, std::default_random_engine &rg, char first_char = NULL_CHAR, bool reverse = false) const = 0;
126 
127 
131  prob_t operator[](uint8_t index) const { return _arr[index]; }
134 
135 
136  prob_t err_prob() const { return _err_prob; }
137 
138  protected:
139 
140 // InsertionModelType _type;
141  prob_array_t _arr;
142  prob_t _err_prob;
143  uint8_t _size;
144 
145 
156  void updateProbabilities(std::vector<prob_t>::const_iterator start) {
158  for (uint8_t i = 0; i < _size; ++i, ++start) {
159  _arr[i] = *start;
160  }
161  }
162 
163  void updateProbabilities(prob_t *start) {
164  for (uint8_t i = 0; i < _size; ++i) {
165  _arr[i] = *(start + i);
166  }
167  }
168 
169  void updateProbabilities(std::vector<prob_t>::const_iterator start, prob_t err_prob) {
170  this->updateProbabilities(start);
171  _err_prob = err_prob;
172  }
173 
174  void updateProbabilities(prob_t *start, prob_t err_prob) {
175  this->updateProbabilities(start);
176  _err_prob = err_prob;
177  }
179 
180  };
181 
182 
184  public:
185 
187  : AbstractInsertionModel(4, 0)
188  {
189  }
190 
191 
192  MonoNucInsertionModel(prob_t err_prob)
193  : AbstractInsertionModel(4, err_prob)
194  {
195  }
196 
197 
198  MonoNucInsertionModel(std::vector<prob_t>::const_iterator start, prob_t err_prob = 0)
199  : AbstractInsertionModel(4, err_prob)
200  {
201  this->updateProbabilities(start);
202  }
203 
204 
206  : AbstractInsertionModel(other)
207  {
208  }
209 
210 
211  MonoNucInsertionModel& operator=(const MonoNucInsertionModel &other) {
212  AbstractInsertionModel::operator=(other);
213 // _err_prob = other._err_prob;
214 // if (_arr.get() != other._arr.get()) {
215 // _arr.reset(new prob_t[4]);
216 // this->updateProbabilities(other._arr.get());
217 // }
218  return *this;
219  }
220 
221 
222  prob_t nucProbability(const std::string& sequence,
223  char first_char = NULL_CHAR,
224  bool with_errors = false) const
225  {
226  return this->nucProbability(sequence.cbegin(), sequence.size(), first_char, with_errors);
227  }
228 
229  prob_t nucProbability(std::string::const_iterator start,
230  seq_len_t sequence_len,
231  char first_char = NULL_CHAR,
232  bool with_errors = false) const
233  {
234  prob_t res = 1;
235 
236  if (sequence_len) {
237  auto tmp = start;
238  if (!with_errors) {
239  for (seq_len_t i = 0; i < sequence_len; ++i, ++start) {
240  res *= _arr[nuc_hash(*start)];
241  }
242  } else {
243  for (seq_len_t i = 0; i < sequence_len; ++i, ++start) {
244  res *= (_err_prob + (1 - _err_prob) * _arr[nuc_hash(*start)]);
245  }
246  }
247  }
248 
249  return res;
250  }
251 
252  prob_t nucProbability(std::string::const_reverse_iterator start,
253  seq_len_t sequence_len,
254  char first_char = NULL_CHAR,
255  bool with_errors = false) const
256  {
257  prob_t res = 1;
258 
259  if (sequence_len) {
260  auto tmp = start;
261  if (!with_errors) {
262  for (seq_len_t i = 0; i < sequence_len; ++i, ++start) {
263  res *= _arr[nuc_hash(*start)];
264  }
265  } else {
266  for (seq_len_t i = 0; i < sequence_len; ++i, ++start) {
267  res *= (_err_prob + (1 - _err_prob) * _arr[nuc_hash(*start)]);
268  }
269  }
270  }
271 
272  return res;
273  }
274 
275 
279  prob_t aaProbability(const std::string& sequence,
281  char first_char = NULL_CHAR) const
282  {
283  return this->aaProbability(sequence.cbegin(), sequence.size(), first_char);
284  }
285 
286  prob_t aaProbability(std::string::const_iterator start,
287  seq_len_t sequence_len,
288  char first_char = NULL_CHAR) const
289  {
290  throw(std::runtime_error("not implemented yet"));
291  }
292 
293  prob_t aaProbability(std::string::const_reverse_iterator start,
294  seq_len_t sequence_len,
295  char first_char = NULL_CHAR) const
296  {
297  throw(std::runtime_error("not implemented yet"));
298  }
300 
301 
302  sequence_t generate(seq_len_t len, std::default_random_engine &rg, char first_char = NULL_CHAR, bool reverse = false) const {
303  std::string res = "";
304  if (len) {
305  std::discrete_distribution<int> distr;
306  distr = std::discrete_distribution<int>(_arr.get(), _arr.get() + 4);
307 
308  for (seq_len_t i = 0; i < len; ++i) {
309  res += inv_nuc_hash(distr(rg));
310  }
311  }
312 
313  return res;
314  }
315 
316  protected:
317 
318  };
319 
320 
322  public:
323 
324 
326  : AbstractInsertionModel(16, 0)
327  {
328  }
329 
330 
331  DiNucInsertionModel(prob_t err_prob)
332  : AbstractInsertionModel(16, err_prob)
333  {
334  }
335 
336 
337  DiNucInsertionModel(std::vector<prob_t>::const_iterator start, prob_t err_prob = 0)
338  : AbstractInsertionModel(16, err_prob)
339  {
340  this->updateProbabilities(start);
341  }
342 
343 
344  DiNucInsertionModel(const event_matrix_t& mat, prob_t err_prob = 0)
345  : AbstractInsertionModel(16, err_prob)
346  {
347  this->updateProbabilitiesMatrix(mat);
348  }
349 
350 
352  : AbstractInsertionModel(other)
353  {
354  }
355 
356 
357  DiNucInsertionModel& operator=(const DiNucInsertionModel &other) {
358  AbstractInsertionModel::operator=(other);
359 // _err_prob = other._err_prob;
360 // if (_arr.get() != other._arr.get()) {
361 // _arr.reset(new prob_t[16]);
362 // this->updateProbabilities(other._arr.get());
363 // }
364  return *this;
365  }
366 
367 
369  prob_t nucProbability(const std::string& sequence, char first_char = NULL_CHAR, bool with_errors = false) const {
370  return this->nucProbability(sequence.cbegin(), sequence.size(), first_char, with_errors);
371  }
372 
373  prob_t nucProbability(std::string::const_iterator start, seq_len_t sequence_len, char first_char = NULL_CHAR, bool with_errors = false) const {
374  prob_t res = 1;
375 
376  if (sequence_len) {
377  auto tmp1 = start, tmp2 = start + 1;
378  auto next = start + 1;
379  if (!with_errors) {
380  res = (first_char == NULL_CHAR) ? .25 : (*this)(nuc_hash(first_char), nuc_hash(*start));
381  for (seq_len_t i = 1; i < sequence_len; ++i, ++start, ++next) {
382  res *= (*this)(nuc_hash(*start), nuc_hash(*next));
383  }
384  } else {
385  prob_t err2 = _err_prob * _err_prob;
386  res = (first_char == NULL_CHAR) ? .25 : (err2 + (1 - err2) * (*this)(nuc_hash(first_char), nuc_hash(*start)));
387  for (seq_len_t i = 1; i < sequence_len; ++i, ++start, ++next) {
388  res *= (err2 + (1 - err2) * (*this)(nuc_hash(*start), nuc_hash(*next)));
389  }
390  }
391  }
392 
393  return res;
394  }
395 
396  prob_t nucProbability(std::string::const_reverse_iterator start, seq_len_t sequence_len, char first_char = NULL_CHAR, bool with_errors = false) const {
397  prob_t res = 1;
398 
399  if (sequence_len) {
400  auto tmp1 = start, tmp2 = start + 1;
401  auto next = start + 1;
402  if (!with_errors) {
403  res = (first_char == NULL_CHAR) ? .25 : (*this)(nuc_hash(first_char), nuc_hash(*start));
404  for (seq_len_t i = 1; i < sequence_len; ++i, ++start, ++next) {
405  res *= (*this)(nuc_hash(*start), nuc_hash(*next));
406  }
407  } else {
408  prob_t err2 = _err_prob * _err_prob;
409  res = (first_char == NULL_CHAR) ? .25 : (err2 + (1 - err2) * (*this)(nuc_hash(first_char), nuc_hash(*start)));
410  for (seq_len_t i = 1; i < sequence_len; ++i, ++start, ++next) {
411  res *= (err2 + (1 - err2) * (*this)(nuc_hash(*start), nuc_hash(*next)));
412  }
413  }
414  }
415 
416  return res;
417  }
419 
420 
422  prob_t aaProbability(const std::string& sequence,
423  char first_char = NULL_CHAR) const
424  {
425  return this->aaProbability(sequence.cbegin(), sequence.size(), first_char);
426  }
427 
428  prob_t aaProbability(std::string::const_iterator start,
429  seq_len_t sequence_len,
430  char first_char = NULL_CHAR) const
431  {
432  throw(std::runtime_error("not implemented yet"));
433  }
434 
435  prob_t aaProbability(std::string::const_reverse_iterator start,
436  seq_len_t sequence_len,
437  char first_char = NULL_CHAR) const
438  {
439  throw(std::runtime_error("not implemented yet"));
440  }
442 
443 
444  sequence_t generate(seq_len_t len, std::default_random_engine &rg, char first_char = NULL_CHAR, bool reverse = false) const {
445  std::string res = "";
446  if (len) {
447  std::discrete_distribution<int> distrs[] = {
448  std::discrete_distribution<int>(_arr.get(), _arr.get() + 4),
449  std::discrete_distribution<int>(_arr.get() + 4, _arr.get() + 8),
450  std::discrete_distribution<int>(_arr.get() + 8, _arr.get() + 12),
451  std::discrete_distribution<int>(_arr.get() + 12, _arr.get() + 16)
452  };
453 
454  if (first_char == NULL_CHAR) {
455  res = inv_nuc_hash(distrs[std::discrete_distribution<int>{.25, .25, .25, .25}(rg)](rg));
456  } else {
457  res = inv_nuc_hash(distrs[nuc_hash(first_char)](rg));
458  }
459  for (seq_len_t i = 1; i < len; ++i) {
460  res += inv_nuc_hash(distrs[nuc_hash(res[i - 1])](rg));
461  }
462 
463  if (reverse) { std::reverse(res.begin(), res.end()); }
464  }
465 
466  return res;
467  }
468 
469 
470  prob_t operator()(uint8_t row, uint8_t col) const { return _arr[4*row + col]; }
471 
472  protected:
473 
474  void updateProbabilitiesMatrix(const event_matrix_t& mat) {
475  for (uint8_t i = 0; i < 4; ++i) {
476  for (uint8_t j = 0; j < 4; ++j) {
477  _arr[4*i + j] = mat(i, j);
478  }
479  }
480  }
481 
482  void updateProbabilitiesMatrix(const event_matrix_t& mat, prob_t err_prob) {
483  this->updateProbabilitiesMatrix(mat);
484  _err_prob = err_prob;
485  }
486 
487  };
488 
489 
490  // amino acid insertion model
491 
492 }
493 
494 #endif //YMIR_INSERTIONMODEL_H
Definition: aligner.h:37
Class for representing VJ / VD / DJ insertions models - either a mono-nucleotide or a di-nucleotide m...
Definition: insertionmodel.h:27
Simple matrix class.
Definition: matrix.h:21
prob_t _err_prob
Definition: insertionmodel.h:142
virtual prob_t nucProbability(const std::string &sequence, char first_char=NULL_CHAR, bool with_errors=false) const =0
Probability of the given nucleotide sequence.
prob_t nucProbability(const std::string &sequence, char first_char=NULL_CHAR, bool with_errors=false) const
Probability of the given nucleotide sequence.
Definition: insertionmodel.h:369
prob_t nucProbability(const std::string &sequence, char first_char=NULL_CHAR, bool with_errors=false) const
Probability of the given nucleotide sequence.
Definition: insertionmodel.h:222
Definition: insertionmodel.h:321
Definition: insertionmodel.h:183