Ymir  .9
Fast\C++toolforcomputationofassemblingprobabilities,statisticalinferenceofassemblingstatisticalmodelandgenerationofartificialsequencesofT-cellreceptorsdata.
aligner.h
1 /*
2  * Ymir <imminfo.github.io/ymir>
3  *
4  * This file is part of Ymir, a fast C++ tool for computation of assembling
5  * probabilities, statistical inference of assembling statistical model
6  * and generation of artificial sequences of T-cell receptors data.
7  *
8  *
9  * Copyright 2015 Vadim Nazarov <vdn at mailbox dot com>
10  *
11  * Licensed under the Apache License, Version 2.0 (the "License");
12  * you may not use this file except in compliance with the License.
13  * You may obtain a copy of the License at
14  *
15  * http://www.apache.org/licenses/LICENSE-2.0
16  *
17  * Unless required by applicable law or agreed to in writing, software
18  * distributed under the License is distributed on an "AS IS" BASIS,
19  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20  * See the License for the specific language governing permissions and
21  * limitations under the License.
22  */
23 
24 #ifndef _ALIGNER_H_
25 #define _ALIGNER_H_
26 
27 
28 #include "aligner_parameters.h"
29 #include "alignment_matrix.h"
30 #include "clonotype_builder.h"
31 #include "genesegment.h"
32 
33 
34 using namespace std;
35 
36 
37 namespace ymir {
38 
42  template <typename AlignmentType, typename V_Aligner, typename D_Aligner, typename J_Aligner>
44 
45  public:
46 
51  {
52  }
53 
54 
59  VDJAlignerParameters params)
60  : _genes(genes), _params(params)
61  {
62  }
63 
64 
65  virtual ~VDJAlignerBase()
66  {
67  }
68 
69 
70  void set_genes(const VDJRecombinationGenes &genes) { _genes = genes; }
71 
72 
73  void set_parameters(VDJAlignerParameters params) { _params = params; }
74 
75 
79  AlignmentType alignVar(seg_index_t id, const sequence_t &sequence) const {
81  AlignmentType vec;
82  this->_alignVar(id, sequence, &vec);
83  return vec;
84  }
85 
86  AlignmentType alignDiv(seg_index_t id, const sequence_t &sequence) const {
87  AlignmentType vec;
88  this->_alignDiv(id, sequence, &vec);
89  return vec;
90  }
91 
92  AlignmentType alignJoi(seg_index_t id, const sequence_t &sequence) const {
93  AlignmentType vec;
94  this->_alignJoi(id, sequence, &vec);
95  return vec;
96  }
98 
99 
107  bool alignVar() {
110  for (seg_index_t id = 1; id <= _genes.V().max(); ++id) {
111  this->_alignVar(id, _sequence, &vec);
112  }
113  this->addVarAlignment(vec);
114  return vec.size() != 0;
115  }
116 
117  bool alignDiv() {
119  for (seg_index_t id = 1; id <= _genes.D().max(); ++id) {
120  vec.clear(); // TODO: some strange behaviour here after I added this line. Be careful. Maybe there is some bug in clear().
121  this->_alignDiv(id, _sequence, &vec);
122  if (vec.size()) {
123  this->addDivAlignment(vec);
124  }
125  }
126  return vec.size() != 0;
127  }
128 
129  bool alignJoi() {
131  for (seg_index_t id = 1; id <= _genes.J().max(); ++id) {
132  this->_alignJoi(id, _sequence, &vec);
133  }
134  this->addJoiAlignment(vec);
135  return vec.size() != 0;
136  }
138 
139 
140  protected:
141 
142  VDJAlignerParameters _params;
143  VDJRecombinationGenes _genes;
144 
145  V_Aligner _V_Aligner;
146  D_Aligner _D_Aligner;
147  J_Aligner _J_Aligner;
148 
149 
157  void _alignVar(seg_index_t id, const sequence_t &sequence, AlignmentType *vec) const {
159  _V_Aligner(id, _genes.V()[id].sequence, sequence, vec, _params);
160  }
161 
162  void _alignDiv(seg_index_t id, const sequence_t &sequence, AlignmentType *vec) const {
163  _D_Aligner(id, _genes.D()[id].sequence, sequence, vec, _params);
164  }
165 
166  void _alignJoi(seg_index_t id, const sequence_t &sequence, AlignmentType *vec) const {
167  _J_Aligner(id, _genes.J()[id].sequence, sequence, vec, _params);
168  }
170 
171  };
172 
173 
174  // class NaiveAminoAcidAligner : public AbstractAligner {
175 
176  // public:
177 
178  // NaiveAminoAcidAligner() { }
179 
180 
181  // virtual seq_len_t align5end(const string& pattern, const string& text) const {
182  // seq_len_t p_size = pattern.size(), t_size = text.size(), matches = 0, max_matches = 0, all_matches = 0;
183  // string codon_s = "";
184  // for (seq_len_t i = 0; i < std::min((seq_len_t) (1 + p_size / 3), t_size); ++i) {
185  // max_matches = 0;
186  // // go through all codons and find the maximal match
187  // CodonTable::Codons codon = _codons.codons(text[i]);
188  // while(!codon.end()) {
189  // matches = 0;
190  // codon_s = codon.next();
191  // if (pattern[i*3] == codon_s[0] && i*3 < p_size) {
192  // ++matches;
193  // if (pattern[i*3 + 1] == codon_s[1] && (i*3 + 1) < p_size) {
194  // ++matches;
195  // if (pattern[i*3 + 2] == codon_s[2] && (i*3 + 2) < p_size) {
196  // ++matches;
197  // }
198  // }
199  // }
200 
201  // if (matches > max_matches) {
202  // max_matches = matches;
203  // if (max_matches == 3) { break; }
204  // }
205  // }
206 
207  // // if match == 3 then go to the next amino acid
208  // all_matches += max_matches;
209  // if (max_matches != 3) { break; }
210  // }
211 
212  // return all_matches;
213  // }
214 
215 
216  // virtual seq_len_t align3end(const string& pattern, const string& text) const {
217  // seq_len_t p_size = pattern.size(), t_size = text.size(), matches = 0, max_matches = 0, all_matches = 0;
218  // string codon_s = "";
219  // for (seq_len_t i = 0; i < std::min((seq_len_t) (1 + p_size / 3), t_size); ++i) {
220  // max_matches = 0;
221  // // go through all codons and find the maximal match
222  // CodonTable::Codons codon = _codons.codons(text[t_size - i - 1]);
223  // while(!codon.end()) {
224  // matches = 0;
225  // codon_s = codon.next();
226  // if (pattern[p_size - 1 - i*3] == codon_s[2] && i*3 < p_size) {
227  // ++matches;
228  // if (pattern[p_size - 1 - i*3 - 1] == codon_s[1] && (i*3 + 1) < p_size) {
229  // ++matches;
230  // if (pattern[p_size - 1 - i*3 - 2] == codon_s[0] && (i*3 + 2) < p_size) {
231  // ++matches;
232  // }
233  // }
234  // }
235 
236  // if (matches > max_matches) {
237  // max_matches = matches;
238  // if (max_matches == 3) { break; }
239  // }
240  // }
241 
242  // // if match == 3 then go to the next amino acid
243  // all_matches += max_matches;
244  // if (max_matches != 3) { break; }
245  // }
246 
247  // return all_matches;
248  // }
249 
250 
251  // virtual LocalAlignmentIndices alignLocal(const string& pattern, const string& text, seq_len_t match_min_len = 3) const {
252  // return LocalAlignmentIndices(vector<seq_len_t>(1));
253  // }
254 
255  // protected:
256 
257  // const CodonTable _codons;
258 
259  // };
260 
261 
262  //
263  // CDR3-only alignment without errors - align V starting from the left edge,
264  // J starting from the right edge, and align D everywhere.
265  //
266 
267 
273  void operator()(seg_index_t gene,
274  const sequence_t &pattern,
275  const sequence_t &text,
276  NoGapAlignmentVector *avec,
277  const VDJAlignerParameters &params = VDJAlignerParameters()) const
278  {
279  seq_len_t p_size = pattern.size(), t_size = text.size(), matches = 0;
280 
281  for (seq_len_t i = 0; i < min(p_size, t_size); ++i) {
282  if (pattern[i] != text[i]) { break; }
283  matches += 1;
284  }
285 
286  avec->addAlignment(gene, 1, 1, matches);
287  }
288  };
289 
291  void operator()(seg_index_t gene,
292  const sequence_t &pattern,
293  const sequence_t &text,
294  NoGapAlignmentVector *avec,
295  const VDJAlignerParameters &params = VDJAlignerParameters()) const
296  {
297  seq_len_t match_min_len = params.min_D_len;
298  seq_len_t t_size = text.size(), p_size = pattern.size(), min_size = min(t_size, p_size), min_subsize;
299  bool open_match;
300  seq_len_t p_start, t_start;
301 
302  for (seq_len_t pattern_i = 0 /* WTF?! */ ; pattern_i < p_size - match_min_len + 1; ++pattern_i) {
303  open_match = false;
304  min_subsize = min(p_size - pattern_i, (int) t_size);
305  for (seq_len_t i = 0; i < min_subsize; ++i) {
306  if (pattern[pattern_i + i] == text[i]) {
307  if (!open_match) {
308  p_start = pattern_i + i;
309  t_start = i;
310  open_match = true;
311  }
312  } else if (open_match) {
313  if ((pattern_i + i - p_start) >= match_min_len) {
314  avec->addAlignment(gene, p_start + 1, t_start + 1, pattern_i + i - p_start);
315  }
316  open_match = false;
317  }
318  }
319  if (open_match && (pattern_i + min_subsize - p_start) >= match_min_len) {
320  avec->addAlignment(gene, p_start + 1, t_start + 1, pattern_i + min_subsize - p_start);
321  }
322  }
323 
324  for (seq_len_t text_i = 1; text_i < t_size - match_min_len + 1; ++text_i) {
325  open_match = false;
326  min_subsize = min((int) p_size, t_size - text_i);
327  for (seq_len_t i = 0; i < min_subsize; ++i) {
328  if (pattern[i] == text[text_i + i]) {
329  if (!open_match) {
330  p_start = i;
331  t_start = text_i + i;
332  open_match = true;
333  }
334  } else if (open_match) {
335  if ((i - p_start) >= match_min_len) {
336  avec->addAlignment(gene, p_start + 1, t_start + 1, i - p_start);
337  }
338  open_match = false;
339  }
340  }
341  if (open_match && (min_subsize - p_start) >= match_min_len) {
342  avec->addAlignment(gene, p_start + 1, t_start + 1, min_subsize - p_start);
343  }
344  }
345  }
346  };
347 
349  void operator()(seg_index_t gene,
350  const sequence_t &pattern,
351  const sequence_t &text,
352  NoGapAlignmentVector *avec,
353  const VDJAlignerParameters &params = VDJAlignerParameters()) const
354  {
355  seq_len_t p_size = pattern.size(), t_size = text.size(), matches = 0;
356 
357  for (seq_len_t i = 0; i < min(p_size, t_size); ++i) {
358  if (pattern[p_size - i - 1] != text[t_size - i - 1]) { break; }
359  matches += 1;
360  }
361 
362  avec->addAlignment(gene, p_size - matches + 1, t_size - matches + 1, matches);
363  }
364  };
366 
367 
375 
376 
377  //
378  // CDR3-only alignment with errors - align V starting from the left edge,
379  // J starting from the right edge, and align D everywhere.
380  //
381 
382 
386  struct CDR3AlignerFunctor_V {
388  void operator()(seg_index_t gene,
389  const sequence_t &pattern,
390  const sequence_t &text,
391  NoGapAlignmentVector *avec,
392  const VDJAlignerParameters &params = VDJAlignerParameters()) const
393  {
394  seq_len_t p_size = pattern.size(), t_size = text.size();
395  NoGapAlignment::events_storage_t vec;
396  vec.reserve(min(p_size, t_size) + 1);
397  alignment_score_t score = 0, val; //, max_score = 0;
398 
399  vec.push_back(pattern[0] != text[0]);
400  score += pattern[0] == text[0] ? params.score.v_score.match : params.score.v_score.mism;
401  for (seq_len_t i = 1; i < min(p_size, t_size); ++i) {
402  vec.push_back(pattern[i] != text[i]);
403 // val = pattern[i] == text[i] ? params.score.v_score.match : (params.score.v_score.mism - params.score.v_score.acc_mism*(pattern[i] != text[i]));
404  score += pattern[i] == text[i] ? (params.score.v_score.match + params.score.v_score.acc_match * (pattern[i - 1] == text[i - 1])) : params.score.v_score.mism;
405  // max_score = std::max(max_score, score);
406  }
407 
408  // if (max_score >= params.threshold.v_threshold) {
409  if (score >= params.threshold.v_threshold) {
410  avec->addAlignment(gene, 1, 1, vec);
411  }
412  }
413  };
414 
416  void operator()(seg_index_t gene,
417  const sequence_t &pattern,
418  const sequence_t &text,
419  NoGapAlignmentVector *avec,
420  const VDJAlignerParameters &params = VDJAlignerParameters()) const
421  {
422  seq_len_t match_min_len = params.min_D_len;
423  seq_len_t t_size = text.size(), p_size = pattern.size(), min_size = min(t_size, p_size), min_subsize;
424  seq_len_t p_start, t_start;
425  AlignmentVectorBase::events_storage_t bitvec;
426  bitvec.reserve(p_size + 1);
427 
428  for (seq_len_t pattern_i = 0; pattern_i < p_size - match_min_len + 1; ++pattern_i) {
429  min_subsize = min(p_size - pattern_i, (int) t_size);
430  if (min_subsize >= match_min_len) {
431  bitvec.resize(min_subsize);
432  for (seq_len_t i = 0; i < min_subsize; ++i) {
433  bitvec[i] = pattern[pattern_i + i] != text[i];
434  }
435  avec->addAlignment(gene, pattern_i + 1, 1, bitvec);
436  }
437  }
438 
439  for (seq_len_t text_i = 1; text_i < t_size - match_min_len + 1; ++text_i) {
440  min_subsize = min((int) p_size, t_size - text_i);
441  if (min_subsize >= match_min_len) {
442  bitvec.resize(min_subsize);
443  for (seq_len_t i = 0; i < min_subsize; ++i) {
444  bitvec[i] = pattern[i] != text[text_i + i];
445  }
446  avec->addAlignment(gene, 1, text_i + 1, bitvec);
447  }
448  }
449  }
450  };
451 
453  void operator()(seg_index_t gene,
454  const sequence_t &pattern,
455  const sequence_t &text,
456  NoGapAlignmentVector *avec,
457  const VDJAlignerParameters &params = VDJAlignerParameters()) const
458  {
459  seq_len_t p_size = pattern.size(), t_size = text.size();
460  NoGapAlignment::events_storage_t vec;
461  vec.reserve(min(p_size, t_size) + 1);
462  alignment_score_t score = 0, val; //, max_score = 0;
463 
464  vec.insert(vec.begin(), pattern[p_size - 1] != text[t_size - 1]);
465  for (seq_len_t i = 1; i < min(p_size, t_size); ++i) {
466  vec.insert(vec.begin(), pattern[p_size - i - 1] != text[t_size - i - 1]);
467 // val = pattern[p_size - i - 1] == text[t_size - i - 1] ? params.score.j_score.match : (params.score.j_score.mism - params.score.j_score.acc_mism*(pattern[p_size - i - 1] != text[t_size - i - 1]));
468  score += pattern[p_size - i - 1] == text[t_size - i - 1] ? (params.score.j_score.match + params.score.j_score.acc_match * (pattern[p_size - i] == text[t_size - i])) : params.score.j_score.mism;
469  // max_score = std::max(max_score, score);
470  }
471 
472  // if (max_score >= params.threshold.j_threshold) {
473  if (score >= params.threshold.j_threshold) {
474  avec->addAlignment(gene, p_size - min(t_size, p_size) + 1, t_size - min(t_size, p_size) + 1, vec);
475  }
476  }
477  };
479 
480 
484  typedef VDJAlignerBase<NoGapAlignmentVector,
488 
489 
490  //
491  // Classic Smith-Waterman
492  //
493 
494 
498  struct SWAlignerFunctor_VJ {
500  void operator()(seg_index_t gene,
501  const sequence_t &pattern,
502  const sequence_t &text,
503  GappedAlignmentVector *avec,
504  const VDJAlignerParameters &params = VDJAlignerParameters()) const
505  {
506  SWAlignmentMatrix mat(gene, pattern, text);
507 
508  mat.getBestAlignment(avec, pattern, text);
509  }
510 
511  private:
512 
513  SWAlignmentMatrix _matrix;
514 
515  };
516 
518  void operator()(seg_index_t gene,
519  const sequence_t &pattern,
520  const sequence_t &text,
521  GappedAlignmentVector *avec,
522  const VDJAlignerParameters &params = VDJAlignerParameters()) const
523  {
524  check_and_throw(false, "SWAlignerFunctor_D has not been implemented yet");
525  }
526 
527  private:
528 
529  SWAlignmentMatrix _matrix;
530 
531  };
533 
534 
543  SWAlignerFunctor_VJ> SmithWatermanAligner;
544 
545 
546  //
547  // Smith-Waterman with no gap allowed, but with errors
548  //
549 
550 
554  struct SWNGAlignerFunctor_V {
556  void operator()(seg_index_t gene,
557  const sequence_t &pattern,
558  const sequence_t &text,
559  NoGapAlignmentVector *avec,
560  const VDJAlignerParameters &params = VDJAlignerParameters()) const
561  {
562  SWNGAlignmentMatrix mat(gene, pattern, text);
563 
564  for (seq_len_t col_i = 0; col_i < text.size(); ++col_i) {
565  for (seq_len_t row_i = 0; row_i < pattern.size(); ++row_i) {
566  mat.score(row_i + 1, col_i + 1) = std::max({mat.score(row_i, col_i) + (text[col_i] == pattern[row_i] ? params.score.v_score.match : params.score.v_score.mism), .0});
567  }
568  }
569 
570  mat.getBestAlignment(avec, pattern, text);
571  }
572  };
573 
575  void operator()(seg_index_t gene,
576  const sequence_t &pattern,
577  const sequence_t &text,
578  NoGapAlignmentVector *avec,
579  const VDJAlignerParameters &params = VDJAlignerParameters()) const
580  {
581  // avec->addAlignment(gene, );
582  seq_len_t match_min_len = params.min_D_len;
583  seq_len_t t_size = text.size(), p_size = pattern.size(), min_size = min(t_size, p_size), min_subsize;
584  seq_len_t p_start, t_start;
585  AlignmentVectorBase::events_storage_t bitvec;
586  bitvec.reserve(p_size + 1);
587 
588  for (seq_len_t pattern_i = 0; pattern_i < p_size - match_min_len + 1; ++pattern_i) {
589  min_subsize = min(p_size - pattern_i, (int) t_size);
590  if (min_subsize >= match_min_len) {
591  bitvec.resize(min_subsize);
592  for (seq_len_t i = 0; i < min_subsize; ++i) {
593  bitvec[i] = pattern[pattern_i + i] != text[i];
594  }
595  avec->addAlignment(gene, pattern_i + 1, 1, bitvec);
596  }
597  }
598 
599  for (seq_len_t text_i = 1; text_i < t_size - match_min_len + 1; ++text_i) {
600  min_subsize = min((int) p_size, t_size - text_i);
601  if (min_subsize >= match_min_len) {
602  bitvec.resize(min_subsize);
603  for (seq_len_t i = 0; i < min_subsize; ++i) {
604  bitvec[i] = pattern[i] != text[text_i + i];
605  }
606  avec->addAlignment(gene, 1, text_i + 1, bitvec);
607  }
608  }
609  }
610  };
611 
613  void operator()(seg_index_t gene,
614  const sequence_t &pattern,
615  const sequence_t &text,
616  NoGapAlignmentVector *avec,
617  const VDJAlignerParameters &params = VDJAlignerParameters()) const
618  {
619  SWNGAlignmentMatrix mat(gene, pattern, text);
620 
621  for (seq_len_t col_i = 0; col_i < text.size(); ++col_i) {
622  for (seq_len_t row_i = 0; row_i < pattern.size(); ++row_i) {
623  mat.score(row_i + 1, col_i + 1) = std::max({mat.score(row_i, col_i) + (text[col_i] == pattern[row_i] ? params.score.j_score.match : params.score.j_score.mism), .0});
624  }
625  }
626 
627  mat.getBestAlignment(avec, pattern, text);
628  }
629  };
631 
632 
638  typedef VDJAlignerBase<NoGapAlignmentVector,
642 
643 }
644 
645 #endif
Definition: aligner.h:348
Definition: aligner.h:37
Definition: aligner.h:574
Definition: aligner_parameters.h:129
Definition: aligner.h:290
Definition: aligner.h:499
Definition: nogap_alignment_vector.h:37
Definition: aligner.h:43
void addAlignment(seg_index_t id, seq_len_t p_start, seq_len_t t_start, seq_len_t size)
Add a new alignment to the vector.
Definition: nogap_alignment_vector.h:55
Definition: aligner.h:452
Definition: genesegment.h:265
Definition: aligner.h:272
Definition: aligner.h:612
Definition: clonotype_builder.h:37
Definition: aligner.h:415
Definition: aligner.h:517
Definition: gapped_alignment_vector.h:52
Definition: aligner.h:555
Definition: aligner.h:387