Ymir  .9
Fast\C++toolforcomputationofassemblingprobabilities,statisticalinferenceofassemblingstatisticalmodelandgenerationofartificialsequencesofT-cellreceptorsdata.
maagbuilder.h
1 /*
2  * Ymir <imminfo.github.io/ymir>
3  *
4  * This file is part of Ymir, a fast C++ tool for computation of assembling
5  * probabilities, statistical inference of assembling statistical model
6  * and generation of artificial sequences of T-cell receptors data.
7  *
8  *
9  * Copyright 2015 Vadim Nazarov <vdn at mailbox dot com>
10  *
11  * Licensed under the Apache License, Version 2.0 (the "License");
12  * you may not use this file except in compliance with the License.
13  * You may obtain a copy of the License at
14  *
15  * http://www.apache.org/licenses/LICENSE-2.0
16  *
17  * Unless required by applicable law or agreed to in writing, software
18  * distributed under the License is distributed on an "AS IS" BASIS,
19  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20  * See the License for the specific language governing permissions and
21  * limitations under the License.
22  */
23 
24 #ifndef _MAAGBUILDER_H_
25 #define _MAAGBUILDER_H_
26 
27 #define DEFAULT_SEQ_POSES_RESERVE 300
28 
29 #define VARIABLE_GENES_MATRIX_INDEX 0
30 #define VARIABLE_DELETIONS_MATRIX_INDEX 1
31 #define JOINING_DELETIONS_VJ_MATRIX_INDEX 3
32 #define JOINING_GENES_VDJ_MATRIX_INDEX 6
33 #define JOINING_DELETIONS_VDJ_MATRIX_INDEX 5
34 #define DIVERSITY_GENES_MATRIX_INDEX 3
35 #define VarJoi_INSERTIONS_MATRIX_INDEX 2
36 #define VarDiv_INSERTIONS_MATRIX_INDEX 2
37 #define DivJoi_INSERTIONS_MATRIX_INDEX 4
38 
39 
40 #include "clonotype.h"
41 #include "genesegment.h"
42 #include "insertionmodel.h"
43 #include "maag.h"
44 #include "modelparametervector.h"
45 #include "repertoire.h"
46 
47 
48 namespace ymir {
49 
50  class MAAGBuilder;
51 
52 
56  class MAAGBuilder : protected MAAG {
57 
58  public:
59 
60 // static const size_t verbose_step = 10000;
61 
62 
66  MAAGBuilder(const ModelParameterVector &param_vec, const VDJRecombinationGenes &genes)
67  : MAAG(),
68  _param_vec(new ModelParameterVector(param_vec)),
69  _genes(new VDJRecombinationGenes(genes))
70  {
71  }
72 
73 
74  virtual ~MAAGBuilder()
75  {
76  }
77 
78 
79  void updateModelParameterVector(const ModelParameterVector &param_vec) {
80  *(_param_vec.get()) = param_vec;
81  }
82 
83 
94  MAAG build(const Clonotype &clonotype,
96  MetadataMode metadata_mode,
97  ErrorMode error_mode,
98  SequenceType seq_type = NUCLEOTIDE) const {
99  if (clonotype.is_good())
100  {
101  ProbMMC probs;
102  EventIndMMC events;
103  ErrMMC errors;
104  vector<seq_len_t> seq_poses;
105  seq_poses.reserve(DEFAULT_SEQ_POSES_RESERVE);
106 
107  auto resize_size = 0, e_resize_size = 0;
108  switch (clonotype.recombination()) {
109  case VJ_RECOMB:
110  resize_size = VJ_CHAIN_SIZE;
111  e_resize_size = 2;
112  break;
113 
114  case VDJ_RECOMB:
115  resize_size = VDJ_CHAIN_SIZE;
116  e_resize_size = 3;
117  break;
118 
119  default:
120 #ifndef DNDEBUG
121  check_and_throw(false, "MAAGBuilder: unknown recombination type.");
122 #endif
123  }
124 
125  probs.resize(resize_size);
126  if (metadata_mode) {
127  events.resize(resize_size);
128  }
129  if (error_mode) {
130  errors.resize(e_resize_size);
131  }
132 
133  this->buildVariable(clonotype, probs, events, errors, seq_poses, metadata_mode, error_mode);
134  this->buildJoining(clonotype, probs, events, errors, seq_poses, metadata_mode, error_mode);
135  if (clonotype.recombination() == VJ_RECOMB) {
136  this->buildVJinsertions(clonotype, probs, events, seq_poses, metadata_mode, error_mode);
137  } else if (clonotype.recombination() == VDJ_RECOMB) {
138  this->buildDiversity(clonotype, probs, events, errors, seq_poses, metadata_mode, error_mode);
139  this->buildVDinsertions(clonotype, probs, events, seq_poses, metadata_mode, error_mode);
140  this->buildDJinsertions(clonotype, probs, events, seq_poses, metadata_mode, error_mode);
141  }
142 
143 // if (error_mode && metadata_mode) {
144 //
145 // // TODO: deal with D deletions and insertions null matrices
146 // // - if the D deletions matrix contains only zeros, then remove this matrix
147 // // - if insertions matrices have columns / rows with only zero (!) events (!), then remove it
148 // // and remove the corresponding deletions rows / columns from neighbour matrices.
149 // //
150 // if (clonotype.recombination() == VDJ_RECOMB) {
151 //
152 // }
153 //
154 // unique_ptr<seq_len_t[]> seq_poses_arr(new seq_len_t[seq_poses.size()]);
155 // copy(seq_poses.begin(), seq_poses.end(), seq_poses_arr.get());
156 // return MAAG(probs, events, errors, clonotype.sequence(), seq_poses_arr, seq_poses.size(), seq_type);
157 // } else if (metadata_mode) {
158 // unique_ptr<seq_len_t[]> seq_poses_arr(new seq_len_t[seq_poses.size()]);
159 // copy(seq_poses.begin(), seq_poses.end(), seq_poses_arr.get());
160 // return MAAG(probs, events, clonotype.sequence(), seq_poses_arr, seq_poses.size(), seq_type);
161 // } else if (error_mode) {
162 // return MAAG(probs, errors);
163 // } else {
164 // return MAAG(probs);
165 // }
166 
167 // std::cout << clonotype.toString() << std::endl;
168 
169  probs.finish();
170 
171  MAAG maag;
172  maag._recomb = clonotype.recombination();
173  maag._seq_type = clonotype.sequence_type();
174  maag.swap(probs);
175  if (error_mode) {
176  errors.finish();
177 
178  maag._errors.reset(new ErrMMC());
179  maag._errors->swap(errors);
180  }
181  if (metadata_mode) {
182  events.finish();
183 
184  unique_ptr<seq_len_t[]> seq_poses_arr(new seq_len_t[seq_poses.size()]);
185  copy(seq_poses.begin(), seq_poses.end(), seq_poses_arr.get());
186  maag._sequence.reset(new sequence_t(clonotype.sequence()));
187  maag._seq_poses.swap(seq_poses_arr);
188  maag._n_poses = seq_poses.size();
189 
190  maag._events.reset(new EventIndMMC());
191  maag._events->swap(events);
192  }
193  return maag;
194  } else {
195  return MAAG();
196  }
197  }
198 
199  MAAGRepertoire build(const ClonesetView &cloneset,
200  MetadataMode metadata_mode,
201  ErrorMode error_mode,
202  SequenceType seq_type = NUCLEOTIDE,
203  bool verbose = true) const
204  {
205  size_t verbose_step;
206 
207  if (verbose) {
208  std::cout << "Building " << (size_t) cloneset.size() << " MAAGs..." << std::endl;
209  verbose_step = cloneset.size() / 10;
210  }
211 
212  MAAGRepertoire res;
213  res.resize(cloneset.size());
214 #ifdef USE_OMP
215 #if OMP_THREADS == -1
216  #pragma omp parallel for
217 #else
218  #pragma omp parallel for num_threads(OMP_THREADS)
219 #endif
220 #endif
221  for (size_t i = 0; i < cloneset.size(); ++i) {
222  res[i] = this->build(cloneset[i], metadata_mode, error_mode, seq_type);
223 
224 #ifndef USE_OMP
225  if (verbose && (i+1) % verbose_step == 0 && (i+1) != cloneset.size()) {
226  cout << "[" << (int) ((100*(i+1)) / cloneset.size()) << "%] "<< "Built " << (int) (i+1) << " / " << (int) (cloneset.size()) << " MAAGs." << endl;
227  }
228 #endif
229  }
230 
231  if (verbose) {
232  cout << "[100%] Built " << (int) (cloneset.size()) << " MAAGs." << endl;
233  }
234 
235  return res;
236  }
238 
239 
249  prob_t buildAndCompute(const Clonotype &clonotype,
251  ErrorMode error_mode,
252  SequenceType seq_type = NUCLEOTIDE,
253  MAAGComputeProbAction action = SUM_PROBABILITY) const {
254  return this->build(clonotype, NO_METADATA, error_mode, seq_type).fullProbability(action);
255  }
256 
257  vector<prob_t> buildAndCompute(const ClonesetView &cloneset,
258  ErrorMode error_mode,
259  SequenceType seq_type = NUCLEOTIDE,
260  MAAGComputeProbAction action = SUM_PROBABILITY,
261  bool verbose = true) const
262  {
263  vector<prob_t> res;
264  res.reserve(cloneset.size());
265  size_t verbose_step;
266 
267  if (verbose) {
268  std::cout << "Computing assembling probabilities on " << (size_t) cloneset.size() << " clonotypes." << std::endl;
269  verbose_step = cloneset.size() / 10;
270  }
271 
272 #ifdef USE_OMP
273 #if OMP_THREADS == -1
274  #pragma omp parallel for
275 #else
276  #pragma omp parallel for num_threads(OMP_THREADS)
277 #endif
278 #endif
279  for (size_t i = 0; i < cloneset.size(); ++i) {
280  res.push_back(buildAndCompute(cloneset[i], error_mode, seq_type, action));
281 
282 #ifndef USE_OMP
283  if (verbose && (i+1) % verbose_step == 0 && (i+1) != cloneset.size()) {
284  std::cout << "[" << (int) ((100*(i+1)) / cloneset.size()) << "%] " << "Computed " << (int) (i+1) << " / " << (size_t) cloneset.size() << " assembling probabilities." << std::endl;
285  }
286 #endif
287  }
288 
289  if (verbose) {
290  std::cout << "[100%] Computed " << (size_t) cloneset.size() << " assembling probabilities." << std::endl;
291  }
292 
293  return res;
294  }
296 
297 
304  void updateEventProbabilities(MAAG *maag) const {
306  if (maag->has_events()) {
307  vector<seq_len_t> seq_poses_vec(maag->_seq_poses.get(), maag->_seq_poses.get() + maag->_n_poses);
308  for (int node_i = 0; node_i < maag->chainSize(); ++node_i) {
309  // either rebuild all insertions
310  if (maag->is_vj() && node_i == VarJoi_INSERTIONS_MATRIX_INDEX) {
311  MonoNucInsertionModel im(_param_vec->get_iterator(_param_vec->event_index(VJ_VAR_JOI_INS_NUC, 0, 0))); // TODO: add errors here?
312 
313  seq_len_t v_vertices = maag->nodeColumns(VARIABLE_DELETIONS_MATRIX_INDEX),
314  j_vertices = maag->nodeRows(JOINING_DELETIONS_VJ_MATRIX_INDEX);
315 
316  this->buildInsertions(maag->sequence(),
317  *maag,
318  *maag->_events,
319  seq_poses_vec,
320  VarJoi_INSERTIONS_MATRIX_INDEX,
321  _param_vec->event_index(VJ_VAR_JOI_INS_LEN, 0, 0),
322  _param_vec->max_VJ_ins_len(),
323  NO_METADATA,
324  maag->has_errors(),
325  0,
326  v_vertices - 1,
327  v_vertices,
328  v_vertices + j_vertices - 1,
329  im,
330  false);
331 
332  } else if (maag->is_vdj() && node_i == VarDiv_INSERTIONS_MATRIX_INDEX) {
333  DiNucInsertionModel im(_param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_NUC, 0, 0))); // TODO: add errors here?
334 
335  seq_len_t v_vertices = maag->nodeColumns(VARIABLE_DELETIONS_MATRIX_INDEX),
336  d3_vertices = maag->nodeRows(DIVERSITY_GENES_MATRIX_INDEX);
337 
338  this->buildInsertions(maag->sequence(),
339  *maag,
340  *maag->_events,
341  seq_poses_vec,
342  VarDiv_INSERTIONS_MATRIX_INDEX,
343  _param_vec->event_index(VDJ_VAR_DIV_INS_LEN, 0, 0),
344  _param_vec->max_VD_ins_len(),
345  NO_METADATA,
346  maag->has_errors(),
347  0,
348  v_vertices - 1,
349  v_vertices,
350  v_vertices + d3_vertices - 1,
351  im,
352  false);
353 
354  } else if (maag->is_vdj() && node_i == DivJoi_INSERTIONS_MATRIX_INDEX) {
355  DiNucInsertionModel im(_param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_NUC, 0, 0)));
356 
357  seq_len_t v_vertices = maag->nodeColumns(VARIABLE_DELETIONS_MATRIX_INDEX),
358  d3_vertices = maag->nodeRows(DIVERSITY_GENES_MATRIX_INDEX),
359  d5_vertices = maag->nodeColumns(DIVERSITY_GENES_MATRIX_INDEX),
360  j_vertices = maag->nodeRows(JOINING_DELETIONS_VDJ_MATRIX_INDEX);
361 
362  this->buildInsertions(maag->sequence(),
363  *maag,
364  *maag->_events,
365  seq_poses_vec,
366  DivJoi_INSERTIONS_MATRIX_INDEX,
367  _param_vec->event_index(VDJ_DIV_JOI_INS_LEN, 0, 0),
368  _param_vec->max_DJ_ins_len(),
369  NO_METADATA,
370  maag->has_errors(),
371  v_vertices + d3_vertices,
372  v_vertices + d3_vertices + d5_vertices - 1,
373  v_vertices + d3_vertices + d5_vertices,
374  v_vertices + d3_vertices + d5_vertices + j_vertices - 1,
375  im,
376  true);
377 
378  } else {
379  // or just replace all event probabilities with the new ones
380  if (!maag->has_errors()) {
381  for (int mat_i = 0; mat_i < maag->nodeSize(node_i); ++mat_i) {
382  for (int row_i = 0; row_i < maag->nodeRows(node_i); ++row_i) {
383  for (int col_i = 0; col_i < maag->nodeColumns(node_i); ++col_i) {
384  (*maag)(node_i, mat_i, row_i, col_i) =
385  (*_param_vec)[maag->event_index(node_i, mat_i, row_i, col_i)];
386  }
387  }
388  }
389  } else {
390  int err_node_i = 0;
391  if (maag->is_vj()) {
392  err_node_i = node_i == JOINING_DELETIONS_VJ_MATRIX_INDEX ? 1 : 0;
393  } else {
394  if (node_i != VARIABLE_DELETIONS_MATRIX_INDEX) {
395  if (node_i == JOINING_DELETIONS_VDJ_MATRIX_INDEX) {
396  err_node_i = 2;
397  } else {
398  err_node_i = 1;
399  }
400  }
401  }
402 
403  if (node_i != 0 && (maag->is_vdj() && node_i != JOINING_GENES_VDJ_MATRIX_INDEX)) {
404  for (int mat_i = 0; mat_i < maag->nodeSize(node_i); ++mat_i) {
405  for (int row_i = 0; row_i < maag->nodeRows(node_i); ++row_i) {
406  for (int col_i = 0; col_i < maag->nodeColumns(node_i); ++col_i) {
407  if (maag->errors(err_node_i, mat_i, row_i, col_i)) {
408  (*maag)(node_i, mat_i, row_i, col_i) =
409  (*_param_vec)[maag->event_index(node_i, mat_i, row_i, col_i)]
410  * _param_vec->error_prob() * maag->errors(err_node_i, mat_i, row_i, col_i);
411  } else {
412  (*maag)(node_i, mat_i, row_i, col_i) = (*_param_vec)[maag->event_index(node_i, mat_i, row_i, col_i)];
413  }
414  }
415  }
416  }
417  } else {
418  for (int mat_i = 0; mat_i < maag->nodeSize(node_i); ++mat_i) {
419  for (int row_i = 0; row_i < maag->nodeRows(node_i); ++row_i) {
420  for (int col_i = 0; col_i < maag->nodeColumns(node_i); ++col_i) {
421  (*maag)(node_i, mat_i, row_i, col_i) = (*_param_vec)[maag->event_index(node_i, mat_i, row_i, col_i)];
422  }
423  }
424  }
425  }
426 
427  }
428  }
429  }
430  }
431  }
432 
433  void updateEventProbabilities(MAAGRepertoire *repertoire, bool verbose = true) const {
434  size_t verbose_step;
435 
436  if (verbose) {
437  std::cout << "Updating " << (size_t) repertoire->size() << " MAAGs..." << std::endl;
438  verbose_step = repertoire->size() / 10;
439  }
440 
441 #ifdef USE_OMP
442 #if OMP_THREADS == -1
443  #pragma omp parallel for
444 #else
445  #pragma omp parallel for num_threads(OMP_THREADS)
446 #endif
447 #endif
448  for (size_t i = 0; i < repertoire->size(); ++i) {
449  this->updateEventProbabilities(&(*repertoire)[i]);
450 
451 #ifndef USE_OMP
452  if (verbose && (i+1) % verbose_step == 0 && (i+1) != repertoire->size()) {
453  cout << "[" << (int) ((100*(i+1)) / repertoire->size()) << "%] " << "Updated " << (size_t) (i+1) << " / " << (size_t) repertoire->size() << " MAAGs." << endl;
454  }
455 #endif
456  }
457 
458  if (verbose) {
459  cout << "[100%] Updated " << (int) (repertoire->size()) << " MAAGs." << endl;
460  }
461  }
463 
464 
465  protected:
466 
467  unique_ptr<ModelParameterVector> _param_vec; // or just copy it?
468  unique_ptr<VDJRecombinationGenes> _genes; // copy this too?
469 
470 
474  MAAGBuilder() : _param_vec(nullptr), _genes(nullptr) {}
475 
476 
486  void buildVariable(const Clonotype &clonotype,
487  ProbMMC &probs,
488  EventIndMMC &events,
489  ErrMMC &errors,
490  vector<seq_len_t> &seq_poses,
491  MetadataMode metadata_mode,
492  ErrorMode error_mode) const
493  {
494  // find max V alignment
495  seq_len_t len = 0;
496  seg_index_t v_num = clonotype.nVar(), j_num = clonotype.nJoi();
497  for (int v_index = 0; v_index < v_num; ++v_index) {
498  len = std::max(len, clonotype.getVarLen(v_index));
499  }
500 
501  // compute V deletions
502  seq_len_t v_len, v_gene, v_start, v_end;
503 
504  probs.initNode(VARIABLE_DELETIONS_MATRIX_INDEX, v_num, 1, len + 1);
505  if (metadata_mode) {
506  events.initNode(VARIABLE_DELETIONS_MATRIX_INDEX, v_num, 1, len + 1);
507  }
508  if (error_mode) {
509  errors.initNode(0, v_num, 1, len + 1);
510  }
511 
512  if (clonotype.recombination() == VJ_RECOMB) {
513  probs.initNode(VARIABLE_GENES_MATRIX_INDEX, 1, v_num, j_num);
514  if (metadata_mode) {
515  events.initNode(VARIABLE_GENES_MATRIX_INDEX, 1, v_num, j_num);
516  }
517  } else if (clonotype.recombination() == VDJ_RECOMB) {
518  probs.initNode(VARIABLE_GENES_MATRIX_INDEX, v_num, 1, 1);
519  if (metadata_mode) {
520  events.initNode(VARIABLE_GENES_MATRIX_INDEX, v_num, 1, 1);
521  }
522  }
523 
524  EventClass V_DEL = clonotype.recombination() == VJ_RECOMB ? VJ_VAR_DEL : VDJ_VAR_DEL;
525  for (seg_index_t v_index = 0; v_index < v_num; ++v_index) {
526 
527  // probability of choosing this V gene segment
528  v_gene = clonotype.getVar(v_index);
529 
530  if (clonotype.recombination() == VJ_RECOMB) {
531  for (seg_index_t j_index = 0; j_index < j_num; ++j_index) {
532  probs(VARIABLE_GENES_MATRIX_INDEX, 0, v_index, j_index)
533  = _param_vec->event_prob(VJ_VAR_JOI_GEN, 0, v_gene - 1, clonotype.getJoi(j_index) - 1);
534  }
535  } else if (clonotype.recombination() == VDJ_RECOMB) {
536  probs(VARIABLE_GENES_MATRIX_INDEX, v_index, 0, 0) = _param_vec->event_prob(VDJ_VAR_GEN, 0, v_gene - 1); // probability of choosing this V gene segment
537  }
538 
539  // V deletions
540  v_len = _genes->V()[v_gene].sequence.size();
541  v_start = clonotype.getVarGeneStart(v_index);
542  v_end = clonotype.getVarGeneEnd(v_index);
543 
544  for (seq_len_t i = 0; i < v_end - v_start + 2; ++i) {
545  probs(VARIABLE_DELETIONS_MATRIX_INDEX, v_index, 0, i) = _param_vec->event_prob(V_DEL, v_gene - 1, (1 + v_len) - (v_start + i)); // probability of deletions
546  }
547 
548  if (metadata_mode) {
549  if (clonotype.recombination() == VJ_RECOMB) {
550  // probability of choosing this V gene segment
551  for (seg_index_t j_index = 0; j_index < j_num; ++j_index) {
552  events(VARIABLE_GENES_MATRIX_INDEX, 0, v_index, j_index)
553  = _param_vec->event_index(VJ_VAR_JOI_GEN, 0, v_gene - 1, clonotype.getJoi(j_index) - 1);
554  }
555  } else if (clonotype.recombination() == VDJ_RECOMB) {
556  events(VARIABLE_GENES_MATRIX_INDEX, v_index, 0, 0) = _param_vec->event_index(VDJ_VAR_GEN, 0, v_gene - 1);
557  }
558 
559  for (seq_len_t i = 0; i < v_end - v_start + 2; ++i) {
560  events(VARIABLE_DELETIONS_MATRIX_INDEX, v_index, 0, i) = _param_vec->event_index(V_DEL, v_gene - 1, (1 + v_len) - (v_start + i));
561  }
562  }
563 
564  if (error_mode) {
565  errors(0, v_index, 0, 0) = 0;
566  for (seq_len_t i = 1; i <= clonotype.getVarLen(v_index); ++i) {
567  errors(0, v_index, 0, i) = errors(0, v_index, 0, i-1) + clonotype.isVarMismatch(v_index, i);
568  if (errors(0, v_index, 0, i)) {
569  probs(VARIABLE_DELETIONS_MATRIX_INDEX, v_index, 0, i) *= errors(0, v_index, 0, i) * _param_vec->error_prob();
570  }
571  }
572  }
573  }
574 
575  for (seq_len_t i = 0; i <= len; ++i) { seq_poses.push_back(i); }
576  }
577 
578 
588  void buildJoining(const Clonotype &clonotype,
589  ProbMMC &probs,
590  EventIndMMC &events,
591  ErrMMC &errors,
592  vector<seq_len_t> &seq_poses,
593  MetadataMode metadata_mode,
594  ErrorMode error_mode) const
595  {
596  int J_index_dels = JOINING_DELETIONS_VJ_MATRIX_INDEX,
597  J_index_genes = JOINING_GENES_VDJ_MATRIX_INDEX;
598  if (clonotype.recombination() == VDJ_RECOMB) {
599  J_index_dels = JOINING_DELETIONS_VDJ_MATRIX_INDEX;
600  }
601 
602  // find max J alignment
603  seg_index_t j_num = clonotype.nJoi();
604  seq_len_t len = 0, seq_global_start_pos = (seq_len_t) -1;
605  for (int j_index = 0; j_index < j_num; ++j_index) {
606  len = std::max(len, clonotype.getJoiLen(j_index));
607  seq_global_start_pos = std::min(seq_global_start_pos, clonotype.getJoiSeqStart(j_index));
608  }
609 
610  // add J deletions nodes
611  probs.initNode(J_index_dels, j_num, len + 1, 1);
612  if (metadata_mode) {
613  events.initNode(J_index_dels, j_num, len + 1, 1);
614  }
615  if (error_mode) {
616  errors.initNode(errors.chainSize() - 1, j_num, len + 1, 1);
617  }
618 
619  // add J or J-D gene nodes
620  if (clonotype.recombination() == VDJ_RECOMB) {
621  probs.initNode(J_index_genes, 1, j_num, clonotype.nDiv());
622  if (metadata_mode) {
623  events.initNode(J_index_genes, 1, j_num, clonotype.nDiv());
624  }
625  }
626 
627  // compute J deletions
628  seq_len_t j_len, j_gene, j_start, j_end, shift;
629 
630  EventClass J_DEL = clonotype.recombination() == VJ_RECOMB ? VJ_JOI_DEL : VDJ_JOI_DEL;
631  for (seg_index_t j_index = 0; j_index < j_num; ++j_index) {
632  // probability of choosing the J segment
633  j_gene = clonotype.getJoi(j_index);
634  j_len = _genes->J()[j_gene].sequence.size();
635 
636  if (clonotype.recombination() == VDJ_RECOMB) {
637  for (seg_index_t d_index = 0; d_index < clonotype.nDiv(); ++d_index) {
638  probs(J_index_genes, 0, j_index, d_index)
639  = _param_vec->event_prob(VDJ_JOI_DIV_GEN, 0, j_gene - 1, clonotype.getDiv(d_index) - 1); // probability of choosing this J gene segment with other D genes
640  }
641  }
642 
643  // J deletions
644  j_start = clonotype.getJoiGeneStart(j_index);
645  j_end = clonotype.getJoiGeneEnd(j_index);
646  shift = clonotype.getJoiSeqStart(j_index) - seq_global_start_pos;
647 
648  for (seq_len_t i = 0; i < clonotype.getJoiLen(j_index) + 1; ++i) {
649  probs(J_index_dels, j_index, i + shift, 0) = _param_vec->event_prob(J_DEL, j_gene - 1, j_start + i - 1); // probability of deletions
650  }
651 
652  if (metadata_mode) {
653  if (clonotype.recombination() == VDJ_RECOMB) {
654  for (seg_index_t d_index = 0; d_index < clonotype.nDiv(); ++d_index) {
655  events(J_index_genes, 0, j_index, d_index)
656  = _param_vec->event_index(VDJ_JOI_DIV_GEN, 0, j_gene - 1, clonotype.getDiv(d_index) - 1); // probability of choosing this J gene segment with other D genes
657  }
658  }
659 
660  for (seq_len_t i = 0; i < clonotype.getJoiLen(j_index) + 1; ++i) {
661  events(J_index_dels, j_index, i + shift, 0) = _param_vec->event_index(J_DEL, j_gene - 1, j_start + i - 1);
662  }
663  }
664 
665  if (error_mode) {
666  errors(errors.chainSize() - 1, j_index, len, 0) = 0;
667  for (seq_len_t i = 1; i <= clonotype.getJoiLen(j_index); ++i) {
668  errors(errors.chainSize() - 1, j_index, len - i, 0)
669  = errors(errors.chainSize() - 1, j_index, len + 1 - i, 0)
670  + clonotype.isJoiMismatch(j_index, clonotype.getJoiLen(j_index) + 1 - i);
671  if (errors(errors.chainSize() - 1, j_index, len - i, 0)) {
672  probs(J_index_dels, j_index, len - i, 0) *= errors(errors.chainSize() - 1, j_index, len - i, 0) * _param_vec->error_prob();
673  }
674  }
675  }
676  }
677 
678  for (seq_len_t i = clonotype.sequence().size() - len + 1; i <= clonotype.sequence().size() + 1; ++i) {
679  seq_poses.push_back(i);
680  }
681  }
682 
683 
693  void buildDiversity(const Clonotype &clonotype,
694  ProbMMC &probs,
695  EventIndMMC &events,
696  ErrMMC &errors,
697  vector<seq_len_t> &seq_poses,
698  MetadataMode metadata_mode,
699  ErrorMode error_mode) const
700  {
701  seq_len_t min_D_len;
702 
703  // vector seq_start -> 0 means no such index in the matrix, 1 otherwise.
704  seq_len_t seq_arr_size = clonotype.sequence().size() + 1;
705  unique_ptr<seq_len_t[]> seq_row(new seq_len_t[seq_arr_size]);
706  std::fill(seq_row.get(), seq_row.get() + seq_arr_size, 0);
707 
708  // vector seq_end -> 0 means no such index in the matrix, 1 otherwise.
709  unique_ptr<seq_len_t[]> seq_col(new seq_len_t[seq_arr_size]);
710  std::fill(seq_col.get(), seq_col.get() + seq_arr_size, 0);
711 
712  for (seg_index_t d_index = 0; d_index < clonotype.nDiv(); ++d_index) {
713  min_D_len = _param_vec->D_min_len(clonotype.getDiv(d_index));
714 
715  for (seg_index_t j = 0; j < clonotype.numDivAlignments(d_index); ++j) {
716  seq_len_t d_seq_start = clonotype.getDivSeqStart(d_index, j),
717  d_seq_end = clonotype.getDivSeqEnd(d_index, j);
718 
719  // yes-yes, I know that it could be done more efficiently. But I don't want to.
720  for (seq_len_t i = d_seq_start; i <= d_seq_end - min_D_len + 1; ++i) {
721  seq_row[i] = 1;
722  }
723 
724  for (seq_len_t i = d_seq_start + min_D_len - (seq_len_t) 1; i <= d_seq_end; ++i) {
725  seq_col[i] = 1;
726  }
727  }
728  }
729 
730  // make new vector seq_start -> 1based index in rows of Ddel matrices
731  seq_len_t seq_ind = 1;
732  for (seq_len_t i = 0; i < seq_arr_size; ++i) {
733  if (seq_row[i]) {
734  seq_row[i] = seq_ind;
735  ++seq_ind;
736  }
737  }
738 
739  // make new vector seq_end -> 1based index in columns of Ddel matrices
740  seq_ind = 1;
741  for (seq_len_t i = 0; i < seq_arr_size; ++i) {
742  if (seq_col[i]) {
743  seq_col[i] = seq_ind;
744  ++seq_ind;
745  }
746  }
747 
748  // find indices of D alignments and use only them to reduce memory usage.
749  seq_len_t last_max_seq_start = 0, last_max_seq_end = 0, seq_row_ind = 0, seq_col_ind = 0;
750 
751  seq_len_t seq_row_nonzeros = 0, seq_col_nonzeros = 0;
752  for (seq_len_t i = 0; i < seq_arr_size; ++i) {
753  seq_row_nonzeros += seq_row[i] != 0;
754  seq_col_nonzeros += seq_col[i] != 0;
755  }
756 
757  probs.initNode(DIVERSITY_GENES_MATRIX_INDEX, clonotype.nDiv(), seq_row_nonzeros, seq_col_nonzeros);
758  if (metadata_mode) {
759  events.initNode(DIVERSITY_GENES_MATRIX_INDEX, clonotype.nDiv(), seq_row_nonzeros, seq_col_nonzeros);
760  }
761  if (error_mode) {
762  errors.initNode(1, clonotype.nDiv(), seq_row_nonzeros, seq_col_nonzeros);
763  }
764 
765 
766  seg_index_t d_index, d_gene;
767  seq_len_t d_len;
768  seq_len_t d_seq_start, d_seq_end, d_gene_start, d_gene_end;
769 
770  for (seg_index_t d_index = 0; d_index < clonotype.nDiv(); ++d_index) {
771  d_gene = clonotype.getDiv(d_index);
772  d_len = _genes->D()[d_gene].sequence.size();
773  min_D_len = _param_vec->D_min_len(d_gene);
774 
775  // for each aligned Div segment get all possible smaller alignments and add them to the matrix.
776  for (seg_index_t j = 0; j < clonotype.numDivAlignments(d_index); ++j) {
777  d_seq_start = clonotype.getDivSeqStart(d_index, j);
778  d_seq_end = clonotype.getDivSeqEnd(d_index, j);
779  d_gene_start = clonotype.getDivGeneStart(d_index, j);
780  d_gene_end = clonotype.getDivGeneEnd(d_index, j);
781 
782  for (seq_len_t left_pos = d_seq_start; left_pos <= d_seq_end - min_D_len + 1; ++left_pos) {
783  for (seq_len_t right_pos = left_pos + min_D_len - 1; right_pos <= d_seq_end; ++right_pos) {
784  probs(DIVERSITY_GENES_MATRIX_INDEX, d_index, seq_row[left_pos] - 1, seq_col[right_pos] - 1)
785  = _param_vec->event_prob(VDJ_DIV_DEL,
786  d_gene - 1,
787  d_gene_start + left_pos - d_seq_start,
788  d_len - (d_gene_end - (d_seq_end - right_pos)));
789  if (metadata_mode) {
790  events(DIVERSITY_GENES_MATRIX_INDEX, d_index, seq_row[left_pos] - 1, seq_col[right_pos] - 1)
791  = _param_vec->event_index(VDJ_DIV_DEL,
792  d_gene - 1,
793  d_gene_start + left_pos - d_seq_start,
794  d_len - (d_gene_end - (d_seq_end - right_pos)));
795  }
796 
797  if (error_mode) {
798  errors(1, d_index, seq_row[left_pos] - 1, seq_col[right_pos] - 1) =
799  clonotype.numDivMismatches(d_index, j,
800  d_gene_start + left_pos - d_seq_start,
801  d_gene_end - (d_seq_end - right_pos));
802 
803  if (errors(1, d_index, seq_row[left_pos] - 1, seq_col[right_pos] - 1)) {
804  probs(DIVERSITY_GENES_MATRIX_INDEX, d_index, seq_row[left_pos] - 1, seq_col[right_pos] - 1)
805  *= _param_vec->error_prob() * errors(1, d_index, seq_row[left_pos] - 1, seq_col[right_pos] - 1);
806  }
807  }
808  }
809  }
810  }
811  }
812 
813  // insert D3 and D5 positions
814  vector<seq_len_t> D35_poses;
815  D35_poses.reserve(seq_row_nonzeros + seq_col_nonzeros + 2);
816  for (seq_len_t i = 1; i < seq_arr_size; ++i) { if (seq_row[i]) { D35_poses.push_back(i); } }
817  for (seq_len_t i = 1; i < seq_arr_size; ++i) { if (seq_col[i]) { D35_poses.push_back(i); } }
818 
819  // Note! insert diversity gene seq poses BEFORE joining gene seq poses
820  seq_poses.reserve(seq_poses.size() + D35_poses.size() + 2); // +2 -> just in case (:
821  seq_poses.insert(seq_poses.begin() + probs.nodeColumns(VARIABLE_DELETIONS_MATRIX_INDEX), D35_poses.begin(), D35_poses.end());
822  }
823 
824 
834  void buildVJinsertions(const Clonotype &clonotype,
835  ProbMMC &probs,
836  EventIndMMC &events,
837  const vector<seq_len_t> &seq_poses,
838  MetadataMode metadata_mode,
839  ErrorMode error_mode) const
840  {
841  MonoNucInsertionModel mc(_param_vec->get_iterator(_param_vec->event_index(VJ_VAR_JOI_INS_NUC, 0, 0)), error_mode ? _param_vec->error_prob() : 0);
842 
843  seq_len_t v_vertices = probs.nodeColumns(VARIABLE_DELETIONS_MATRIX_INDEX),
844  j_vertices = probs.nodeRows(JOINING_DELETIONS_VJ_MATRIX_INDEX);
845 
846  probs.initNode(VarJoi_INSERTIONS_MATRIX_INDEX, 1, v_vertices, j_vertices);
847 
848  if (metadata_mode) {
849  events.initNode(VarJoi_INSERTIONS_MATRIX_INDEX, 1, v_vertices, j_vertices);
850  }
851 
852  this->buildInsertions(clonotype.sequence(),
853  probs,
854  events,
855  seq_poses,
856  VarJoi_INSERTIONS_MATRIX_INDEX,
857  _param_vec->event_index(VJ_VAR_JOI_INS_LEN, 0, 0),
858  _param_vec->max_VJ_ins_len(),
859  metadata_mode,
860  error_mode,
861  0,
862  v_vertices - 1,
863  v_vertices,
864  v_vertices + j_vertices - 1,
865  mc,
866  false);
867  }
868 
869 
879  void buildVDinsertions(const Clonotype &clonotype,
880  ProbMMC &probs,
881  EventIndMMC &events,
882  const vector<seq_len_t> &seq_poses,
883  MetadataMode metadata_mode,
884  ErrorMode error_mode) const
885  {
886  DiNucInsertionModel mc(_param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_NUC, 0, 0)), error_mode ? _param_vec->error_prob() : 0);
887 
888  seq_len_t v_vertices = probs.nodeColumns(VARIABLE_DELETIONS_MATRIX_INDEX),
889  d3_vertices = probs.nodeRows(DIVERSITY_GENES_MATRIX_INDEX);
890 
891  probs.initNode(VarDiv_INSERTIONS_MATRIX_INDEX, 1, v_vertices, d3_vertices);
892 
893  if (metadata_mode) {
894  events.initNode(VarDiv_INSERTIONS_MATRIX_INDEX, 1, v_vertices, d3_vertices);
895  }
896 
897  this->buildInsertions(clonotype.sequence(),
898  probs,
899  events,
900  seq_poses,
901  VarDiv_INSERTIONS_MATRIX_INDEX,
902  _param_vec->event_index(VDJ_VAR_DIV_INS_LEN, 0, 0),
903  _param_vec->max_VD_ins_len(),
904  metadata_mode,
905  error_mode,
906  0,
907  v_vertices - 1,
908  v_vertices,
909  v_vertices + d3_vertices - 1,
910  mc,
911  false);
912  }
913 
914 
924  void buildDJinsertions(const Clonotype &clonotype,
925  ProbMMC &probs,
926  EventIndMMC &events,
927  const vector<seq_len_t> &seq_poses,
928  MetadataMode metadata_mode,
929  ErrorMode error_mode) const
930  {
931  DiNucInsertionModel mc(_param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_NUC, 0, 0)), error_mode ? _param_vec->error_prob() : 0);
932 
933  seq_len_t v_vertices = probs.nodeColumns(VARIABLE_DELETIONS_MATRIX_INDEX),
934  d3_vertices = probs.nodeRows(DIVERSITY_GENES_MATRIX_INDEX),
935  d5_vertices = probs.nodeColumns(DIVERSITY_GENES_MATRIX_INDEX),
936  j_vertices = probs.nodeRows(JOINING_DELETIONS_VDJ_MATRIX_INDEX);
937 
938  probs.initNode(DivJoi_INSERTIONS_MATRIX_INDEX, 1, d5_vertices, j_vertices);
939 
940  if (metadata_mode) {
941  events.initNode(DivJoi_INSERTIONS_MATRIX_INDEX, 1, d5_vertices, j_vertices);
942  }
943 
944  this->buildInsertions(clonotype.sequence(),
945  probs,
946  events,
947  seq_poses,
948  DivJoi_INSERTIONS_MATRIX_INDEX,
949  _param_vec->event_index(VDJ_DIV_JOI_INS_LEN, 0, 0),
950  _param_vec->max_DJ_ins_len(),
951  metadata_mode,
952  error_mode,
953  v_vertices + d3_vertices,
954  v_vertices + d3_vertices + d5_vertices - 1,
955  v_vertices + d3_vertices + d5_vertices,
956  v_vertices + d3_vertices + d5_vertices + j_vertices - 1,
957  mc,
958  true);
959  }
960 
961 
979  void buildInsertions(const string &sequence,
980  ProbMMC &probs,
981  EventIndMMC &events,
982  const vector<seq_len_t> &seq_poses,
983  ProbMMC::node_ind_t ins_node_index,
984  event_ind_t null_insertion,
985  seq_len_t max_size,
986  MetadataMode metadata_mode,
987  bool error_mode,
988  seq_len_t left_vertices_start,
989  seq_len_t left_vertices_end,
990  seq_len_t right_vertices_start,
991  seq_len_t right_vertices_end,
992  const AbstractInsertionModel& mc,
993  bool reversed = false) const
994  {
995  int insertion_len;
996  bool good_insertion;
997  char last_char = NULL_CHAR;
998 
999  for (size_t left_vertex_i = left_vertices_start; left_vertex_i <= left_vertices_end; ++left_vertex_i) {
1000  for (size_t right_vertex_i = right_vertices_start; right_vertex_i <= right_vertices_end; ++right_vertex_i) {
1001  insertion_len = seq_poses[right_vertex_i] - seq_poses[left_vertex_i] - 1;
1002  good_insertion = (insertion_len >= 0) && (insertion_len <= max_size);
1003  if (good_insertion) {
1004  if (!reversed) {
1005  if (seq_poses[left_vertex_i] == 0) {
1006  last_char = NULL_CHAR;
1007  } else {
1008  last_char = sequence[seq_poses[left_vertex_i] - 1];
1009  }
1010 
1011  probs(ins_node_index, 0, left_vertex_i - left_vertices_start, right_vertex_i - right_vertices_start)
1012  = mc.nucProbability(sequence.cbegin() + seq_poses[left_vertex_i],
1013  insertion_len,
1014  last_char,
1015  error_mode)
1016  * (*_param_vec)[null_insertion + insertion_len];
1017  } else {
1018  if (seq_poses[right_vertex_i] == sequence.size() + 1) {
1019  last_char = NULL_CHAR;
1020  } else {
1021  last_char = sequence[seq_poses[right_vertex_i] - 1];
1022  }
1023 
1024  probs(ins_node_index, 0, left_vertex_i - left_vertices_start, right_vertex_i - right_vertices_start)
1025  = mc.nucProbability(sequence.crbegin() + (sequence.size() - seq_poses[right_vertex_i] + 1),
1026  insertion_len,
1027  last_char,
1028  error_mode)
1029  * (*_param_vec)[null_insertion + insertion_len];
1030  }
1031 
1032  if (metadata_mode) {
1033  events(ins_node_index, 0, left_vertex_i - left_vertices_start, right_vertex_i - right_vertices_start)
1034  = null_insertion + insertion_len;
1035  }
1036  }
1037  }
1038  }
1039  }
1040 
1041 
1042  // build<VJ_RECOMB, SAVE_METADATA, NO_ERR>
1043  // put specific values (lambda) to a specific MMC
1044  // function for finding max V alignment
1045  // function for finding max J alignment
1046  // function for shrinking D alignment matrices, find non-zero positions, etc.
1047  // make seq_row, seq_col, seq_start, seq_ind vector (one function?)
1048  // build[Mono|Di]NucInsertions <InsertionModel, SequenceType, MetadataMode>
1049  // general functions for assigning values (event probs / event inds) to MMC of some type.
1050 
1051  /*
1052  buildVarGenesAndDels
1053  buildDivDels
1054  buildJoiDels
1055  buildJoiDivGenes
1056  */
1057 
1058  };
1059 }
1060 
1061 #endif //_MAAGBUILDER_H_
Definition: maagbuilder.h:56
pEventIndMMC _events
Definition: maag.h:455
unique_ptr< sequence_t > _sequence
Definition: maag.h:458
Definition: aligner.h:37
prob_t fullProbability(event_ind_t v_index, event_ind_t j_index) const
Compute and return the full assembling probability of this sequence (i.e., with all gene segments ali...
Definition: maag.h:249
prob_t buildAndCompute(const Clonotype &clonotype, ErrorMode error_mode, SequenceType seq_type=NUCLEOTIDE, MAAGComputeProbAction action=SUM_PROBABILITY) const
Compute generation probabilities without building the full information about MAAGs.
Definition: maagbuilder.h:250
Class for representing VJ / VD / DJ insertions models - either a mono-nucleotide or a di-nucleotide m...
Definition: insertionmodel.h:27
Definition: clonotype.h:46
SequenceType _seq_type
Definition: maag.h:460
MAAGBuilder()
Private default constructor.
Definition: maagbuilder.h:474
Definition: genesegment.h:265
void buildDJinsertions(const Clonotype &clonotype, ProbMMC &probs, EventIndMMC &events, const vector< seq_len_t > &seq_poses, MetadataMode metadata_mode, ErrorMode error_mode) const
Build probability and events matrices for Diversity-Joining gene segments insertions.
Definition: maagbuilder.h:924
Multi-Alignment Assembly Graph - basic class for representing all possible generation scenarios of a ...
Definition: maag.h:46
seq_len_t numDivAlignments(seg_index_t index) const
Get the number of D gene alignments for the given D gene.
Definition: vdj_alignment.h:209
MAAG()
Default constructor.
Definition: maag.h:56
void updateEventProbabilities(MAAG *maag) const
Replace event probabilities in the given MAAGs if they have stored event indices. ...
Definition: maagbuilder.h:305
uint8_t node_ind_t
Node index type.
Definition: multimatrixchain.h:91
Class for storing parameters of assembling statistical model. Note: event with index 0 (zero) is "nul...
Definition: modelparametervector.h:68
void buildInsertions(const string &sequence, ProbMMC &probs, EventIndMMC &events, const vector< seq_len_t > &seq_poses, ProbMMC::node_ind_t ins_node_index, event_ind_t null_insertion, seq_len_t max_size, MetadataMode metadata_mode, bool error_mode, seq_len_t left_vertices_start, seq_len_t left_vertices_end, seq_len_t right_vertices_start, seq_len_t right_vertices_end, const AbstractInsertionModel &mc, bool reversed=false) const
General function for building insertions.
Definition: maagbuilder.h:979
virtual prob_t nucProbability(const std::string &sequence, char first_char=NULL_CHAR, bool with_errors=false) const =0
Probability of the given nucleotide sequence.
seg_index_t nVar() const
Get the number of alignments for the specific gene.
Definition: vdj_alignment.h:100
void buildDiversity(const Clonotype &clonotype, ProbMMC &probs, EventIndMMC &events, ErrMMC &errors, vector< seq_len_t > &seq_poses, MetadataMode metadata_mode, ErrorMode error_mode) const
Build probability and events matrices for Diversity gene segments.
Definition: maagbuilder.h:693
seq_len_t getVarGeneStart(seg_index_t vgene) const
Get alignments for the specific gene.
Definition: vdj_alignment.h:124
void buildVJinsertions(const Clonotype &clonotype, ProbMMC &probs, EventIndMMC &events, const vector< seq_len_t > &seq_poses, MetadataMode metadata_mode, ErrorMode error_mode) const
Build probability and events matrices for Variable-Joining gene segments insertions.
Definition: maagbuilder.h:834
seg_index_t getVar(size_t index) const
Get the index of the aligned gene segment for the specific gene.
Definition: vdj_alignment.h:112
Class for storing lists of matrices, where one node in the list (called "chain") could contain more t...
Definition: multimatrixchain.h:39
MAAGBuilder(const ModelParameterVector &param_vec, const VDJRecombinationGenes &genes)
Constructor for the builder from given vector with event probabilities and gene segments.
Definition: maagbuilder.h:66
MAAG build(const Clonotype &clonotype, MetadataMode metadata_mode, ErrorMode error_mode, SequenceType seq_type=NUCLEOTIDE) const
Build MAAGs from the given clonotypes.
Definition: maagbuilder.h:95
void buildVariable(const Clonotype &clonotype, ProbMMC &probs, EventIndMMC &events, ErrMMC &errors, vector< seq_len_t > &seq_poses, MetadataMode metadata_mode, ErrorMode error_mode) const
Build probability and events matrices for Variable gene segments.
Definition: maagbuilder.h:486
Definition: repertoire.h:51
void buildJoining(const Clonotype &clonotype, ProbMMC &probs, EventIndMMC &events, ErrMMC &errors, vector< seq_len_t > &seq_poses, MetadataMode metadata_mode, ErrorMode error_mode) const
Build probability and events matrices for Joining gene segments.
Definition: maagbuilder.h:588
pErrMMC _errors
Definition: maag.h:456
Definition: insertionmodel.h:321
void buildVDinsertions(const Clonotype &clonotype, ProbMMC &probs, EventIndMMC &events, const vector< seq_len_t > &seq_poses, MetadataMode metadata_mode, ErrorMode error_mode) const
Build probability and events matrices for Variable-Diversity gene segments insertions.
Definition: maagbuilder.h:879
unique_ptr< seq_len_t[]> _seq_poses
Definition: maag.h:457
Definition: insertionmodel.h:183
const sequence_t & sequence() const
Get the sequence of this Clonotype.
Definition: clonotype.h:90