Ymir  .9
Fast\C++toolforcomputationofassemblingprobabilities,statisticalinferenceofassemblingstatisticalmodelandgenerationofartificialsequencesofT-cellreceptorsdata.
clonotypeassembler.h
1 //
2 // Created by Vadim N. on 24/03/2015.
3 //
4 
5 #ifndef _YMIR_CLONOTYPEASSEMBLER_H_
6 #define _YMIR_CLONOTYPEASSEMBLER_H_
7 
8 
9 #include <chrono>
10 
11 #include "insertionmodel.h"
12 #include "clonotype_builder.h"
13 
14 
15 namespace ymir {
16 
18 
19  public:
20 
24  ClonotypeAssembler(const ModelParameterVector &param_vec, const VDJRecombinationGenes &genes)
25  : _param_vec(param_vec), _genes(genes)
26  { }
27 
28 
32  virtual ~ClonotypeAssembler() { }
33 
34 
35  void updateModelParameterVector(const ModelParameterVector &param_vec) {
36  _param_vec = param_vec;
37  }
38 
39 
43  Cloneset generate(size_t count = 1, bool verbose = true) const {
44  std::vector<Clonotype> vec;
45  vec.reserve(count);
46 
47  unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
48  std::default_random_engine rg(seed);
49 
50  if (_param_vec.recombination() == VJ_RECOMB) {
51  if (verbose) {
52  std::cout << "Generating " << (size_t) count << " sequences." << std::endl;
53  }
54 
55  for (size_t clonotype_i = 0; clonotype_i < count; ++clonotype_i) {
56  vec.push_back(this->generate_vj(rg));
57 
58  if (verbose && (clonotype_i + 1) % 50000 == 0) {
59  cout << "Generated " << (size_t) (clonotype_i + 1) << "/" << (size_t) count << " sequences." << endl;
60  }
61  }
62 
63  if (verbose) {
64  cout << "Generated " << (size_t) count << "/" << (size_t) count << " sequences." << endl;
65  }
66  }
67  else if (_param_vec.recombination() == VDJ_RECOMB) {
68  if (verbose) {
69  std::cout << "Generating " << (size_t) count << " sequences." << std::endl;
70  }
71 
72  for (size_t clonotype_i = 0; clonotype_i < count; ++clonotype_i) {
73  vec.push_back(this->generate_vdj(rg));
74 
75  if (verbose && (clonotype_i + 1) % 50000 == 0) {
76  cout << "Generated " << (size_t) (clonotype_i + 1) << "/" << (size_t) count << " sequences." << endl;
77  }
78  }
79 
80  if (verbose) {
81  cout << "Generated " << (size_t) count << "/" << (size_t) count << " sequences." << endl;
82  }
83  } else {
84  std::cout << "Unrecognised recombination type of the input model." << std::endl;
85  }
86 
87  return Cloneset(vec);
88  }
89 
90  protected:
91 
92  ModelParameterVector _param_vec;
93  VDJRecombinationGenes _genes;
94 
95 
96  Clonotype generate_vj(std::default_random_engine &rg) const {
97  ClonotypeBuilder builder;
98  MonoNucInsertionModel mc_vj(_param_vec.get_iterator(_param_vec.event_index(VJ_VAR_JOI_INS_NUC, 0, 0)));
99 
100  std::discrete_distribution<event_ind_t> vj_genes(_param_vec.get_iterator(_param_vec.event_index(VJ_VAR_JOI_GEN, 0, 0)),
101  _param_vec.get_iterator(_param_vec.event_index(VJ_VAR_JOI_GEN, 0, 0)
102  + _param_vec.eventClassSize(VJ_VAR_JOI_GEN)));
103  event_ind_t v_j = vj_genes(rg);
104 
105  seg_index_t vgene = v_j / _param_vec.n_columns(VJ_VAR_JOI_GEN) + 1;
106 
107  seg_index_t jgene = v_j - (v_j / _param_vec.n_columns(VJ_VAR_JOI_GEN)) * (_param_vec.n_columns(VJ_VAR_JOI_GEN)) + 1;
108 
109  seq_len_t v_del_num = std::discrete_distribution<event_ind_t>(_param_vec.get_iterator(_param_vec.event_index(VJ_VAR_DEL, vgene - 1, 0)),
110  _param_vec.get_iterator(_param_vec.event_index(VJ_VAR_DEL, vgene - 1, 0)
111  + _param_vec.eventFamilySize(VJ_VAR_DEL, vgene - 1)))(rg);
112  // TODO: test it
113  builder.addVarAlignment(vgene, 1, 1, _genes.V()[vgene].sequence.size() - v_del_num);
114 
115  seq_len_t ins_len = std::discrete_distribution<seq_len_t>(_param_vec.get_iterator(_param_vec.event_index(VJ_VAR_JOI_INS_LEN, 0, 0)),
116  _param_vec.get_iterator(_param_vec.event_index(VJ_VAR_JOI_INS_LEN, 0, 0)
117  + _param_vec.eventClassSize(VJ_VAR_JOI_INS_LEN)))(rg);
118 
119  seq_len_t j_del_num = std::discrete_distribution<event_ind_t>(_param_vec.get_iterator(_param_vec.event_index(VJ_JOI_DEL, jgene - 1, 0)),
120  _param_vec.get_iterator(_param_vec.event_index(VJ_JOI_DEL, jgene - 1, 0)
121  + _param_vec.eventFamilySize(VJ_JOI_DEL, jgene - 1)))(rg);
122  // TODO: test it
123  builder.addJoiAlignment(jgene, 1, _genes.V()[vgene].sequence.size() - v_del_num + ins_len + 1, _genes.J()[jgene].sequence.size() - j_del_num);
124 
125  builder.setSequence(_genes.V()[vgene].sequence.substr(0, _genes.V()[vgene].sequence.size() - v_del_num)
126  + mc_vj.generate(ins_len, rg)
127  + _genes.J()[jgene].sequence.substr(j_del_num));
128  builder.setNucleotideSeq();
129  return builder.buildClonotype();
130  }
131 
132 
133  Clonotype generate_vdj(std::default_random_engine &rg) const {
134  ClonotypeBuilder builder;
135  const DiNucInsertionModel mc_vd(_param_vec.get_iterator(_param_vec.event_index(VDJ_VAR_DIV_INS_NUC, 0, 0)));
136  const DiNucInsertionModel mc_dj(_param_vec.get_iterator(_param_vec.event_index(VDJ_DIV_JOI_INS_NUC, 0, 0)));
137 
138  seq_len_t ins_len_vd = std::discrete_distribution<seq_len_t>(_param_vec.get_iterator(_param_vec.event_index(VDJ_VAR_DIV_INS_LEN, 0, 0)),
139  _param_vec.get_iterator(_param_vec.event_index(VDJ_VAR_DIV_INS_LEN, 0, 0)
140  + _param_vec.eventClassSize(VDJ_VAR_DIV_INS_LEN)))(rg);
141  seq_len_t ins_len_dj = std::discrete_distribution<seq_len_t>(_param_vec.get_iterator(_param_vec.event_index(VDJ_DIV_JOI_INS_LEN, 0, 0)),
142  _param_vec.get_iterator(_param_vec.event_index(VDJ_DIV_JOI_INS_LEN, 0, 0)
143  + _param_vec.eventClassSize(VDJ_DIV_JOI_INS_LEN)))(rg);
144 
145  seg_index_t vgene = std::discrete_distribution<event_ind_t>(_param_vec.get_iterator(_param_vec.event_index(VDJ_VAR_GEN, 0, 0)),
146  _param_vec.get_iterator(_param_vec.event_index(VDJ_VAR_GEN, 0, 0)
147  + _param_vec.eventClassSize(VDJ_VAR_GEN)))(rg) + 1;
148 
149  std::discrete_distribution<event_ind_t> jd_genes(_param_vec.get_iterator(_param_vec.event_index(VDJ_JOI_DIV_GEN, 0, 0)),
150  _param_vec.get_iterator(_param_vec.event_index(VDJ_JOI_DIV_GEN, 0, 0)
151  + _param_vec.eventClassSize(VDJ_JOI_DIV_GEN)));
152  event_ind_t j_d = jd_genes(rg);
153 
154  seg_index_t jgene = j_d / _param_vec.n_columns(VDJ_JOI_DIV_GEN) + 1;
155 
156  seg_index_t dgene = j_d - (j_d / _param_vec.n_columns(VDJ_JOI_DIV_GEN)) * _param_vec.n_columns(VDJ_JOI_DIV_GEN) + 1;
157 
158  seq_len_t v_del_num = std::discrete_distribution<event_ind_t>(_param_vec.get_iterator(_param_vec.event_index(VDJ_VAR_DEL, vgene - 1, 0)),
159  _param_vec.get_iterator(_param_vec.event_index(VDJ_VAR_DEL, vgene - 1, 0)
160  + _param_vec.eventFamilySize(VDJ_VAR_DEL, vgene - 1)))(rg);
161  std::string vgene_del = _genes.V()[vgene].sequence.substr(0, _genes.V()[vgene].sequence.size() - v_del_num);
162  // TODO: test it
163  builder.addVarAlignment(vgene, 1, 1, vgene_del.size());
164 
165  event_ind_t d_del_index = std::discrete_distribution<event_ind_t>(_param_vec.get_iterator(_param_vec.event_index(VDJ_DIV_DEL, dgene - 1, 0)),
166  _param_vec.get_iterator(_param_vec.event_index(VDJ_DIV_DEL, dgene - 1, 0)
167  + _param_vec.eventFamilySize(VDJ_DIV_DEL, dgene - 1)))(rg);
168  seq_len_t d5_del_num = d_del_index / _param_vec.n_columns(VDJ_DIV_DEL, 0);
169  seq_len_t d3_del_num = d_del_index - (d_del_index / _param_vec.n_columns(VDJ_DIV_DEL, 0)) * _param_vec.n_columns(VDJ_DIV_DEL, 0);
170  std::string dgene_del = _genes.D()[dgene].sequence.substr(d5_del_num, _genes.D()[dgene].sequence.size() - d3_del_num - d5_del_num);
171  // TODO: test it
172  builder.addDivAlignment(dgene,
173  d3_del_num + 1,
174  vgene_del.size() + ins_len_vd + 1,
175  dgene_del.size());
176 
177  seq_len_t j_del_num = std::discrete_distribution<event_ind_t>(_param_vec.get_iterator(_param_vec.event_index(VDJ_JOI_DEL, jgene - 1, 0)),
178  _param_vec.get_iterator(_param_vec.event_index(VDJ_JOI_DEL, jgene - 1, 0)
179  + _param_vec.eventFamilySize(VDJ_JOI_DEL, jgene - 1)))(rg);
180  std::string jgene_del = _genes.J()[jgene].sequence.substr(j_del_num);
181  // TODO: test it
182  builder.addJoiAlignment(jgene, 1, vgene_del.size() + ins_len_vd + ins_len_dj + dgene_del.size() + 1, jgene_del.size());
183 
184  char last_v_char = vgene_del.size() ? vgene_del[vgene_del.size() - 1] : NULL_CHAR;
185  char first_j_char = jgene_del.size() ? jgene_del[0]: NULL_CHAR;
186 
187  builder.setSequence(vgene_del
188  + mc_vd.generate(ins_len_vd, rg, last_v_char)
189  + dgene_del
190  + mc_dj.generate(ins_len_dj, rg, first_j_char, true)
191  + jgene_del);
192 
193  builder.setNucleotideSeq();
194  return builder.buildClonotype();
195  }
196 
197  };
198 
199 }
200 
201 #endif //_YMIR_CLONOTYPEASSEMBLER_H_
VDJAlignmentBuilder & addVarAlignment(seg_index_t vseg, seq_len_t vstart, seq_len_t seqstart, seq_len_t alignment_len)
Add singular alignments to the builder.
Definition: vdj_alignment_builder.h:86
Definition: aligner.h:37
event_ind_t event_index(EventClass event_class, event_ind_t event_family, event_ind_t event_index) const
Definition: modelparametervector.h:261
Definition: clonotype.h:46
Definition: genesegment.h:265
Class for storing parameters of assembling statistical model. Note: event with index 0 (zero) is "nul...
Definition: modelparametervector.h:68
Definition: clonotype_builder.h:37
Definition: repertoire.h:171
Clonotype buildClonotype()
Build clone alignment structure with stored information.
Definition: clonotype_builder.h:55
Definition: insertionmodel.h:321
Definition: insertionmodel.h:183
Definition: clonotypeassembler.h:17