Ymir  .9
Fast\C++toolforcomputationofassemblingprobabilities,statisticalinferenceofassemblingstatisticalmodelandgenerationofartificialsequencesofT-cellreceptorsdata.
model_parser.h
1 //
2 // Created by Vadim N. on 01/03/2016.
3 //
4 
5 #ifndef YMIR_MODEL_PARSER_H
6 #define YMIR_MODEL_PARSER_H
7 
8 
9 #include "textdata.h"
10 #include "genesegment.h"
11 
12 
13 namespace ymir {
14 
15  class ModelParser {
16 
17  public:
18 
19  ModelParser(const std::string &model_path, Json::Value config, ModelBehaviour behav)
20  : _model_path(model_path),
21  _config(config),
22  _behaviour(behav)
23  {
24  }
25 
26 
27  virtual ~ModelParser()
28  {
29  }
30 
31 
32  bool parse() {
33  return this->parseGeneSegments()
34  && this->makeEventProbabilitiesVector();
35  }
36 
37 
38  virtual bool parseGeneSegments() = 0;
39 
40 
41  bool makeEventProbabilitiesVector() {
42  if (_behaviour == EMPTY) {
43  return this->createEventProbabilitiesFromScratch();
44  } else {
45  return this->parseEventProbabilitiesFromFiles();
46  }
47  }
48 
49 
50  void swap_genes(unique_ptr<VDJRecombinationGenes> &ptr) {
51  _genes.swap(ptr);
52  }
53 
54  void swap_parameters(unique_ptr<ModelParameterVector> &ptr) {
55  _param_vec.swap(ptr);
56  }
57 
58  protected:
59 
60  Json::Value _config;
61  std::string _model_path;
62  ModelBehaviour _behaviour;
63  unique_ptr<VDJRecombinationGenes> _genes;
64  unique_ptr<ModelParameterVector> _param_vec;
65  seq_len_t _min_D_len;
66 
67 
68  bool createEventProbabilitiesFromScratch() {
69  vector<AbstractTDContainer*> containers;
70  containers.resize(10, nullptr);
71 
72  this->createContainers(containers);
73 
74  bool is_ok = this->makeModelParameterVector(containers);
75 
76  _param_vec->fill(1);
77  _param_vec->normaliseEventFamilies();
78 
79  if (!is_ok) { throw(runtime_error("WRONG EMPTY VECTOR CREATING SUBROUTINE!!!")); }
80 
81  return is_ok;
82  }
83 
84 
85  virtual void createContainers(vector<AbstractTDContainer*> &containers) = 0;
86 
87 
88  bool parseEventProbabilitiesFromFiles() {
89  Json::Value pt = _config.get("probtables", "no-prob");
90  if (pt.size()) {
91  AbstractTDContainer *container = nullptr;
92  string element = "", err_message = "";
93  vector<AbstractTDContainer*> containers;
94  containers.resize(10, nullptr);
95 
96  // Parse tables with event probabilities.
97  for (Json::ArrayIndex i = 0; i < pt.size(); ++i) {
98  element = pt.getMemberNames()[i];
99  container = read_textdata(_model_path + pt[element]["file"].asString(),
100  pt[element]["type"].asString(),
101  pt[element].get("skip.first.column", true).asBool(),
102  pt[element].get("laplace", .0).asDouble(),
103  err_message);
104 
105  this->parseDataContainer(element, container, containers);
106  }
107 
108  return this->makeModelParameterVector(containers);
109  }
110 
111  std::cout << "[ERROR] No information about probability events in the model .json file found." << std::endl;
112  return false;
113  }
114 
115 
116  virtual void parseDataContainer(const string &element, AbstractTDContainer *container, vector<AbstractTDContainer*> &containers) = 0;
117 
118 
119  bool makeModelParameterVector(vector<AbstractTDContainer*> &containers) {
120  // Made ModelParameterVector from input tables if all is ok.
121  vector<prob_t> event_probs; // param vec
122  vector<event_ind_t> event_lengths; // lens vec
123  vector<event_ind_t> event_classes; // event classes
124  vector<seq_len_t> event_col_num; // event family col numbers
125  vector<prob_t> laplace;
126  vector<seq_len_t> min_D_len_vec;
127 
128  bool is_ok = this->makeModelParameterVector(containers, event_probs, event_lengths, event_classes, event_col_num, laplace, min_D_len_vec);
129 
130  // Free all memory for containers.
131  for (size_t i = 0; i < containers.size(); ++i) {
132  if (containers[i]) { delete containers[i]; }
133  }
134 
135  return is_ok;
136  }
137 
138 
139  virtual bool makeModelParameterVector(vector<AbstractTDContainer*> &containers,
140  vector<prob_t> &event_probs,
141  vector<event_ind_t> &event_lengths,
142  vector<event_ind_t> &event_classes,
143  vector<seq_len_t> &event_col_num,
144  vector<prob_t> &laplace,
145  vector<seq_len_t> &min_D_len_vec) = 0;
146 
147 
151  bool findGenes(const vector<string> &names, const GeneSegmentAlphabet &gsa, string &err_message) const {
152  unordered_set<string> nameset;
153 
154  for (size_t i = 0; i < names.size(); ++i) {
155  if (gsa[names[i]].index == 0) {
156  err_message = "ERROR: can't find '" + names[i] + "' in gene segments.";
157  return false;
158  }
159  nameset.insert(names[i]);
160  }
161 
162  for (size_t i = 1; i <= gsa.max(); ++i) {
163  if (nameset.count(gsa[i].allele) == 0) {
164  err_message = "ERROR: can't find '" + gsa[i].allele + "' in this file.";
165  return false;
166  }
167  }
168 
169  return true;
170  }
171 
172 
176  vector<seg_index_t> arrangeNames(const vector<string> &names, const GeneSegmentAlphabet &gsa) const {
177  vector<seg_index_t> res;
178  res.resize(names.size(), 0);
179 
180  for (size_t i = 0; i < names.size(); ++i) { res[i] = gsa[names[i]].index; }
181 
182  return res;
183  }
184 
185 
186  void addGenes(AbstractTDContainer *container,
187  const GeneSegmentAlphabet &gsa,
188  vector<prob_t> &event_probs,
189  vector<event_ind_t> &event_lengths,
190  vector<event_ind_t> &event_classes,
191  vector<seq_len_t> &event_col_num,
192  vector<prob_t> &laplace) const {
193  vector<seg_index_t> name_order = this->arrangeNames(container->row_names(), gsa);
194  vector<prob_t> prob_data;
195  prob_data.resize(container->data(0).size(), 0);
196  for (size_t i = 0; i < name_order.size(); ++i) {
197  prob_data[name_order[i] - 1] = container->data(0)[i];
198  }
199  event_probs.insert(event_probs.end(),
200  prob_data.begin(),
201  prob_data.end());
202  event_lengths.push_back(prob_data.size());
203  event_col_num.push_back(0);
204  laplace.push_back(container->laplace());
205 
206  event_classes.push_back(0);
207  }
208 
209 
210  void addGenes(AbstractTDContainer *container,
211  const GeneSegmentAlphabet &gsa_row,
212  const GeneSegmentAlphabet &gsa_column,
213  vector<prob_t> &event_probs,
214  vector<event_ind_t> &event_lengths,
215  vector<event_ind_t> &event_classes,
216  vector<seq_len_t> &event_col_num,
217  vector<prob_t> &laplace,
218  seg_index_t prev_class_size) const {
219  vector<seg_index_t> name_order_row = this->arrangeNames(container->row_names(), gsa_row);
220  vector<seg_index_t> name_order_column = this->arrangeNames(container->column_names(), gsa_column);
221  vector<prob_t> prob_data = container->data(0), sorted_prob_data;
222  sorted_prob_data.resize(prob_data.size(), 0);
223  for (size_t i = 0; i < name_order_row.size(); ++i) {
224  for (size_t j = 0; j < name_order_column.size(); ++j) {
225  sorted_prob_data[(name_order_row[i] - 1) * container->n_columns() + (name_order_column[j] - 1)] = prob_data[i * container->n_columns() + j];
226  }
227  }
228  event_probs.insert(event_probs.end(),
229  sorted_prob_data.begin(),
230  sorted_prob_data.end());
231  event_lengths.push_back(sorted_prob_data.size());
232  event_col_num.push_back(container->n_columns());
233  laplace.push_back(container->laplace());
234 
235  if (prev_class_size) {
236  event_classes.push_back(event_classes[event_classes.size() - 1] + prev_class_size);
237  } else {
238  event_classes.push_back(0);
239  }
240  }
241 
242 
243  void addDels(AbstractTDContainer *container,
244  const GeneSegmentAlphabet &gsa,
245  vector<prob_t> &event_probs,
246  vector<event_ind_t> &event_lengths,
247  vector<event_ind_t> &event_classes,
248  vector<seq_len_t> &event_col_num,
249  vector<prob_t> &laplace,
250  seg_index_t prev_class_size) const {
251  vector<seg_index_t> name_order = this->arrangeNames(container->column_names(), gsa);
252  vector<prob_t> prob_data;
253  for (size_t i = 0; i < name_order.size(); ++i) {
254  // find correct segment for i-th position
255  size_t j = 0;
256  for (; i+1 != name_order[j] ; ++j) {}
257  prob_data = container->data(j);
258 
259  // add trailing zeros if distribution is smaller than a gene length
260  if (prob_data.size() < gsa[name_order[j]].sequence.size() + 1) {
261  prob_data.resize(gsa[name_order[j]].sequence.size() + 1, 0);
262  }
263 
264  // remove trailing zeros
265  if (prob_data.size() > gsa[name_order[j]].sequence.size() + 1) {
266  prob_data.resize(gsa[name_order[j]].sequence.size() + 1);
267  }
268 
269  event_probs.insert(event_probs.end(),
270  prob_data.begin(),
271  prob_data.end());
272  event_lengths.push_back(prob_data.size());
273  event_col_num.push_back(0);
274  laplace.push_back(container->laplace());
275  }
276  event_classes.push_back(event_classes[event_classes.size() - 1] + prev_class_size);
277  }
278 
279 
280  void addDels2D(AbstractTDContainer *container,
281  const GeneSegmentAlphabet &gsa,
282  vector<prob_t> &event_probs,
283  vector<event_ind_t> &event_lengths,
284  vector<event_ind_t> &event_classes,
285  vector<seq_len_t> &event_col_num,
286  vector<prob_t> &laplace,
287  seg_index_t prev_class_size) const {
288  vector<seg_index_t> name_order = this->arrangeNames(container->row_names(), gsa);
289  vector<prob_t> prob_data;
290  for (size_t i = 0; i < name_order.size(); ++i) {
291  // find correct segment for i-th position
292  size_t j = 0;
293  for (; i+1 != name_order[j] ; ++j) {}
294  prob_data = container->data(j);
295 
296  if (prob_data.size() > gsa[name_order[j]].sequence.size() + 1) {
297  vector<prob_t> new_prob_data((gsa[name_order[j]].sequence.size() + 1) * (gsa[name_order[j]].sequence.size() + 1));
298  for (auto row_i = 0; row_i < gsa[name_order[j]].sequence.size() + 1; ++row_i) {
299  for (auto col_i = 0; col_i < gsa[name_order[j]].sequence.size() + 1; ++col_i) {
300  new_prob_data[row_i * (gsa[name_order[j]].sequence.size() + 1) + col_i] = prob_data[row_i * (gsa[name_order[j]].sequence.size() + 1) + col_i];
301  }
302  }
303  prob_data = new_prob_data;
304  }
305 
306  event_probs.insert(event_probs.end(),
307  prob_data.begin(),
308  prob_data.end());
309  event_lengths.push_back(prob_data.size());
310  event_col_num.push_back(container->metadata(j));
311  laplace.push_back(container->laplace());
312  }
313  event_classes.push_back(event_classes[event_classes.size() - 1] + prev_class_size);
314  }
315 
316 
317  void addIns(AbstractTDContainer *container,
318  vector<prob_t> &event_probs,
319  vector<event_ind_t> &event_lengths,
320  vector<event_ind_t> &event_classes,
321  vector<seq_len_t> &event_col_num,
322  vector<prob_t> &laplace,
323  seg_index_t prev_class_size,
324  seq_len_t max_ins_len = 0) const {
325  vector<prob_t> prob_data;
326  for (size_t i = 0; i < container->n_columns(); ++i) {
327  prob_data = container->data(i);
328  if (max_ins_len) { prob_data.resize(max_ins_len); }
329  event_probs.insert(event_probs.end(),
330  prob_data.begin(),
331  prob_data.end());
332  event_lengths.push_back(prob_data.size());
333  event_col_num.push_back(0);
334  laplace.push_back(container->laplace());
335  event_classes.push_back(event_classes[event_classes.size() - 1] + prev_class_size);
336  prev_class_size = 1;
337  }
338  }
339 
340  };
341 
342 
343  class VJModelParser : public ModelParser {
344  public:
345 
346  VJModelParser(const std::string &model_path, Json::Value config, ModelBehaviour behav)
347  : ModelParser(model_path, config, behav)
348  {
349  }
350 
351 
352  bool parseGeneSegments() {
353  cout << "\tV gene seg.: ";
354  if (_config.get("segments", Json::Value("")).get("variable", Json::Value("")).size() == 0) {
355  cout << "ERROR: no gene segments file in the model's .json." << endl;
356 
357  return false;
358  } else {
359  cout << "OK" << endl;
360  }
361  string v_path = _model_path + _config.get("segments", Json::Value("")).get("variable", Json::Value("")).get("file", "").asString();
362 
363  cout << "\tJ gene seg.: ";
364  if (_config.get("segments", Json::Value("")).get("joining", Json::Value("")).size() == 0) {
365  cout << "ERROR: no gene segments file in the model's .json." << endl;
366 
367  return false;
368  } else {
369  cout << "OK" << endl;
370  }
371  string j_path = _model_path + _config.get("segments", Json::Value("")).get("joining", Json::Value("")).get("file", "").asString();
372 
373  bool vok, jok;
374 
375  _genes.reset(new VDJRecombinationGenes("VJ.V", v_path, "VJ.J", j_path, &vok, &jok));
376 
377  if (vok && jok) {
378  _genes->appendPalindromicNucleotides(VARIABLE,
379  _config.get("segments", Json::Value("")).get("variable", Json::Value("")).get("P.nuc.3'", 0).asUInt(),
380  _config.get("segments", Json::Value("")).get("variable", Json::Value("")).get("P.nuc.5'", 0).asUInt());
381  _genes->appendPalindromicNucleotides(JOINING,
382  _config.get("segments", Json::Value("")).get("joining", Json::Value("")).get("P.nuc.3'", 0).asUInt(),
383  _config.get("segments", Json::Value("")).get("joining", Json::Value("")).get("P.nuc.5'", 0).asUInt());
384  }
385 
386  return vok && jok;
387  }
388 
389  protected:
390 
391  virtual void createContainers(vector<AbstractTDContainer*> &containers) {
392  AbstractTDContainer* container;
393 
394  // V-J
395  container = new TDMatrix(true, _config.get("probtables", Json::Value()).get("v.j", Json::Value()).get("laplace", .0).asDouble());
396  for (auto i = 1; i <= _genes->V().max(); ++i){
397  container->addRowName(_genes->V()[i].allele);
398  }
399  for (auto i = 1; i <= _genes->J().max(); ++i){
400  container->addColumnName(_genes->J()[i].allele);
401  }
402  container->addDataVector(vector<prob_t>(_genes->V().max() * _genes->J().max()));
403  containers[VJ_VAR_JOI_GEN] = container;
404 
405  // V del
406  container = new TDVectorList(true, _config.get("probtables", Json::Value()).get("v.del", Json::Value()).get("laplace", .0).asDouble());
407  for (auto i = 1; i <= _genes->V().max(); ++i) {
408  container->addColumnName(_genes->V()[i].allele);
409  container->addDataVector(vector<prob_t>(_genes->V()[i].sequence.size() + 1));
410  }
411  containers[VJ_VAR_DEL] = container;
412 
413  // J del
414  container = new TDVectorList(true, _config.get("probtables", Json::Value()).get("j.del", Json::Value()).get("laplace", .0).asDouble());
415  for (auto i = 1; i <= _genes->J().max(); ++i){
416  container->addColumnName(_genes->J()[i].allele);
417  container->addDataVector(vector<prob_t>(_genes->J()[i].sequence.size() + 1));
418  }
419  containers[VJ_JOI_DEL] = container;
420 
421  // VJ ins
422  container = new TDVectorList(true, _config.get("probtables", Json::Value()).get("ins.len", Json::Value()).get("laplace", .0).asDouble());
423  container->addColumnName("VJ ins len");
424  container->addDataVector(vector<prob_t>(_config.get("probtables", Json::Value()).get("ins.len", Json::Value()).get("max.len", DEFAULT_MAX_INS_LENGTH).asUInt64() + 1));
425  containers[VJ_VAR_JOI_INS_LEN] = container;
426 
427  // VJ nuc
428  container = new TDVectorList(true, _config.get("probtables", Json::Value()).get("ins.nucl", Json::Value()).get("laplace", .0).asDouble());
429  container->addColumnName("VJ nucs");
430  container->addDataVector(vector<prob_t>(4));
431  containers[VJ_VAR_JOI_INS_NUC] = container;
432  }
433 
434 
435  virtual void parseDataContainer(const string &element, AbstractTDContainer *container, vector<AbstractTDContainer*> &containers) {
436  std::string err_message = "NOT FOUND";
437  if (element == "v.j") {
438  if (container
439  && container->file_exists()
440  && this->findGenes(container->column_names(), _genes->J(), err_message)
441  && this->findGenes(container->row_names(), _genes->V(), err_message))
442  {
443  containers[VJ_VAR_JOI_GEN] = container;
444  err_message = "OK";
445  }
446 
447  cout << "\tV-J gene pairs: " << err_message << endl;
448  }
449  else if (element == "v.del") {
450  if (container
451  && container->file_exists()
452  && this->findGenes(container->column_names(), _genes->V(), err_message))
453  {
454  containers[VJ_VAR_DEL] = container;
455  err_message = "OK";
456  }
457 
458  cout << "\tV delet. num.: " << err_message << endl;
459  }
460  else if (element == "j.del") {
461  if (container
462  && container->file_exists()
463  && this->findGenes(container->column_names(), _genes->J(), err_message))
464  {
465  containers[VJ_JOI_DEL] = container;
466  err_message = "OK";
467  }
468 
469  cout << "\tJ delet. num.: " << err_message << endl;
470  }
471  else if (element == "ins.len") {
472  if (container && container->file_exists()) {
473  if (container->n_columns() != 1) {
474  stringstream ss;
475  ss << "ERROR: wrong number of columns (expected: 1, got: " << (int) container->n_columns() << ")";
476  err_message = ss.str();
477  } else {
478  containers[VJ_VAR_JOI_INS_LEN] = container;
479  err_message = "OK";
480  }
481  }
482 
483  cout << "\tVJ ins. len.: " << err_message << endl;
484  }
485  else if (element == "ins.nucl") {
486  if (container && container->file_exists()) {
487  if (container->n_rows() != 4 || container->n_columns() != 1) {
488  stringstream ss;
489  ss << "ERROR: wrong number of columns and rows (expected: 4 X 1, got: " << (int) container->n_rows() << " X " << (int) container->n_columns() << ")";
490  err_message = ss.str();
491  } else {
492  containers[VJ_VAR_JOI_INS_NUC] = container;
493  err_message = "OK";
494  }
495  }
496 
497  cout << "\tVJ ins. nuc.: " << err_message << endl;
498  }
499  else { cerr << "Unrecognised element in \'probtables\'" << ":\n\t" << element << endl; }
500  }
501 
502 
503  virtual bool makeModelParameterVector(vector<AbstractTDContainer*> &containers,
504  vector<prob_t> &event_probs,
505  vector<event_ind_t> &event_lengths,
506  vector<event_ind_t> &event_classes,
507  vector<seq_len_t> &event_col_num,
508  vector<prob_t> &laplace,
509  vector<seq_len_t> &min_D_len_vec)
510  {
511  bool is_ok = false;
512 
513  if (containers[VJ_VAR_JOI_GEN]
514  && containers[VJ_VAR_DEL]
515  && containers[VJ_JOI_DEL]
516  && containers[VJ_VAR_JOI_INS_LEN]
517  && containers[VJ_VAR_JOI_INS_NUC]) {
518 
519  this->addGenes(containers[VJ_VAR_JOI_GEN],
520  _genes->V(),
521  _genes->J(),
522  event_probs,
523  event_lengths,
524  event_classes,
525  event_col_num,
526  laplace,
527  0);
528 
529  this->addDels(containers[VJ_VAR_DEL],
530  _genes->V(),
531  event_probs,
532  event_lengths,
533  event_classes,
534  event_col_num,
535  laplace,
536  1);
537 
538  this->addDels(containers[VJ_JOI_DEL],
539  _genes->J(),
540  event_probs,
541  event_lengths,
542  event_classes,
543  event_col_num,
544  laplace,
545  containers[VJ_VAR_DEL]->n_columns());
546 
547  this->addIns(containers[VJ_VAR_JOI_INS_LEN],
548  event_probs,
549  event_lengths,
550  event_classes,
551  event_col_num,
552  laplace,
553  containers[VJ_JOI_DEL]->n_columns(),
554  _config.get("probtables", Json::Value()).get("ins.len", Json::Value()).get("max.len", DEFAULT_MAX_INS_LENGTH).asUInt64() + 1);
555 
556  this->addIns(containers[VJ_VAR_JOI_INS_NUC],
557  event_probs,
558  event_lengths,
559  event_classes,
560  event_col_num,
561  laplace,
562  1);
563 
564  _param_vec.reset(new ModelParameterVector(VJ_RECOMB, event_probs, event_lengths, event_classes, event_col_num, laplace, _config.get("errors", 0).asDouble()));
565  is_ok = true;
566  }
567 
568  return is_ok;
569  }
570 
571  };
572 
573 
574  class VDJModelParser : public ModelParser {
575 
576  public:
577 
578  VDJModelParser(const std::string &model_path, Json::Value config, ModelBehaviour behav)
579  : ModelParser(model_path, config, behav)
580  {
581  }
582 
583 
584  bool parseGeneSegments() {
585  cout << "\tV gene seg.: ";
586  if (_config.get("segments", Json::Value("")).get("variable", Json::Value("")).size() == 0) {
587  cout << "ERROR: no gene segments file in the model's .json." << endl;
588 
589  return false;
590  } else {
591  cout << "OK" << endl;
592  }
593  string v_path = _model_path + _config.get("segments", Json::Value("")).get("variable", Json::Value("")).get("file", "").asString();
594 
595  cout << "\tJ gene seg.: ";
596  if (_config.get("segments", Json::Value("")).get("joining", Json::Value("")).size() == 0) {
597  cout << "ERROR: no gene segments file in the model's .json." << endl;
598 
599  return false;
600  } else {
601  cout << "OK" << endl;
602  }
603  string j_path = _model_path + _config.get("segments", Json::Value("")).get("joining", Json::Value("")).get("file", "").asString();
604 
605  bool vok, jok, dok = true;
606 
607  cout << "\tD gene seg.: ";
608  if (_config.get("segments", Json::Value("")).get("diversity", Json::Value("")).size() == 0) {
609  cout << "ERROR: no gene segments file in the model's .json." << endl;
610 
611  return false;
612  } else {
613  string d_path = _model_path + _config.get("segments", Json::Value("")).get("diversity", Json::Value("")).get("file", "").asString();
614  _genes.reset(new VDJRecombinationGenes("VDJ.V", v_path, "VDJ.J", j_path, "VDJ.D", d_path, &vok, &jok, &dok));
615  _min_D_len = _config.get("segments", Json::Value("")).get("diversity", Json::Value("")).get("min.len", DEFAULT_DIV_GENE_MIN_LEN).asUInt();
616  cout << "OK" << endl;
617  }
618 
619  if (vok && jok && dok) {
620  _genes->appendPalindromicNucleotides(VARIABLE,
621  _config.get("segments", Json::Value("")).get("variable", Json::Value("")).get("P.nuc.3'", 0).asUInt(),
622  _config.get("segments", Json::Value("")).get("variable", Json::Value("")).get("P.nuc.5'", 0).asUInt());
623  _genes->appendPalindromicNucleotides(JOINING,
624  _config.get("segments", Json::Value("")).get("joining", Json::Value("")).get("P.nuc.3'", 0).asUInt(),
625  _config.get("segments", Json::Value("")).get("joining", Json::Value("")).get("P.nuc.5'", 0).asUInt());
626  if (_genes->is_vdj()) {
627  _genes->appendPalindromicNucleotides(DIVERSITY,
628  _config.get("segments", Json::Value("")).get("diversity", Json::Value("")).get("P.nuc.3'", 0).asUInt(),
629  _config.get("segments", Json::Value("")).get("diversity", Json::Value("")).get("P.nuc.5'", 0).asUInt());
630  }
631  }
632 
633  return vok && jok && dok;
634  }
635 
636  protected:
637 
638  virtual void createContainers(vector<AbstractTDContainer*> &containers) {
639  AbstractTDContainer* container;
640 
641  // V
642  container = new TDVector(true, _config.get("probtables", Json::Value()).get("v", Json::Value()).get("laplace", .0).asDouble());
643  container->addDataVector(vector<prob_t>());
644  for (auto i = 1; i <= _genes->V().max(); ++i) {
645  container->addRowName(_genes->V()[i].allele);
646  container->addDataValue(1);
647  }
648  containers[VDJ_VAR_GEN] = container;
649  cout << "\tV genes prob.: " << "CREATED" << endl;
650 
651  // J-D
652  container = new TDMatrix(true, _config.get("probtables", Json::Value()).get("j.d", Json::Value()).get("laplace", .0).asDouble());
653  for (auto i = 1; i <= _genes->J().max(); ++i) {
654  container->addRowName(_genes->J()[i].allele);
655  }
656  for (auto i = 1; i <= _genes->D().max(); ++i) {
657  container->addColumnName(_genes->D()[i].allele);
658  }
659  container->addDataVector(vector<prob_t>(_genes->D().max() * _genes->J().max()));
660  containers[VDJ_JOI_DIV_GEN] = container;
661  cout << "\tJ-D gene pairs: " << "CREATED" << endl;
662 
663  // V del
664  container = new TDVectorList(true, _config.get("probtables", Json::Value()).get("v.del", Json::Value()).get("laplace", .0).asDouble());
665  for (auto i = 1; i <= _genes->V().max(); ++i) {
666  container->addColumnName(_genes->V()[i].allele);
667  container->addDataVector(vector<prob_t>(_genes->V()[i].sequence.size() + 1));
668  }
669  containers[VDJ_VAR_DEL] = container;
670  cout << "\tV delet. num.: " << "CREATED" << endl;;
671 
672  // J del
673  container = new TDVectorList(true, _config.get("probtables", Json::Value()).get("j.del", Json::Value()).get("laplace", .0).asDouble());
674  for (auto i = 1; i <= _genes->J().max(); ++i) {
675  container->addColumnName(_genes->J()[i].allele);
676  container->addDataVector(vector<prob_t>(_genes->J()[i].sequence.size() + 1));
677  }
678  containers[VDJ_JOI_DEL] = container;
679  cout << "\tJ delet. num.: " << "CREATED" << endl;
680 
681  // D del
682  container = new TDMatrixList(true, _config.get("probtables", Json::Value()).get("d.del", Json::Value()).get("laplace", .0).asDouble());
683  for (auto i = 1; i <= _genes->D().max(); ++i) {
684  container->addColumnName(_genes->D()[i].allele);
685  container->addDataVector(vector<prob_t>( (_genes->D()[i].sequence.size() + 1) * (_genes->D()[i].sequence.size() + 1) ));
686  container->addRowName(_genes->D()[i].allele);
687  container->addMetadata(_genes->D()[i].sequence.size() + 1);
688  }
689  containers[VDJ_DIV_DEL] = container;
690  cout << "\tD delet. num.: " << "CREATED" << endl;
691 
692  // VD ins + DJ ins
693  container = new TDVectorList(true, _config.get("probtables", Json::Value()).get("ins.len", Json::Value()).get("laplace", .0).asDouble());
694  container->addColumnName("VD ins");
695  container->addColumnName("DJ ins");
696  container->addDataVector(vector<prob_t>(_config.get("probtables", Json::Value()).get("ins.len", Json::Value()).get("max.len", DEFAULT_MAX_INS_LENGTH).asUInt64() + 1));
697  container->addDataVector(vector<prob_t>(_config.get("probtables", Json::Value()).get("ins.len", Json::Value()).get("max.len", DEFAULT_MAX_INS_LENGTH).asUInt64() + 1));
698  containers[VDJ_VAR_DIV_INS_LEN] = container;
699  cout << "\tVD/DJ ins. len.: " << "CREATED" << endl;
700 
701  // VD nuc + DJ nuc
702  container = new TDVectorList(true, _config.get("probtables", Json::Value()).get("ins.nucl", Json::Value()).get("laplace", .0).asDouble());
703  for (auto i = 0; i < 8; ++i) {
704  container->addColumnName("VD/DJ nucs");
705  container->addDataVector(vector<prob_t>(4));
706  }
707  containers[VDJ_VAR_DIV_INS_NUC] = container;
708  cout << "\tVD/DJ ins. nuc.: " << "CREATED" << endl;
709  }
710 
711 
712  virtual void parseDataContainer(const string &element, AbstractTDContainer *container, vector<AbstractTDContainer*> &containers) {
713  std::string err_message = "NOT FOUND";
714  if (element == "v") {
715  if (container && container->file_exists()) {
716  containers[VDJ_VAR_GEN] = container;
717  err_message = "OK";
718  }
719  cout << "\tV genes prob.: " << err_message << endl;
720  }
721  else if (element == "j.d") {
722  if (container
723  && container->file_exists()
724  && this->findGenes(container->column_names(), _genes->D(), err_message)
725  && this->findGenes(container->row_names(), _genes->J(), err_message))
726  {
727  containers[VDJ_JOI_DIV_GEN] = container;
728  err_message = "OK";
729  }
730 
731  cout << "\tJ-D gene pairs: " << err_message << endl;
732  }
733  else if (element == "v.del") {
734  if (container
735  && container->file_exists()
736  && this->findGenes(container->column_names(), _genes->V(), err_message))
737  {
738  containers[VDJ_VAR_DEL] = container;
739  err_message = "OK";
740  }
741 
742  cout << "\tV delet. num.: " << err_message << endl;;
743  }
744  else if (element == "j.del") {
745  if (container
746  && container->file_exists()
747  && this->findGenes(container->column_names(), _genes->J(), err_message))
748  {
749  containers[VDJ_JOI_DEL] = container;
750  err_message = "OK";
751  }
752 
753  cout << "\tJ delet. num.: " << err_message << endl;
754  }
755  else if (element == "d.del") {
756  if (container && container->file_exists()) {
757  containers[VDJ_DIV_DEL] = container;
758  err_message = "OK";
759  }
760 
761  cout << "\tD delet. num.: " << err_message << endl;
762  }
763  else if (element == "ins.len") {
764  if (container && container->file_exists()) {
765  if (container->n_columns() != 2) {
766  stringstream ss;
767  ss << "ERROR: wrong number of columns (expected: 2, got: " << (int) container->n_columns() << ")";
768  err_message = ss.str();
769  } else {
770  containers[VDJ_VAR_DIV_INS_LEN] = container;
771  err_message = "OK";
772  }
773  }
774 
775  cout << "\tVD/DJ ins. len.: " << err_message << endl;
776  }
777  else if (element == "ins.nucl") {
778  if (container && container->file_exists()) {
779  if (container->n_rows() != 4 || container->n_columns() != 8) {
780  stringstream ss;
781  ss << "ERROR: wrong number of columns and rows (expected: 4 X 8, got: " << (int) container->n_rows() << " X " << (int) container->n_columns() << ")";
782  err_message = ss.str();
783  } else {
784  containers[VDJ_VAR_DIV_INS_NUC] = container;
785  err_message = "OK";
786  }
787  }
788 
789  cout << "\tVD/DJ ins. nuc.: " << err_message << endl;
790  }
791  else { std::cout << "Unrecognised element in \'probtables\'" << ":\n\t" << element << std::endl; }
792  }
793 
794 
795  virtual bool makeModelParameterVector(vector<AbstractTDContainer*> &containers,
796  vector<prob_t> &event_probs,
797  vector<event_ind_t> &event_lengths,
798  vector<event_ind_t> &event_classes,
799  vector<seq_len_t> &event_col_num,
800  vector<prob_t> &laplace,
801  vector<seq_len_t> &min_D_len_vec)
802  {
803  bool is_ok = false;
804 
805  if (containers[VDJ_VAR_GEN]
806  && containers[VDJ_JOI_DIV_GEN]
807  && containers[VDJ_VAR_DEL]
808  && containers[VDJ_JOI_DEL]
809  && containers[VDJ_DIV_DEL]
810  && containers[VDJ_VAR_DIV_INS_LEN]
811  && containers[VDJ_VAR_DIV_INS_NUC]) {
812 
813  this->addGenes(containers[VDJ_VAR_GEN],
814  _genes->V(),
815  event_probs,
816  event_lengths,
817  event_classes,
818  event_col_num,
819  laplace);
820 
821  this->addGenes(containers[VDJ_JOI_DIV_GEN],
822  _genes->J(),
823  _genes->D(),
824  event_probs,
825  event_lengths,
826  event_classes,
827  event_col_num,
828  laplace,
829  1);
830 
831  this->addDels(containers[VDJ_VAR_DEL],
832  _genes->V(),
833  event_probs,
834  event_lengths,
835  event_classes,
836  event_col_num,
837  laplace,
838  1);
839 
840  this->addDels(containers[VDJ_JOI_DEL],
841  _genes->J(),
842  event_probs,
843  event_lengths,
844  event_classes,
845  event_col_num,
846  laplace,
847  containers[VDJ_VAR_DEL]->n_columns());
848 
849  this->addDels2D(containers[VDJ_DIV_DEL],
850  _genes->D(),
851  event_probs,
852  event_lengths,
853  event_classes,
854  event_col_num,
855  laplace,
856  containers[VDJ_JOI_DEL]->n_columns());
857 
858  this->addIns(containers[VDJ_VAR_DIV_INS_LEN],
859  event_probs,
860  event_lengths,
861  event_classes,
862  event_col_num,
863  laplace,
864  containers[VDJ_DIV_DEL]->n_rows(),
865  _config.get("probtables", Json::Value()).get("ins.len", Json::Value()).get("max.len", DEFAULT_MAX_INS_LENGTH).asUInt64() + 1);
866 
867  this->addIns(containers[VDJ_VAR_DIV_INS_NUC],
868  event_probs,
869  event_lengths,
870  event_classes,
871  event_col_num,
872  laplace,
873  1);
874 
875  for (seg_index_t i = 1; i <= _genes->D().max(); ++i) { min_D_len_vec.push_back(_min_D_len); }
876  _param_vec.reset(new ModelParameterVector(VDJ_RECOMB, event_probs, event_lengths, event_classes, event_col_num, laplace, _config.get("errors", 0).asDouble(), true, min_D_len_vec));
877  is_ok = true;
878  }
879 
880  return is_ok;
881  }
882 
883  };
884 
885 
886 // class VD2JModelParser : public ModelParser {
887 //
888 // };
889 }
890 
891 
892 #endif //YMIR_MODEL_PARSER_H
Definition: aligner.h:37
Definition: model_parser.h:15
Definition: textdata.h:540
Vector of gene segments.
Definition: textdata.h:138
Definition: genesegment.h:265
Definition: model_parser.h:574
Definition: textdata.h:28
Definition: textdata.h:401
Definition: model_parser.h:343
Class for storing parameters of assembling statistical model. Note: event with index 0 (zero) is "nul...
Definition: modelparametervector.h:68
List of std::vectors for deletions and insertions.
Definition: textdata.h:258
Definition: genesegment.h:44