Ymir  .9
Fast\C++toolforcomputationofassemblingprobabilities,statisticalinferenceofassemblingstatisticalmodelandgenerationofartificialsequencesofT-cellreceptorsdata.
parser.h
1 /*
2  * Ymir <imminfo.github.io/ymir>
3  *
4  * This file is part of Ymir, a fast C++ tool for computation of assembling
5  * probabilities, statistical inference of assembling statistical model
6  * and generation of artificial sequences of T-cell receptors data.
7  *
8  *
9  * Copyright 2015 Vadim Nazarov <vdn at mailbox dot com>
10  *
11  * Licensed under the Apache License, Version 2.0 (the "License");
12  * you may not use this file except in compliance with the License.
13  * You may obtain a copy of the License at
14  *
15  * http://www.apache.org/licenses/LICENSE-2.0
16  *
17  * Unless required by applicable law or agreed to in writing, software
18  * distributed under the License is distributed on an "AS IS" BASIS,
19  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20  * See the License for the specific language governing permissions and
21  * limitations under the License.
22  */
23 
24 #ifndef _PARSER_H
25 #define _PARSER_H
26 
27 
28 #include "iostream"
29 #include "fstream"
30 #include "sstream"
31 
32 #include "aligner.h"
33 #include "clonotype_builder.h"
34 #include "errcorr.h"
35 #include "repertoire.h"
36 
37 
38 using std::getline;
39 
40 
41 namespace ymir {
42 
47 
49  {
50  this->reset();
51  }
52 
53  void reset() {
54  count_all = 0;
55  bad_V_seg = 0;
56  bad_D_seg = 0;
57  bad_J_seg = 0;
58  no_V_algn = 0;
59  no_D_algn = 0;
60  no_J_algn = 0;
61  bad_V_len = 0;
62  bad_J_len = 0;
63  }
64 
65  void print() {
66  std::cout <<
67  "Parsing complete. Parsed " << (size_t) count_all << " lines, found clonotypes with:" << std::endl <<
68  "\tunrecognised V segments:\t" << (size_t) bad_V_seg << std::endl <<
69  "\tunrecognised D segments:\t" << (size_t) bad_D_seg << std::endl <<
70  "\tunrecognised J segments:\t" << (size_t) bad_J_seg << std::endl <<
71  "\tunaligned V segments:\t" << (size_t) no_V_algn << std::endl <<
72  "\tunaligned D segments:\t" << (size_t) no_D_algn << std::endl <<
73  "\tunaligned J segments:\t" << (size_t) no_J_algn << std::endl <<
74  "\tbad alignment length of V segments:\t" << (size_t) bad_V_len << " (repaired)" << std::endl <<
75  "\tbad alignment length of J segments:\t" << (size_t) bad_J_len << " (repaired)" << std::endl <<
76  "Resulting cloneset size: " << (size_t) count_all << std::endl;
77  }
78 
79 
80  template <GeneSegments GENE>
81  void update_bad_seg();
82 
83 
84  template <GeneSegments GENE>
85  void update_no_algn();
86 
87 
88  template <GeneSegments GENE>
89  void update_bad_len();
90 
91 
92  size_t count_all, bad_V_seg, bad_D_seg, bad_J_seg, no_V_algn, no_D_algn, no_J_algn, bad_V_len, bad_J_len;
93 
94  };
95 
96 
97 // enum AlignmentColumnsAction {
98 // USE_PROVIDED,
99 // ALIGN_ONLY_D,
100 // ALIGN_ALL
101 // };
102 
103 
105 
113  enum Action {
114  OVERWRITE,
115  USE_PROVIDED
116  };
117 
118  Action align_V, align_J, align_D;
119 
121 
122 
123  AlignmentColumnOptions(Action V, Action J)
124  : align_V(V), align_J(J)
125  {
126  }
127 
128 
129  AlignmentColumnOptions(Action V, Action D, Action J)
130  : align_V(V), align_D(D), align_J(J)
131  {
132  }
133 
134 
135  AlignmentColumnOptions& setV(Action action) {
136  this->align_V = action;
137  return *this;
138  }
139 
140  AlignmentColumnOptions& setJ(Action action) {
141  this->align_J = action;
142  return *this;
143  }
144 
145  AlignmentColumnOptions& setD(Action action) {
146  this->align_D = action;
147  return *this;
148  }
149  };
150 
151 
159  template <typename Aligner >
161 
162  public:
163 
169  typedef Json::Value ParserConfig;
170 
171 
172  RepertoireParser() {
173 // _config_is_loaded = false;
174  }
175 
176 
191  bool open(const std::string &filepath,
192  const VDJRecombinationGenes &gene_segments,
193  SequenceType seq_type,
194  Recombination recomb,
195  AlignmentColumnOptions opts = AlignmentColumnOptions().setV(AlignmentColumnOptions::USE_PROVIDED).setJ(AlignmentColumnOptions::USE_PROVIDED).setD(AlignmentColumnOptions::OVERWRITE),
197  if (_stream.is_open()) { _stream.close(); }
198 
199  _status = false;
200  _read_header = true;
201  _stats.reset();
202 
203  if (recomb == UNDEF_RECOMB) {
204  std::cout << "Repertoire parser error:" << "\tno recombination type for [" << filepath << "]" << endl;
205  return false;
206  }
207 
208  _stream.open(filepath);
209  if (_stream.good()) {
210  std::cout << "Open [" << filepath << "] for reading" << endl;
211  _genes = gene_segments;
212  std::cout << "-- gene segments assigned to the parser" << endl;
213  _seq_type = seq_type;
214  _recomb = recomb;
215  _opts = opts;
216  std::cout << "-- options assigned" << endl;
217  _status = true;
218  _aligner.set_genes(_genes);
219  std::cout << "-- gene segments assigned to the aligner" << endl;
220  _aligner.set_parameters(params);
221  std::cout << "-- parameters assigned to the aligner" << endl;
222  return true;
223  } else {
224  std::cout << "Repertoire parser error:" << "\tinput file [" << filepath << "] not found" << endl;
225  }
226 
227  return false;
228  }
229 
230 
234  bool parse(Cloneset *cloneset, size_t max_clonotype_count = (size_t)-1) {
235  if (_stream.eof()) {
236  _stats.print();
237  _stats.reset();
238  _status = false;
239  _stream.close();
240  return false;
241  }
242 
243  if (!_status) {
244  std::cout << "Something bad is happening - can\'t parse the input file. Perhaps you need to open it first with open()?" << endl;
245  return false;
246  }
247 
248  ClonotypeVector clonevec;
249  clonevec.reserve(DEFAULT_REPERTOIRE_RESERVE_SIZE);
250 
251  this->parseRepertoire(clonevec, max_clonotype_count);
252  cloneset->swap(clonevec);
253 
254  return true;
255  }
256 
257 
258  bool openAndParse(const std::string &filepath,
259  Cloneset *cloneset,
260  const VDJRecombinationGenes &gene_segments,
261  SequenceType seq_type,
262  Recombination recomb,
263  AlignmentColumnOptions opts = AlignmentColumnOptions().setV(AlignmentColumnOptions::USE_PROVIDED).setJ(AlignmentColumnOptions::USE_PROVIDED).setD(AlignmentColumnOptions::OVERWRITE),
265  if (this->open(filepath, gene_segments, seq_type, recomb, opts, params)) {
266  this->parse(cloneset);
267  _stats.print();
268  if (_stream.is_open()) { _stream.close(); }
269  return true;
270  }
271 
272  return false;
273  }
274 
275  protected:
276 
277  std::ifstream _stream;
278  VDJRecombinationGenes _genes;
279  Recombination _recomb;
281  SequenceType _seq_type;
283  Aligner _aligner;
284  bool _status;
285  bool _read_header;
286 
287 
288  void parseRepertoire(ClonotypeVector& vec, size_t max_clonotype_count)
289  {
290  char column_sep ='\t',
291  segment_sep = ',',
292  internal_sep = '|',
293  alignment_sep = ';',
294  start_bracket = '(',
295  end_bracket = ')';
296 
297  bool do_align_V = _opts.align_V == AlignmentColumnOptions::OVERWRITE,
298  do_align_J = _opts.align_J == AlignmentColumnOptions::OVERWRITE,
299  do_align_D = _opts.align_D == AlignmentColumnOptions::OVERWRITE;
300 
301  stringstream column_stream, symbol_stream, temp_stream;
302  string line, segment_word, sequence;
303 
304  int clonotype_num = 0, line_num = _stats.count_all + 1;
305 
306  bool align_ok = true;
307 
308  vector<seg_index_t> vseg, jseg, dseg;
309  string temp_str;
310 
311  _aligner.setSequenceType(_seq_type);
312  _aligner.setRecombination(_recomb);
313 
314  // Skip header
315  if (_read_header) {
316  getline(_stream, line);
317  _read_header = false;
318  }
319  while (!_stream.eof() && clonotype_num < max_clonotype_count) {
320  // Start processing clonotypes
321  getline(_stream, line);
322 
323  if (line.size() > 2) {
324  // parse body and build clonotypes from each line
325  vseg.clear();
326  jseg.clear();
327  dseg.clear();
328 
329  column_stream.clear();
330  column_stream.str(line);
331 
332  //
333  // Get nucleotide or amino acid sequence
334  //
335  if (_seq_type == NUCLEOTIDE) {
336  getline(column_stream, sequence, column_sep);
337  column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
338  } else {
339  column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
340  getline(column_stream, sequence, column_sep);
341  }
342  _aligner.setSequence(sequence);
343 
344  //
345  // Parse Variable genes
346  //
347  if (do_align_V) {
348  column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
349  } else {
350  getline(column_stream, segment_word, column_sep);
351  this->parseSegment<VARIABLE>(symbol_stream, segment_word, vseg, _genes.V(), line_num, segment_sep, temp_str);
352  }
353 
354  //
355  // Parse Diversity genes
356  //
357  if (_recomb == VJ_RECOMB) {
358  column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
359  } else {
360  if (do_align_D) {
361  column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
362  } else {
363  getline(column_stream, segment_word, column_sep);
364  this->parseSegment<DIVERSITY>(symbol_stream, segment_word, dseg, _genes.D(), line_num, segment_sep, temp_str);
365  }
366  }
367 
368  //
369  // Parse Joining genes
370  //
371  if (do_align_J) {
372  column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
373  } else {
374  getline(column_stream, segment_word, column_sep);
375  this->parseSegment<JOINING>(symbol_stream, segment_word, jseg, _genes.J(), line_num, segment_sep, temp_str);
376  }
377 
378  //
379  // Parse Variable alignments
380  //
381  if (do_align_V) {
382  column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
383  if (!_aligner.alignVar()) {
384  _stats.update_no_algn<VARIABLE>();
385  }
386  } else {
387  getline(column_stream, segment_word, column_sep);
388  this->parseAlignment(symbol_stream, segment_word, vseg, VARIABLE, _genes.V(), line_num, segment_sep, internal_sep, temp_str, temp_stream);
389  }
390 
391  //
392  // Parse Diversity alignments
393  //
394  if (_recomb == VJ_RECOMB) {
395  column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
396  } else {
397  if (do_align_D) {
398  column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
399  if (!_aligner.alignDiv()) {
400  _stats.update_no_algn<DIVERSITY>();
401  }
402  } else {
403  getline(column_stream, segment_word, column_sep);
404  this->parseAlignment(symbol_stream, segment_word, dseg, DIVERSITY, _genes.D(), line_num, segment_sep, internal_sep, temp_str, temp_stream);
405  }
406  }
407 
408  //
409  // Parse Joining alignments
410  //
411  if (do_align_J) {
412  column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
413  if (!_aligner.alignJoi()) {
414  _stats.update_no_algn<JOINING>();
415  }
416  } else {
417  getline(column_stream, segment_word, column_sep);
418  this->parseAlignment(symbol_stream, segment_word, jseg, JOINING, _genes.J(), line_num, segment_sep, internal_sep, temp_str, temp_stream);
419  }
420 
421  ++clonotype_num;
422  ++_stats.count_all;
423  if (_stats.count_all % 50000 == 0) {
424  cout << "Parsed " << (size_t) _stats.count_all << " lines" << endl;
425  }
426 
427  //
428  // TODO: remove bad clonotypes here ???
429  //
430  vec.push_back(_aligner.buildClonotype());
431  }
432  }
433  }
434 
435 
436  template <GeneSegments GENE>
437  void parseSegment(stringstream &symbol_stream,
438  const string &segment_word,
439  vector<seg_index_t> &segvec,
440  const GeneSegmentAlphabet &gsa,
441  size_t line_num,
442  char segment_sep,
443  string &temp_str)
444  {
445  symbol_stream.clear();
446  symbol_stream.str(segment_word);
447 
448  while (!symbol_stream.eof()) {
449  getline(symbol_stream, temp_str, segment_sep);
450  if (gsa[temp_str].index != 0) {
451  segvec.push_back(gsa[temp_str].index);
452  } else {
453  _stats.update_bad_seg<GENE>();
454  }
455  }
456  }
457 
458 
459  void parseAlignment(stringstream &symbol_stream,
460  const string &segment_word,
461  const vector<seg_index_t> &segvec,
462  GeneSegments gene,
463  const GeneSegmentAlphabet &gsa,
464  size_t line_num,
465  char segment_sep,
466  char internal_sep,
467  string &temp_str,
468  stringstream &temp_stream)
469  {
470  symbol_stream.clear();
471  symbol_stream.str(segment_word);
472 
473  int gene_start, seq_start, alignment_len;
474  seg_index_t seg_order = 0;
475 
476  while (!symbol_stream.eof()) {
477  getline(symbol_stream, temp_str, segment_sep);
478 
479  temp_stream.clear();
480  temp_stream.str(temp_str);
481 
482  getline(temp_stream, temp_str, internal_sep);
483  gene_start = std::atoi(temp_str.c_str());
484 
485  getline(temp_stream, temp_str, internal_sep);
486  seq_start = std::atoi(temp_str.c_str());
487 
488  getline(temp_stream, temp_str, internal_sep);
489  alignment_len = std::atoi(temp_str.c_str());
490 
491  if (alignment_len > gsa[segvec[seg_order]].sequence.size()) {
492  alignment_len = gsa[segvec[seg_order]].sequence.size();
493  if (gene == VARIABLE) {
494  ++_stats.bad_V_len;
495  } else if (gene == JOINING) {
496  ++_stats.bad_J_len;
497  }
498  }
499 
500  // "0" in gene_start means that there is no information from what letter
501  // the J gene segment was started, so Ymir by default will compute it
502  // assuming that J segment is aligned at the very end of the input sequence.
503  if (gene_start == 0) {
504  gene_start = gsa[segvec[seg_order]].sequence.size() - alignment_len + 1;
505  }
506 
507  _aligner.addAlignment(gene, segvec[seg_order], gene_start, seq_start, alignment_len);
508 
509  ++seg_order;
510  }
511  }
512 
513 
514  std::string get_prefix(const string &filename) const {
515  return "[" + filename + "]: ";
516  }
517 
518  };
519 
520 
521  template <>
522  void RepertoireParserStatistics::update_bad_seg<VARIABLE>() { ++bad_V_seg; }
523 
524 
525  template <>
526  void RepertoireParserStatistics::update_bad_seg<DIVERSITY>() { ++bad_D_seg; }
527 
528 
529  template <>
530  void RepertoireParserStatistics::update_bad_seg<JOINING>() { ++bad_J_seg; }
531 
532 
533  template <>
534  void RepertoireParserStatistics::update_no_algn<VARIABLE>() { ++no_V_algn; }
535 
536 
537  template <>
538  void RepertoireParserStatistics::update_no_algn<DIVERSITY>() { ++no_D_algn; }
539 
540 
541  template <>
542  void RepertoireParserStatistics::update_no_algn<JOINING>() { ++no_J_algn; }
543 
544 //
545 // template <GeneSegments GENE>
546 // inline void add_alignment(Aligner &aligner, seg_index_t seg_index, seq_len_t genestart, seq_len_t seqstart, seq_len_t alignment_len);
547 //
548 //
549 // inline void add_alignment<Aligner, VARIABLE>(Aligner &aligner, seg_index_t seg_index, seq_len_t genestart, seq_len_t seqstart, seq_len_t alignment_len) {
550 // aligner.addVarAlignment(seg_index, genestart, seqstart, alignment_len);
551 // };
552 
553 
554 
556 
557 
559 
560 
562 
563 
565 
566 
567 // typedef RepertoireFileParser<NoGapAlignmentVector> RepertoireParser;
568 //
569 // typedef RepertoireFileParser<GappedAlignmentVector> ErrorCorrectingParser;
570 
571 }
572 
573 #endif
Json::Value ParserConfig
Parameters of this parser: parser&#39;s name, names of the columns, separators, sequences and other...
Definition: parser.h:169
Definition: parser.h:46
Definition: aligner.h:37
Parser for text files with repertoire data. By default it&#39;s MiTCR parser. To make new parsers inherit...
Definition: parser.h:160
Definition: aligner_parameters.h:129
Definition: genesegment.h:265
bool open(const std::string &filepath, const VDJRecombinationGenes &gene_segments, SequenceType seq_type, Recombination recomb, AlignmentColumnOptions opts=AlignmentColumnOptions().setV(AlignmentColumnOptions::USE_PROVIDED).setJ(AlignmentColumnOptions::USE_PROVIDED).setD(AlignmentColumnOptions::OVERWRITE), VDJAlignerParameters params=VDJAlignerParameters())
Parse text file with sequence alignment data and return constructed Cloneset.
Definition: parser.h:191
bool parse(Cloneset *cloneset, size_t max_clonotype_count=(size_t)-1)
Definition: parser.h:234
Definition: parser.h:104
Definition: repertoire.h:171
Definition: genesegment.h:44