33 #include "clonotype_builder.h" 35 #include "repertoire.h" 67 "Parsing complete. Parsed " << (size_t) count_all <<
" lines, found clonotypes with:" << std::endl <<
68 "\tunrecognised V segments:\t" << (
size_t) bad_V_seg << std::endl <<
69 "\tunrecognised D segments:\t" << (size_t) bad_D_seg << std::endl <<
70 "\tunrecognised J segments:\t" << (
size_t) bad_J_seg << std::endl <<
71 "\tunaligned V segments:\t" << (size_t) no_V_algn << std::endl <<
72 "\tunaligned D segments:\t" << (
size_t) no_D_algn << std::endl <<
73 "\tunaligned J segments:\t" << (size_t) no_J_algn << std::endl <<
74 "\tbad alignment length of V segments:\t" << (
size_t) bad_V_len <<
" (repaired)" << std::endl <<
75 "\tbad alignment length of J segments:\t" << (size_t) bad_J_len <<
" (repaired)" << std::endl <<
76 "Resulting cloneset size: " << (
size_t) count_all << std::endl;
80 template <GeneSegments GENE>
81 void update_bad_seg();
84 template <GeneSegments GENE>
85 void update_no_algn();
88 template <GeneSegments GENE>
89 void update_bad_len();
92 size_t count_all, bad_V_seg, bad_D_seg, bad_J_seg, no_V_algn, no_D_algn, no_J_algn, bad_V_len, bad_J_len;
118 Action align_V, align_J, align_D;
124 : align_V(V), align_J(J)
130 : align_V(V), align_D(D), align_J(J)
136 this->align_V = action;
141 this->align_J = action;
146 this->align_D = action;
159 template <
typename Aligner >
191 bool open(
const std::string &filepath,
193 SequenceType seq_type,
194 Recombination recomb,
197 if (_stream.is_open()) { _stream.close(); }
203 if (recomb == UNDEF_RECOMB) {
204 std::cout <<
"Repertoire parser error:" <<
"\tno recombination type for [" << filepath <<
"]" << endl;
208 _stream.open(filepath);
209 if (_stream.good()) {
210 std::cout <<
"Open [" << filepath <<
"] for reading" << endl;
211 _genes = gene_segments;
212 std::cout <<
"-- gene segments assigned to the parser" << endl;
213 _seq_type = seq_type;
216 std::cout <<
"-- options assigned" << endl;
218 _aligner.set_genes(_genes);
219 std::cout <<
"-- gene segments assigned to the aligner" << endl;
220 _aligner.set_parameters(params);
221 std::cout <<
"-- parameters assigned to the aligner" << endl;
224 std::cout <<
"Repertoire parser error:" <<
"\tinput file [" << filepath <<
"] not found" << endl;
244 std::cout <<
"Something bad is happening - can\'t parse the input file. Perhaps you need to open it first with open()?" << endl;
248 ClonotypeVector clonevec;
249 clonevec.reserve(DEFAULT_REPERTOIRE_RESERVE_SIZE);
251 this->parseRepertoire(clonevec, max_clonotype_count);
252 cloneset->swap(clonevec);
258 bool openAndParse(
const std::string &filepath,
261 SequenceType seq_type,
262 Recombination recomb,
265 if (this->open(filepath, gene_segments, seq_type, recomb, opts, params)) {
266 this->parse(cloneset);
268 if (_stream.is_open()) { _stream.close(); }
277 std::ifstream _stream;
279 Recombination _recomb;
281 SequenceType _seq_type;
288 void parseRepertoire(ClonotypeVector& vec,
size_t max_clonotype_count)
290 char column_sep =
'\t',
297 bool do_align_V = _opts.align_V == AlignmentColumnOptions::OVERWRITE,
298 do_align_J = _opts.align_J == AlignmentColumnOptions::OVERWRITE,
299 do_align_D = _opts.align_D == AlignmentColumnOptions::OVERWRITE;
301 stringstream column_stream, symbol_stream, temp_stream;
302 string line, segment_word, sequence;
304 int clonotype_num = 0, line_num = _stats.count_all + 1;
306 bool align_ok =
true;
308 vector<seg_index_t> vseg, jseg, dseg;
311 _aligner.setSequenceType(_seq_type);
312 _aligner.setRecombination(_recomb);
316 getline(_stream, line);
317 _read_header =
false;
319 while (!_stream.eof() && clonotype_num < max_clonotype_count) {
321 getline(_stream, line);
323 if (line.size() > 2) {
329 column_stream.clear();
330 column_stream.str(line);
335 if (_seq_type == NUCLEOTIDE) {
336 getline(column_stream, sequence, column_sep);
337 column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
339 column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
340 getline(column_stream, sequence, column_sep);
342 _aligner.setSequence(sequence);
348 column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
350 getline(column_stream, segment_word, column_sep);
351 this->parseSegment<VARIABLE>(symbol_stream, segment_word, vseg, _genes.V(), line_num, segment_sep, temp_str);
357 if (_recomb == VJ_RECOMB) {
358 column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
361 column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
363 getline(column_stream, segment_word, column_sep);
364 this->parseSegment<DIVERSITY>(symbol_stream, segment_word, dseg, _genes.D(), line_num, segment_sep, temp_str);
372 column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
374 getline(column_stream, segment_word, column_sep);
375 this->parseSegment<JOINING>(symbol_stream, segment_word, jseg, _genes.J(), line_num, segment_sep, temp_str);
382 column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
383 if (!_aligner.alignVar()) {
384 _stats.update_no_algn<VARIABLE>();
387 getline(column_stream, segment_word, column_sep);
388 this->parseAlignment(symbol_stream, segment_word, vseg, VARIABLE, _genes.V(), line_num, segment_sep, internal_sep, temp_str, temp_stream);
394 if (_recomb == VJ_RECOMB) {
395 column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
398 column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
399 if (!_aligner.alignDiv()) {
400 _stats.update_no_algn<DIVERSITY>();
403 getline(column_stream, segment_word, column_sep);
404 this->parseAlignment(symbol_stream, segment_word, dseg, DIVERSITY, _genes.D(), line_num, segment_sep, internal_sep, temp_str, temp_stream);
412 column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
413 if (!_aligner.alignJoi()) {
414 _stats.update_no_algn<JOINING>();
417 getline(column_stream, segment_word, column_sep);
418 this->parseAlignment(symbol_stream, segment_word, jseg, JOINING, _genes.J(), line_num, segment_sep, internal_sep, temp_str, temp_stream);
423 if (_stats.count_all % 50000 == 0) {
424 cout <<
"Parsed " << (size_t) _stats.count_all <<
" lines" << endl;
430 vec.push_back(_aligner.buildClonotype());
436 template <GeneSegments GENE>
437 void parseSegment(stringstream &symbol_stream,
438 const string &segment_word,
439 vector<seg_index_t> &segvec,
445 symbol_stream.clear();
446 symbol_stream.str(segment_word);
448 while (!symbol_stream.eof()) {
449 getline(symbol_stream, temp_str, segment_sep);
450 if (gsa[temp_str].index != 0) {
451 segvec.push_back(gsa[temp_str].index);
453 _stats.update_bad_seg<GENE>();
459 void parseAlignment(stringstream &symbol_stream,
460 const string &segment_word,
461 const vector<seg_index_t> &segvec,
468 stringstream &temp_stream)
470 symbol_stream.clear();
471 symbol_stream.str(segment_word);
473 int gene_start, seq_start, alignment_len;
474 seg_index_t seg_order = 0;
476 while (!symbol_stream.eof()) {
477 getline(symbol_stream, temp_str, segment_sep);
480 temp_stream.str(temp_str);
482 getline(temp_stream, temp_str, internal_sep);
483 gene_start = std::atoi(temp_str.c_str());
485 getline(temp_stream, temp_str, internal_sep);
486 seq_start = std::atoi(temp_str.c_str());
488 getline(temp_stream, temp_str, internal_sep);
489 alignment_len = std::atoi(temp_str.c_str());
491 if (alignment_len > gsa[segvec[seg_order]].sequence.size()) {
492 alignment_len = gsa[segvec[seg_order]].sequence.size();
493 if (gene == VARIABLE) {
495 }
else if (gene == JOINING) {
503 if (gene_start == 0) {
504 gene_start = gsa[segvec[seg_order]].sequence.size() - alignment_len + 1;
507 _aligner.addAlignment(gene, segvec[seg_order], gene_start, seq_start, alignment_len);
514 std::string get_prefix(
const string &filename)
const {
515 return "[" + filename +
"]: ";
522 void RepertoireParserStatistics::update_bad_seg<VARIABLE>() { ++bad_V_seg; }
526 void RepertoireParserStatistics::update_bad_seg<DIVERSITY>() { ++bad_D_seg; }
530 void RepertoireParserStatistics::update_bad_seg<JOINING>() { ++bad_J_seg; }
534 void RepertoireParserStatistics::update_no_algn<VARIABLE>() { ++no_V_algn; }
538 void RepertoireParserStatistics::update_no_algn<DIVERSITY>() { ++no_D_algn; }
542 void RepertoireParserStatistics::update_no_algn<JOINING>() { ++no_J_algn; }
Json::Value ParserConfig
Parameters of this parser: parser's name, names of the columns, separators, sequences and other...
Definition: parser.h:169
Parser for text files with repertoire data. By default it's MiTCR parser. To make new parsers inherit...
Definition: parser.h:160
Definition: aligner_parameters.h:129
Definition: genesegment.h:265
bool open(const std::string &filepath, const VDJRecombinationGenes &gene_segments, SequenceType seq_type, Recombination recomb, AlignmentColumnOptions opts=AlignmentColumnOptions().setV(AlignmentColumnOptions::USE_PROVIDED).setJ(AlignmentColumnOptions::USE_PROVIDED).setD(AlignmentColumnOptions::OVERWRITE), VDJAlignerParameters params=VDJAlignerParameters())
Parse text file with sequence alignment data and return constructed Cloneset.
Definition: parser.h:191
bool parse(Cloneset *cloneset, size_t max_clonotype_count=(size_t)-1)
Definition: parser.h:234
Definition: repertoire.h:171
Definition: genesegment.h:44