33 #include "clonotype_builder.h"    35 #include "repertoire.h"    67                     "Parsing complete. Parsed " << (size_t) count_all << 
" lines, found clonotypes with:" << std::endl <<
    68                     "\tunrecognised V segments:\t" << (
size_t) bad_V_seg << std::endl <<
    69                     "\tunrecognised D segments:\t" << (size_t) bad_D_seg << std::endl <<
    70                     "\tunrecognised J segments:\t" << (
size_t) bad_J_seg << std::endl <<
    71                     "\tunaligned V segments:\t" << (size_t) no_V_algn << std::endl <<
    72                     "\tunaligned D segments:\t" << (
size_t) no_D_algn << std::endl <<
    73                     "\tunaligned J segments:\t" << (size_t) no_J_algn << std::endl <<
    74                     "\tbad alignment length of V segments:\t" << (
size_t) bad_V_len << 
" (repaired)" << std::endl <<
    75                     "\tbad alignment length of J segments:\t" << (size_t) bad_J_len << 
" (repaired)" << std::endl <<
    76                     "Resulting cloneset size: " << (
size_t) count_all << std::endl;
    80         template <GeneSegments GENE>
    81         void update_bad_seg();
    84         template <GeneSegments GENE>
    85         void update_no_algn();
    88         template <GeneSegments GENE>
    89         void update_bad_len();
    92         size_t count_all, bad_V_seg, bad_D_seg, bad_J_seg, no_V_algn, no_D_algn, no_J_algn, bad_V_len, bad_J_len;
   118         Action align_V, align_J, align_D;
   124             : align_V(V), align_J(J)
   130             : align_V(V), align_D(D), align_J(J)
   136             this->align_V = action;
   141             this->align_J = action;
   146             this->align_D = action;
   159     template <
typename Aligner >
   191         bool open(
const std::string &filepath,
   193                   SequenceType seq_type,
   194                   Recombination recomb,
   197             if (_stream.is_open()) { _stream.close(); }
   203             if (recomb == UNDEF_RECOMB) {
   204                 std::cout << 
"Repertoire parser error:" << 
"\tno recombination type for [" << filepath << 
"]" << endl;
   208             _stream.open(filepath);
   209             if (_stream.good()) {
   210                 std::cout << 
"Open [" << filepath << 
"] for reading" << endl;
   211                 _genes = gene_segments;
   212                 std::cout << 
"-- gene segments assigned to the parser" << endl;
   213                 _seq_type = seq_type;
   216                 std::cout << 
"-- options assigned" << endl;
   218                 _aligner.set_genes(_genes);
   219                 std::cout << 
"-- gene segments assigned to the aligner" << endl;
   220                 _aligner.set_parameters(params);
   221                 std::cout << 
"-- parameters assigned to the aligner" << endl;
   224                 std::cout << 
"Repertoire parser error:" << 
"\tinput file [" << filepath << 
"] not found" << endl;
   244                 std::cout << 
"Something bad is happening - can\'t parse the input file. Perhaps you need to open it first with open()?" << endl;
   248             ClonotypeVector clonevec;
   249             clonevec.reserve(DEFAULT_REPERTOIRE_RESERVE_SIZE);
   251             this->parseRepertoire(clonevec, max_clonotype_count);
   252             cloneset->swap(clonevec);
   258         bool openAndParse(
const std::string &filepath,
   261                           SequenceType seq_type,
   262                           Recombination recomb,
   265             if (this->open(filepath, gene_segments, seq_type, recomb, opts, params)) {
   266                 this->parse(cloneset);
   268                 if (_stream.is_open()) { _stream.close(); }
   277         std::ifstream _stream;
   279         Recombination _recomb;
   281         SequenceType _seq_type;
   288         void parseRepertoire(ClonotypeVector& vec, 
size_t max_clonotype_count)
   290             char column_sep =
'\t',
   297             bool do_align_V = _opts.align_V == AlignmentColumnOptions::OVERWRITE,
   298                  do_align_J = _opts.align_J == AlignmentColumnOptions::OVERWRITE,
   299                  do_align_D = _opts.align_D == AlignmentColumnOptions::OVERWRITE;
   301             stringstream column_stream, symbol_stream, temp_stream;
   302             string line, segment_word, sequence;
   304             int clonotype_num = 0, line_num = _stats.count_all + 1;
   306             bool align_ok = 
true;
   308             vector<seg_index_t> vseg, jseg, dseg;
   311             _aligner.setSequenceType(_seq_type);
   312             _aligner.setRecombination(_recomb);
   316                 getline(_stream, line);
   317                 _read_header = 
false;
   319             while (!_stream.eof() && clonotype_num < max_clonotype_count) {
   321                 getline(_stream, line);
   323                 if (line.size() > 2) {
   329                     column_stream.clear();
   330                     column_stream.str(line);
   335                     if (_seq_type == NUCLEOTIDE) {
   336                         getline(column_stream, sequence, column_sep);
   337                         column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
   339                         column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
   340                         getline(column_stream, sequence, column_sep);
   342                     _aligner.setSequence(sequence);
   348                         column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
   350                         getline(column_stream, segment_word, column_sep);
   351                         this->parseSegment<VARIABLE>(symbol_stream, segment_word, vseg, _genes.V(), line_num, segment_sep, temp_str);
   357                     if (_recomb == VJ_RECOMB) {
   358                         column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
   361                             column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
   363                             getline(column_stream, segment_word, column_sep);
   364                             this->parseSegment<DIVERSITY>(symbol_stream, segment_word, dseg, _genes.D(), line_num, segment_sep, temp_str);
   372                         column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
   374                         getline(column_stream, segment_word, column_sep);
   375                         this->parseSegment<JOINING>(symbol_stream, segment_word, jseg, _genes.J(), line_num, segment_sep, temp_str);
   382                         column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
   383                         if (!_aligner.alignVar()) {
   384                             _stats.update_no_algn<VARIABLE>();
   387                         getline(column_stream, segment_word, column_sep);
   388                         this->parseAlignment(symbol_stream, segment_word, vseg, VARIABLE, _genes.V(), line_num, segment_sep, internal_sep, temp_str, temp_stream);
   394                     if (_recomb == VJ_RECOMB) {
   395                         column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
   398                             column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
   399                             if (!_aligner.alignDiv()) {
   400                                 _stats.update_no_algn<DIVERSITY>();
   403                             getline(column_stream, segment_word, column_sep);
   404                             this->parseAlignment(symbol_stream, segment_word, dseg, DIVERSITY, _genes.D(), line_num, segment_sep, internal_sep, temp_str, temp_stream);
   412                         column_stream.ignore(numeric_limits<streamsize>::max(), column_sep);
   413                         if (!_aligner.alignJoi()) {
   414                             _stats.update_no_algn<JOINING>();
   417                         getline(column_stream, segment_word, column_sep);
   418                         this->parseAlignment(symbol_stream, segment_word, jseg, JOINING, _genes.J(), line_num, segment_sep, internal_sep, temp_str, temp_stream);
   423                     if (_stats.count_all % 50000 == 0) {
   424                         cout << 
"Parsed " << (size_t) _stats.count_all << 
" lines" << endl;
   430                     vec.push_back(_aligner.buildClonotype());
   436         template <GeneSegments GENE>
   437         void parseSegment(stringstream &symbol_stream,
   438                           const string &segment_word,
   439                           vector<seg_index_t> &segvec,
   445             symbol_stream.clear();
   446             symbol_stream.str(segment_word);
   448             while (!symbol_stream.eof()) {
   449                 getline(symbol_stream, temp_str, segment_sep);
   450                 if (gsa[temp_str].index != 0) {
   451                     segvec.push_back(gsa[temp_str].index);
   453                     _stats.update_bad_seg<GENE>();
   459         void parseAlignment(stringstream &symbol_stream,
   460                             const string &segment_word,
   461                             const vector<seg_index_t> &segvec,
   468                             stringstream &temp_stream)
   470             symbol_stream.clear();
   471             symbol_stream.str(segment_word);
   473             int gene_start, seq_start, alignment_len;
   474             seg_index_t seg_order = 0;
   476             while (!symbol_stream.eof()) {
   477                 getline(symbol_stream, temp_str, segment_sep);
   480                 temp_stream.str(temp_str);
   482                 getline(temp_stream, temp_str, internal_sep);
   483                 gene_start = std::atoi(temp_str.c_str());
   485                 getline(temp_stream, temp_str, internal_sep);
   486                 seq_start = std::atoi(temp_str.c_str());
   488                 getline(temp_stream, temp_str, internal_sep);
   489                 alignment_len = std::atoi(temp_str.c_str());
   491                 if (alignment_len > gsa[segvec[seg_order]].sequence.size()) {
   492                     alignment_len = gsa[segvec[seg_order]].sequence.size();
   493                     if (gene == VARIABLE) {
   495                     } 
else if (gene == JOINING) {
   503                 if (gene_start == 0) {
   504                     gene_start = gsa[segvec[seg_order]].sequence.size() - alignment_len + 1;
   507                 _aligner.addAlignment(gene, segvec[seg_order], gene_start, seq_start, alignment_len);
   514         std::string get_prefix(
const string &filename)
 const {
   515             return "[" + filename + 
"]: ";
   522     void RepertoireParserStatistics::update_bad_seg<VARIABLE>() { ++bad_V_seg; }
   526     void RepertoireParserStatistics::update_bad_seg<DIVERSITY>() { ++bad_D_seg; }
   530     void RepertoireParserStatistics::update_bad_seg<JOINING>() { ++bad_J_seg; }
   534     void RepertoireParserStatistics::update_no_algn<VARIABLE>() { ++no_V_algn; }
   538     void RepertoireParserStatistics::update_no_algn<DIVERSITY>() { ++no_D_algn; }
   542     void RepertoireParserStatistics::update_no_algn<JOINING>() { ++no_J_algn; }
 Json::Value ParserConfig
Parameters of this parser: parser's name, names of the columns, separators, sequences and other...
Definition: parser.h:169
 
Parser for text files with repertoire data. By default it's MiTCR parser. To make new parsers inherit...
Definition: parser.h:160
 
Definition: aligner_parameters.h:129
 
Definition: genesegment.h:265
 
bool open(const std::string &filepath, const VDJRecombinationGenes &gene_segments, SequenceType seq_type, Recombination recomb, AlignmentColumnOptions opts=AlignmentColumnOptions().setV(AlignmentColumnOptions::USE_PROVIDED).setJ(AlignmentColumnOptions::USE_PROVIDED).setD(AlignmentColumnOptions::OVERWRITE), VDJAlignerParameters params=VDJAlignerParameters())
Parse text file with sequence alignment data and return constructed Cloneset. 
Definition: parser.h:191
 
bool parse(Cloneset *cloneset, size_t max_clonotype_count=(size_t)-1)
Definition: parser.h:234
 
Definition: repertoire.h:171
 
Definition: genesegment.h:44