Ymir  .9
Fast\C++toolforcomputationofassemblingprobabilities,statisticalinferenceofassemblingstatisticalmodelandgenerationofartificialsequencesofT-cellreceptorsdata.
genesegment.h
1 /*
2  * Ymir <imminfo.github.io/ymir>
3  *
4  * This file is part of Ymir, a fast C++ tool for computation of assembling
5  * probabilities, statistical inference of assembling statistical model
6  * and generation of artificial sequences of T-cell receptors data.
7  *
8  *
9  * Copyright 2015 Vadim Nazarov <vdn at mailbox dot com>
10  *
11  * Licensed under the Apache License, Version 2.0 (the "License");
12  * you may not use this file except in compliance with the License.
13  * You may obtain a copy of the License at
14  *
15  * http://www.apache.org/licenses/LICENSE-2.0
16  *
17  * Unless required by applicable law or agreed to in writing, software
18  * distributed under the License is distributed on an "AS IS" BASIS,
19  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20  * See the License for the specific language governing permissions and
21  * limitations under the License.
22  */
23 
24 #ifndef _GENESEGMENT_H
25 #define _GENESEGMENT_H
26 
27 
28 #include <fstream>
29 #include <sstream>
30 
31 #include "types.h"
32 #include "tools.h"
33 
34 
35 namespace ymir {
36 
37  class GeneSegmentAlphabet;
38  class VDJRecombinationGenes;
39 
40 
45  public:
46 
50  struct GeneSegment {
51  sequence_t allele;
52  sequence_t sequence;
53  std::string orig_sequence;
54  seg_index_t index;
55 
56  GeneSegment(const std::string& allele_,
57  const std::string& sequence_,
58  seg_index_t index_)
59  : allele(allele_), sequence(sequence_), orig_sequence(sequence_), index(index_) {
60  }
61  };
62 
63 
65  {
66  }
67 
68 
72  GeneSegmentAlphabet(GeneSegments gene_segment,
73  const std::string& name,
74  const std::string& filepath,
75  bool *is_ok = nullptr)
76  : _name(name)
77  {
78  bool res = this->read(filepath);
79  if (is_ok) {
80  *is_ok = res;
81  }
82  _gene_segment = gene_segment;
83  }
84 
85 
86  GeneSegmentAlphabet(GeneSegments gene_segment,
87  const std::string& name,
88  const std::vector<std::string>& alleles,
89  const std::vector<std::string>& sequences) {
90  this->_name = name;
91 
92  if (alleles.size() != sequences.size()) {
93  std::cout << "Gene segment alphabet [" << this->_name << "] warning:" << std::endl << "\talleles and sequences vectors have different sizes" << std::endl;
94  }
95 
96  size_t min_size = std::min(alleles.size(), sequences.size());
97  this->_vec.reserve(min_size);
98 
99  for (size_t i = 0; i < min_size; i++) {
100  this->addGeneSegment(alleles[i], sequences[i], i + 1);
101  }
102 
103  _gene_segment = gene_segment;
104  }
105 
106 
107  const std::string& name() const { return this->_name; }
108 
109 
110  const GeneSegment& operator[] (seg_index_t index) const {
111 #ifndef DNDEBUG
112  check_and_throw(index == 0 || index > _vec.size(),
113  "Gene segment alphabet [" + _name + "] error:\tindex " + std::to_string(index) + "is out of bounds");
114 #endif
115  return _vec[index - 1];
116  }
117 
118  const GeneSegment& operator[] (const std::string& name) const {
119 #ifndef DNDEBUG
120  check_and_throw(_map.find(name) == _map.end(),
121  "Gene segment alphabet [" + _name + "] error:\tcan't find allele " + name);
122 #endif
123  return _vec[_map.at(name)];
124  }
125 
126 
127  seg_index_t size() const { return this->_vec.size(); }
128 
129 
130  seg_index_t max() const { return this->_vec.size(); }
131 
132 
140  void appendPalindromicNucleotides(seq_len_t from_5_end, seq_len_t from_3_end) {
141  std::string seq_5_end = "", seq_3_end = "";
142  for (auto i = 0; i < _vec.size(); ++i) {
143  seq_5_end = "";
144  for (auto left_i = 0; left_i < from_5_end; ++ left_i) {
145  seq_5_end = complement(_vec[i].orig_sequence[left_i]) + seq_5_end;
146  }
147 
148  seq_3_end = "";
149  for (auto right_i = 0; right_i < from_3_end; ++right_i) {
150  seq_3_end = seq_3_end + complement(_vec[i].orig_sequence[_vec[i].orig_sequence.size() - 1 - right_i]);
151  }
152 
153  _vec[i].sequence.reserve(from_5_end + _vec[i].orig_sequence.size() + from_3_end + 2);
154  _vec[i].sequence = seq_5_end + _vec[i].orig_sequence + seq_3_end;
155  }
156  }
157 
158 
162  bool write(const std::string& filepath) const {
163  if (filepath.empty()) {
164  std::cout << "Gene segment alphabet [" << this->_name << "] error:" << std::endl << "\tinput file name is an empty string" << std::endl;
165  return false;
166  }
167 
168  std::ofstream ofs;
169  ofs.open(filepath);
170 
171  if (ofs.is_open()) {
172  ofs << this->_name << '\t' << "Sequences" << std::endl;
173  for (seg_index_t i = 0; i < this->_vec.size(); ++i) {
174  ofs << _vec[i].allele << '\t' << _vec[i].orig_sequence;
175  if (i != _vec.size() - 1) {
176  ofs << std::endl;
177  }
178  }
179  } else {
180  return false;
181  }
182 
183  ofs.close();
184  return true;
185  }
186 
187 
188  bool read(const std::string& filepath) {
189  if (filepath.empty()) {
190  std::cout << "Gene segment alphabet [" << this->_name << "] error:" << std::endl << "\tinput file name is an empty string" << std::endl;
191  return false;
192  }
193 
194  std::ifstream ifs;
195  ifs.open(filepath);
196 
197  if (ifs.is_open()) {
198  std::cout << "Gene segment alphabet [" << this->_name << "] reading input file [" << filepath << "]" << std::endl;
199  std::stringstream line_stream;
200  std::string line, gene_name, gene_seq;
201  bool header = false;
202  seg_index_t cur_index = 1;
203  while (!ifs.eof()) {
204  getline(ifs, line);
205  if (line[0] != '\n' && line.size() > 3) {
206  line_stream.str(line);
207  if (header) {
208  // gene segment name
209  getline(line_stream, gene_name, '\t');
210  // gene segment sequence
211  getline(line_stream, gene_seq, '\t');
212  this->addGeneSegment(gene_name, gene_seq, cur_index);
213  ++cur_index;
214  } else {
215  // skip header
216  header = true;
217  }
218  line_stream.clear();
219  }
220  }
221  } else {
222  std::cout << "Gene segment alphabet [" << this->_name << "] error:" << std::endl << "\tinput file [" << filepath << "] not found" << std::endl;
223  return false;
224  }
225 
226  ifs.close();
227  return true;
228  }
229 
230 
231  seq_len_t maxLength() const {
232  seq_len_t res = 0;
233  for (auto i = 1; i < _vec.size(); ++i) {
234  res = std::max(res, (seq_len_t) _vec[i].sequence.size());
235  }
236  return res;
237  }
238 
239 
240  GeneSegments gene_segment() const { return _gene_segment; }
241 
242  protected:
243 
244  std::string _name;
245  std::unordered_map<std::string, seg_index_t> _map;
246  std::vector<GeneSegment> _vec;
247  GeneSegments _gene_segment;
248 
249 
250  void addGeneSegment(const std::string& name, const std::string& seq, seg_index_t index) {
251  if (name.empty() || seq.empty()) {
252  std::cout << "Gene segment alphabet [" << this->_name << "] warning:" << std::endl << "\tempty string for name or sequence. Skipping." << std::endl;
253  } else {
254  this->_map[name] = index - 1;
255  this->_vec.push_back(GeneSegment(name, seq, index));
256  }
257  }
258 
259  };
260 
261 
266 
267  typedef unique_ptr<GeneSegmentAlphabet> GSA_ptr;
268 
269  public:
270 
271 
273  {
274  }
275 
276 
277  VDJRecombinationGenes(const std::string& v_segments_name, const std::string& v_segments_file,
278  const std::string& j_segments_name, const std::string& j_segments_file,
279  bool *is_V_ok = nullptr, bool *is_J_ok = nullptr)
280  : _V(new GeneSegmentAlphabet(VARIABLE, v_segments_name, v_segments_file, is_V_ok)),
281  _J(new GeneSegmentAlphabet(JOINING, j_segments_name, j_segments_file, is_J_ok))
282  {
283  std::cout << "Loaded " << std::to_string(_V->size()) << " V alleles." << std::endl;
284  std::cout << "Loaded " << std::to_string(_J->size()) << " J alleles." << std::endl;
285  }
286 
287 
288  VDJRecombinationGenes(const std::string& v_segments_name, const std::string& v_segments_file,
289  const std::string& j_segments_name, const std::string& j_segments_file,
290  const std::string& d_segments_name, const std::string& d_segments_file,
291  bool *is_V_ok = nullptr, bool *is_J_ok = nullptr, bool *is_D_ok = nullptr)
292  : _V(new GeneSegmentAlphabet(VARIABLE, v_segments_name, v_segments_file, is_V_ok)),
293  _J(new GeneSegmentAlphabet(JOINING, j_segments_name, j_segments_file, is_J_ok)),
294  _D(new GeneSegmentAlphabet(DIVERSITY, d_segments_name, d_segments_file, is_D_ok))
295  {
296  std::cout << "Loaded " << std::to_string(_V->size()) << " V alleles." << std::endl;
297  std::cout << "Loaded " << std::to_string(_D->size()) << " D alleles." << std::endl;
298  std::cout << "Loaded " << std::to_string(_J->size()) << " J alleles." << std::endl;
299  }
300 
301 
302  VDJRecombinationGenes(const std::string& V_name, const std::vector<std::string>& V_alleles, const std::vector<std::string>& V_sequences,
303  const std::string& J_name, const std::vector<std::string>& J_alleles, const std::vector<std::string>& J_sequences)
304  : _V(new GeneSegmentAlphabet(VARIABLE, V_name, V_alleles, V_sequences)),
305  _J(new GeneSegmentAlphabet(JOINING, J_name, J_alleles, J_sequences))
306  {
307  std::cout << "Loaded " << std::to_string(_V->size()) << " V alleles." << std::endl;
308  std::cout << "Loaded " << std::to_string(_J->size()) << " J alleles." << std::endl;
309  }
310 
311 
312  VDJRecombinationGenes(const std::string& V_name, const std::vector<std::string>& V_alleles, const std::vector<std::string>& V_sequences,
313  const std::string& J_name, const std::vector<std::string>& J_alleles, const std::vector<std::string>& J_sequences,
314  const std::string& D_name, const std::vector<std::string>& D_alleles, const std::vector<std::string>& D_sequences)
315  : _V(new GeneSegmentAlphabet(VARIABLE, V_name, V_alleles, V_sequences)),
316  _J(new GeneSegmentAlphabet(JOINING, J_name, J_alleles, J_sequences)),
317  _D(new GeneSegmentAlphabet(DIVERSITY, D_name, D_alleles, D_sequences))
318  {
319  std::cout << "Loaded " << std::to_string(_V->size()) << " V alleles." << std::endl;
320  std::cout << "Loaded " << std::to_string(_D->size()) << " D alleles." << std::endl;
321  std::cout << "Loaded " << std::to_string(_J->size()) << " J alleles." << std::endl;
322  }
323 
324 
325  VDJRecombinationGenes(const VDJRecombinationGenes &other)
326  : _V(new GeneSegmentAlphabet(other.V())),
327  _J(new GeneSegmentAlphabet(other.J()))
328  {
329  if (other._D) {
330  _D.reset(new GeneSegmentAlphabet(other.D()));
331  }
332  }
333 
334 
335  VDJRecombinationGenes& operator=(const VDJRecombinationGenes &other) {
336  _V.reset(new GeneSegmentAlphabet(other.V()));
337  _J.reset(new GeneSegmentAlphabet(other.J()));
338 
339  if (other._D) {
340  _D.reset(new GeneSegmentAlphabet(other.D()));
341  } else {
342  _D.release();
343  }
344 
345  return *this;
346  }
347 
348 
349  bool is_vdj() const { return (bool) _D; }
350 
351 
355  const GeneSegmentAlphabet& V() const {
357 #ifndef DNDEBUG
358  check_and_throw(!(bool) _V, "V pointer is null");
359 #endif
360  return *_V;
361  }
362 
363  const GeneSegmentAlphabet& J() const {
364 #ifndef DNDEBUG
365  check_and_throw(!(bool) _J, "J pointer is null");
366 #endif
367  return *_J;
368  }
369 
370  const GeneSegmentAlphabet& D() const {
371 #ifndef DNDEBUG
372  check_and_throw(!(bool) _D, "D pointer is null");
373 #endif
374  return *_D;
375  }
377 
378 
382  void appendPalindromicNucleotides(GeneSegments gene, seq_len_t from_5_end = 4, seq_len_t from_3_end = 4) {
383  if (gene == VARIABLE) {
384  this->_V->appendPalindromicNucleotides(from_5_end, from_3_end);
385  } else if (gene == JOINING) {
386  this->_J->appendPalindromicNucleotides(from_5_end, from_3_end);
387  } else if (gene == DIVERSITY) {
388  this->_D->appendPalindromicNucleotides(from_5_end, from_3_end);
389  }
390  }
391 
392 
393  bool write(const std::string& v_segments_file = "vsegments.txt",
394  const std::string& j_segments_file = "jsegments.txt",
395  const std::string& d_segments_file = "dsegments.txt") const
396  {
397  if (this->_V->write(v_segments_file)) {
398  if (this->_J->write(j_segments_file)) {
399  if (!this->is_vdj()) {
400  return true;
401  } else {
402  return this->_D->write(d_segments_file);
403  }
404  }
405  }
406  return false;
407  }
408 
409  protected:
410 
411  GSA_ptr _V, _J, _D;
412 
413  };
414 }
415 
416 #endif
Definition: aligner.h:37
Definition: genesegment.h:265
Definition: genesegment.h:50
void appendPalindromicNucleotides(seq_len_t from_5_end, seq_len_t from_3_end)
Append P(alindromic)-nucleotides to all gene segment sequences from 3&#39;end, 5&#39;end or both...
Definition: genesegment.h:140
Definition: genesegment.h:44
void appendPalindromicNucleotides(GeneSegments gene, seq_len_t from_5_end=4, seq_len_t from_3_end=4)
Definition: genesegment.h:382
bool write(const std::string &filepath) const
Save segment alphabet to the given file as a tab-separated table with 3 columns.
Definition: genesegment.h:162