Ymir  .9
Fast\C++toolforcomputationofassemblingprobabilities,statisticalinferenceofassemblingstatisticalmodelandgenerationofartificialsequencesofT-cellreceptorsdata.
tools.h
1 /*
2  * Ymir <imminfo.github.io/ymir>
3  *
4  * This file is part of Ymir, a fast C++ tool for computation of assembling
5  * probabilities, statistical inference of assembling statistical model
6  * and generation of artificial sequences of T-cell receptors data.
7  *
8  *
9  * Copyright 2015 Vadim Nazarov <vdn at mailbox dot com>
10  *
11  * Licensed under the Apache License, Version 2.0 (the "License");
12  * you may not use this file except in compliance with the License.
13  * You may obtain a copy of the License at
14  *
15  * http://www.apache.org/licenses/LICENSE-2.0
16  *
17  * Unless required by applicable law or agreed to in writing, software
18  * distributed under the License is distributed on an "AS IS" BASIS,
19  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20  * See the License for the specific language governing permissions and
21  * limitations under the License.
22  */
23 
24 #ifndef _TOOLS_H_
25 #define _TOOLS_H_
26 
27 
28 #include <string>
29 #include <vector>
30 #include "math.h"
31 
32 #include "types.h"
33 //#include "repertoire.h"
34 
35 
36 namespace ymir {
37 
38 
39  inline void check_and_throw(bool condition, const std::string &error_msg) {
40 #ifndef DNDEBUG
41  if (condition) {
42  throw(std::runtime_error(error_msg));
43  }
44 #endif
45  }
46 
47 
48  inline uint8_t nuc_hash(char nuc) {
49  switch (nuc) {
50  case 'A': return 0;
51  case 'C': return 1;
52  case 'G': return 2;
53  case 'T': return 3;
54  default : return 4;
55  }
56  }
57 
58 
59  inline char inv_nuc_hash(uint8_t hash) {
60  switch (hash) {
61  case 0: return 'A';
62  case 1: return 'C';
63  case 2: return 'G';
64  case 3: return 'T';
65  default : return 4;
66  }
67  }
68 
69 
70  inline char complement(char nuc) {
71  switch (nuc) {
72  case 'A': return 'T';
73  case 'C': return 'G';
74  case 'G': return 'C';
75  case 'T': return 'A';
76  default : return NULL_CHAR;
77  }
78  }
79 
80 
81  std::string translate(const std::string &nuc_seq) {
82  return CodonTable::table().translate(nuc_seq);
83  }
84 
85 
86  prob_t loglikelihood(const std::vector<prob_t> &vec, prob_t laplace = 1e-80) {
87  prob_t res = 0;
88  for (size_t i = 0; i < vec.size(); ++i) {
89 // res += log10((vec[i] + laplace));
90 
91  if (vec[i] && !std::isnan(vec[i])) {
92  res += log10(vec[i]);
93  }
94 
95 // if ((vec[i] > 1e-60) && !std::isnan(vec[i])) {
96 // res += log10(vec[i]);
97 // }
98  } // what to do in case of high precision numbers?
99  // w/ laplace and w/o zero check -77603.267; -100500.399
100  // w/o laplace and w/ zero check -23443.267; -46340.3987
101  // w/o laplace and w/ eps check -23443.267; -46340.3987
102  return res;
103  }
104 
105 
106  void prob_summary(const std::vector<prob_t> &prob_vec, prob_t prev_ll = 0) {
107  size_t zeros = 0, negative = 0, bignums = 0, nans = 0;
108 
109  for (size_t i = 0; i < prob_vec.size(); ++i) {
110  zeros += prob_vec[i] ? 0 : 1;
111  negative += prob_vec[i] < 0 ? 1 : 0;
112  bignums += prob_vec[i] > 1 ? 1 : 0;
113  nans += std::isnan(prob_vec[i]) ? 1 : 0;
114  }
115 
116  std::cout << "Loglikelihood:\t" << std::setprecision(9) << loglikelihood(prob_vec);
117  if (prev_ll) {
118  if (std::abs(loglikelihood(prob_vec) - prev_ll) < 1e-5) { std::cout << " (exact)"; }
119  else if (loglikelihood(prob_vec) > prev_ll) { std::cout << " (grows)"; }
120  else { std::cout << " (drops)"; }
121  }
122  std::cout << std::endl;
123  std::cout << "Error probabilities:\t" << (size_t) (zeros + negative + bignums + nans) << std::endl;
124  if (zeros) std::cout << " Zeros: \t" << (size_t) zeros << std::endl;
125  if (nans) std::cout << " NaNs: \t" << (size_t) nans << std::endl;
126  if (negative) std::cout << " Negatives: \t" << (size_t) negative << std::endl;
127  if (bignums) std::cout << " Bigger than one: \t" << (size_t) bignums << std::endl;
128  }
129 
130 
131  inline bool is_out_of_frame(const std::string &sequence) {
132  return sequence.size() % 3 != 0;
133  }
134 
135 
136  inline bool has_end_codon(const std::string &sequence) {
137  for (size_t i = 0; i < sequence.size(); i += 3) {
138  if (sequence[i] == 'T'
139  && ((sequence[i+1] == 'A' && sequence[i+2] == 'G')
140  || (sequence[i+1] == 'A' && sequence[i+2] == 'A')
141  || (sequence[i+1] == 'G' && sequence[i+2] == 'A')))
142  {
143  return true;
144  }
145  }
146  return false;
147  }
148 
149  inline bool has_bad_aa_codons(const sequence_t &sequence) {
150  for (size_t i = 0; i < sequence.size(); ++i) {
151  if (sequence[i] == '~' || sequence[i] == '*') {
152  return true;
153  }
154  }
155  return false;
156  }
157 
158 
159  inline bool has_oof_aa_codon(const sequence_t &sequence) {
160  for (size_t i = 0; i < sequence.size(); ++i) {
161  if (sequence[i] == '~') {
162  return true;
163  }
164  }
165  return false;
166  }
167 
168 
169 }
170 
171 #endif
Definition: aligner.h:37