Ymir  .9
Fast\C++toolforcomputationofassemblingprobabilities,statisticalinferenceofassemblingstatisticalmodelandgenerationofartificialsequencesofT-cellreceptorsdata.
em_algorithm.h
1 //
2 // Created by Vadim N. on 11/03/2016.
3 //
4 
5 #ifndef YMIR_EM_ALGORITHM_H
6 #define YMIR_EM_ALGORITHM_H
7 
8 
9 #include "statisticalinferencealgorithm.h"
10 
11 
12 namespace ymir {
13 
14  class EMAlgorithm;
15 
16 
24  public:
25 
29  virtual std::vector<prob_t> statisticalInference(const ClonesetView &repertoire,
31  const AlgorithmParameters &algo_param = AlgorithmParameters().set("niter", 10).set("sample", 50000),
32  ErrorMode error_mode = NO_ERRORS) const {
33  cout << "Statistical inference on a PAM:\t" << model.name() << endl;
34  cout << "\tMurugan EM-algorithm.";
35  if (error_mode == COMPUTE_ERRORS) {
36  cout << "\t(with sequence errors)";
37  }
38  std::cout << std::endl;
39 
40  if (!algo_param.check("niter") && !algo_param.check("sample")) {
41  return std::vector<prob_t>();
42  }
43 
44 
45  size_t sample = algo_param["sample"].asUInt();
46  ClonesetView rep_nonc = repertoire.noncoding().sample(sample);
47  cout << "Number of noncoding clonotypes:\t" << (size_t) rep_nonc.size() << endl;
48 
49  std::vector<prob_t> logLvec;
50 
51 
52  ModelParameterVector new_param_vec = model.event_probabilities();
53  new_param_vec.fill(1);
54  new_param_vec.set_error_prob(.0003);
55  new_param_vec.normaliseEventFamilies();
56  model.updateModelParameterVector(new_param_vec);
57 
58  MAAGRepertoire maag_rep = model.buildGraphs(rep_nonc, SAVE_METADATA, error_mode, NUCLEOTIDE, true);
59 
60  vector<prob_t> prob_vec;
61 
62  prob_t prev_ll = 0, cur_ll = 0;
63 
64  cout << "Computing full assembling probabilities..." << endl;
65  vector<bool> good_clonotypes;
66  size_t removed, zero_prob, no_alignments;
67  this->filterOut(rep_nonc, maag_rep, prob_vec, good_clonotypes, removed, zero_prob, no_alignments);
68 
69  cout << endl << "Initial data summary:" << endl;
70  prob_summary(prob_vec);
71  std::cout << model.event_probabilities().error_prob() << std::endl;
72  prev_ll = loglikelihood(prob_vec);
73  logLvec.push_back(prev_ll);
74 
75  std::chrono::system_clock::time_point tp1, tp2;
77  for (size_t iter = 1; iter <= algo_param["niter"].asUInt(); ++iter) {
78  cout << endl << "Iteration:\t" << (size_t) iter << endl;
79 
80  new_param_vec.fill(0);
81 
82  tp1 = std::chrono::system_clock::now();
83  for (size_t i = 0; i < maag_rep.size(); ++i) {
84  if ((i+1) % 25000 == 0) {
85  cout << "Processed " << (int) i << " / " << (int) maag_rep.size() << " MAAGs.\t" << (std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()) - std::chrono::system_clock::to_time_t(tp1)) << " sec." << endl;
86  }
87  if (good_clonotypes[i]) {
88  if(!this->updateTempVec(fb, maag_rep[i], new_param_vec, error_mode)) {
89  cout << "bad maag:\t" << (int) i << endl;
90  }
91  }
92  }
93 
94  this->updateModel(model, new_param_vec, maag_rep, prob_vec, prev_ll, removed, error_mode);
95 
96  std::cout << new_param_vec.error_prob() << std::endl;
97 
98  logLvec.push_back(prev_ll);
99  }
100 
101  cout << endl << "Done. Resulting loglikelihood:\t" << loglikelihood(prob_vec) << endl << endl;
102 
103  return logLvec;
104  }
105 
106 
107  protected:
108 
109  bool updateTempVec(MAAGForwardBackwardAlgorithm &fb,
110  MAAG &maag,
111  ModelParameterVector &new_param_vec,
112  ErrorMode error_mode) const
113  {
114  if (!fb.process(maag, error_mode)) {
115  return false;
116  }
117 
118  while (!fb.is_empty()) {
119  event_pair_t ep = fb.nextEvent();
120  new_param_vec[ep.first] += ep.second;
121 
122  if (std::isnan(ep.second)) {
123  return false;
124  }
125  }
126 
127  if (error_mode) { new_param_vec.set_error_prob(new_param_vec.error_prob() + fb.err_prob()); }
128 
129  if (maag.is_vj()) {
130  new_param_vec[new_param_vec.event_index(VJ_VAR_JOI_INS_NUC, 0, 0)] += fb.VJ_nuc_probs()[0];
131  new_param_vec[new_param_vec.event_index(VJ_VAR_JOI_INS_NUC, 0, 1)] += fb.VJ_nuc_probs()[1];
132  new_param_vec[new_param_vec.event_index(VJ_VAR_JOI_INS_NUC, 0, 2)] += fb.VJ_nuc_probs()[2];
133  new_param_vec[new_param_vec.event_index(VJ_VAR_JOI_INS_NUC, 0, 3)] += fb.VJ_nuc_probs()[3];
134  } else {
135  int k = 0;
136  for (auto prev_nuc = 0; prev_nuc < 4; ++prev_nuc) {
137  for (auto next_nuc = 0; next_nuc < 4; ++next_nuc, ++k) {
138  new_param_vec[new_param_vec.event_index(VDJ_VAR_DIV_INS_NUC, prev_nuc, next_nuc)] += fb.VD_nuc_probs()[k];
139  }
140  }
141 
142  k = 0;
143  for (auto prev_nuc = 0; prev_nuc < 4; ++prev_nuc) {
144  for (auto next_nuc = 0; next_nuc < 4; ++next_nuc, ++k) {
145  new_param_vec[new_param_vec.event_index(VDJ_DIV_JOI_INS_NUC, prev_nuc, next_nuc)] += fb.DJ_nuc_probs()[k];
146  }
147  }
148  }
149 
150  return true;
151  }
152 
153 
154  void updateModel(ProbabilisticAssemblingModel &model,
155  ModelParameterVector &new_param_vec,
156  MAAGRepertoire &maag_rep,
157  vector<prob_t> &prob_vec,
158  prob_t &prev_ll,
159  size_t removed,
160  ErrorMode error_mode) const
161  {
162 // if (error_mode) { new_param_vec.set_error_prob(new_param_vec.error_prob() / (maag_rep.size() - removed)); }
163  new_param_vec.set_error_prob(.0003);
164 
165  new_param_vec.normaliseEventFamilies();
166 
167  model.updateModelParameterVector(new_param_vec);
168  model.updateEventProbabilities(&maag_rep);
169 
170  for (size_t i = 0; i < maag_rep.size(); ++i) {
171  prob_vec[i] = maag_rep[i].fullProbability();
172  }
173  prob_summary(prob_vec, prev_ll);
174  prev_ll = loglikelihood(prob_vec);
175  }
176 
177  };
178 
179 }
180 
181 #endif //YMIR_EM_ALGORITHM_H
Definition: aligner.h:37
Interface for algorithms for statistical inference of assembling model parameters.
Definition: statisticalinferencealgorithm.h:46
event_ind_t event_index(EventClass event_class, event_ind_t event_family, event_ind_t event_index) const
Definition: modelparametervector.h:261
Definition: maagforwardbackwardalgorithm.h:20
Multi-Alignment Assembly Graph - basic class for representing all possible generation scenarios of a ...
Definition: maag.h:46
const ModelParameterVector & event_probabilities() const
Access to vector of probabilities.
Definition: probabilisticassemblingmodel.h:185
void updateEventProbabilities(MAAGRepertoire *repertoire, bool verbose=true)
Update event probabilities in the given MAAG repertoire with new ones.
Definition: probabilisticassemblingmodel.h:143
Definition: statisticalinferencealgorithm.h:49
void normaliseEventFamilies()
Normalise each event family to have sum equal to 1.
Definition: modelparametervector.h:316
MAAGRepertoire buildGraphs(const ClonesetView &repertoire, MetadataMode save_metadata, ErrorMode error_mode, SequenceType sequence_type=NUCLEOTIDE, bool verbose=true) const
Build a set of MAAGs from the given cloneset.
Definition: probabilisticassemblingmodel.h:126
Class for storing parameters of assembling statistical model. Note: event with index 0 (zero) is "nul...
Definition: modelparametervector.h:68
Definition: probabilisticassemblingmodel.h:41
Definition: repertoire.h:51
void fill(prob_t val=0)
Fill the vector with the given value.
Definition: modelparametervector.h:377
void updateModelParameterVector(const ModelParameterVector &vec)
Set a new event probabilities vector to this model.
Definition: probabilisticassemblingmodel.h:199
Implementation of the EM-algorithm for statistical inference of assembling model parameters. Classic version described in (Murugan et al 2012)
Definition: em_algorithm.h:23