24 #ifndef _ASSEMBLINGSTATISTICALMODEL_H_ 25 #define _ASSEMBLINGSTATISTICALMODEL_H_ 28 #include <unordered_set> 30 #include "maagbuilder.h" 31 #include "repertoire.h" 32 #include "clonotypeassembler.h" 33 #include "model_parser.h" 55 _model_path = folderpath +
"/";
58 _recomb = UNDEF_RECOMB;
65 _parser.reset(
new VJModelParser(_model_path, _config, behav));
73 _status = _parser->parse();
77 cout <<
"Model is loaded successfully." << endl;
78 _parser->swap_genes(_genes);
79 _parser->swap_parameters(_param_vec);
83 cout <<
"Model is not loaded due to errors." << endl;
110 ErrorMode error_mode,
111 SequenceType sequence_type = NUCLEOTIDE,
112 MAAGComputeProbAction action = SUM_PROBABILITY)
const {
113 return _builder->buildAndCompute(repertoire, error_mode, sequence_type, action);
127 MetadataMode save_metadata,
128 ErrorMode error_mode,
129 SequenceType sequence_type = NUCLEOTIDE,
130 bool verbose =
true)
const {
133 throw(std::runtime_error(
"Can't build graphs due to a model's failed status!"));
136 return _builder->build(repertoire, save_metadata, error_mode, sequence_type, verbose);
144 _builder->updateEventProbabilities(repertoire, verbose);
159 throw(std::runtime_error(
"Can't generate sequences due to a model's failed status!"));
163 return _generator->generate(count, verbose);
175 throw(std::runtime_error(
"Can't access gene segments in a model due to its failed status!"));
188 throw(std::runtime_error(
"Can't access the event probabilities vector in a model due to its failed status!"));
201 if (_param_vec->recombination() != vec.recombination()) {
202 throw(std::runtime_error(
"Model parameter vectors are not comparable due to different recombination types!"));
205 if (_param_vec->size() != vec.size()) {
206 throw(std::runtime_error(
"Model parameter vectors are not comparable due to different sizes!"));
211 _builder->updateModelParameterVector(vec);
212 _generator->updateModelParameterVector(vec);
223 if (_recomb == VJ_RECOMB) {
225 _param_vec->familyFill(VJ_VAR_JOI_GEN, 0);
226 for (
size_t i = 0; i < nonc.size(); ++i) {
227 uint n_vj = nonc[i].nVar() * nonc[i].nJoi();
228 for (seg_index_t v_i = 0; v_i < nonc[i].nVar(); ++v_i) {
229 for (seg_index_t j_i = 0; j_i < nonc[i].nJoi(); ++j_i) {
230 (*_param_vec)[_param_vec->event_index(VJ_VAR_JOI_GEN,
232 nonc[i].getVar(v_i) - 1,
233 nonc[i].getJoi(j_i) - 1)] += 1. / n_vj;
237 _param_vec->normaliseEventClass(VJ_VAR_JOI_GEN);
238 }
else if (_recomb == VDJ_RECOMB) {
240 _param_vec->familyFill(VDJ_VAR_GEN, 0);
241 for (
size_t i = 0; i < nonc.size(); ++i) {
242 for (seg_index_t v_i = 0; v_i < nonc[i].nVar(); ++v_i) {
243 (*_param_vec)[_param_vec->event_index(VDJ_VAR_GEN,
245 nonc[i].getVar(v_i) - 1)] += 1. / nonc[i].nVar();
248 _param_vec->normaliseEventClass(VDJ_VAR_GEN);
251 _param_vec->familyFill(VDJ_JOI_DIV_GEN, 0);
252 for (
size_t i = 0; i < nonc.size(); ++i) {
253 uint n_jd = nonc[i].nJoi() * nonc[i].nDiv();
254 for (seg_index_t j_i = 0; j_i < nonc[i].nJoi(); ++j_i) {
255 for (seg_index_t d_i = 0; d_i < nonc[i].nDiv(); ++d_i) {
256 (*_param_vec)[_param_vec->event_index(VDJ_JOI_DIV_GEN,
258 nonc[i].getJoi(j_i) - 1,
259 nonc[i].getDiv(d_i) - 1)] += 1. / n_jd;
263 _param_vec->normaliseEventClass(VDJ_JOI_DIV_GEN);
275 bool save(
const string& folderpath)
const {
277 ofs.open(folderpath +
"/model.json");
284 if (_recomb == VJ_RECOMB) {
285 this->save_vj(folderpath);
286 }
else if (_recomb == VDJ_RECOMB) {
287 this->save_vdj(folderpath);
289 std::cout <<
"[ERROR] Can't save a model with an undefined recombination type." << std::endl;
296 std::cout <<
"[ERROR] Problem with saving a .json model file: probably there is no such directory: " << folderpath << std::endl;
309 Recombination recombination()
const {
return _recomb; }
312 string name()
const {
return _config.get(
"name",
"Nameless model").asString(); }
316 bool _status, _verbose;
317 ModelBehaviour _behaviour;
320 Recombination _recomb;
322 unique_ptr<ModelParser> _parser;
324 unique_ptr<VDJRecombinationGenes> _genes;
325 unique_ptr<ModelParameterVector> _param_vec;
327 unique_ptr<MAAGBuilder> _builder;
328 unique_ptr<ClonotypeAssembler> _generator;
330 seq_len_t _min_D_len;
343 string jsonpath = folderpath +
"/model.json";
350 if (_config.get(
"recombination",
"undefined").asString() ==
"VJ") {
353 }
else if (_config.get(
"recombination",
"undefined").asString() ==
"VDJ") {
354 _recomb = VDJ_RECOMB;
357 _recomb = UNDEF_RECOMB;
361 cout <<
"Probabilistic assembling model:\n\t" <<
362 _config.get(
"name",
"Nameless model").asString() <<
364 _config.get(
"comment",
"").asString() <<
366 "Path : " << folderpath <<
368 "Specimen : " << _config.get(
"specimen",
"hobbit").asString() <<
370 _config.get(
"recombination",
"undefined").asString() <<
371 "-recombination | " <<
372 ((_config.get(
"errors", 0).asDouble() != 0) ?
"Sequence error model" :
"No sequence error model") <<
373 endl <<
"\tFiles:"<< endl;
377 std::cout <<
"[ERROR] Probabilistic assembling model error:" << endl <<
"\t config .json file not found at [" << jsonpath <<
"]" << std::endl;
386 _builder.reset(
new MAAGBuilder(*_param_vec, *_genes));
398 void save_vj(
const string &folderpath)
const {
401 _genes->write(folderpath + _config.get(
"segments", Json::Value(
"")).get(
"variable", Json::Value(
"")).get(
"file",
"vsegments.txt").asString(),
402 folderpath + _config.get(
"segments", Json::Value(
"")).get(
"joining", Json::Value(
"")).get(
"file",
"jsegments.txt").asString());
406 container->addMetadata(_genes->J().max());
407 container->addColumnName(
"V / J");
408 for (
auto i = 1; i <= _genes->V().max(); ++i){
409 container->addRowName(_genes->V()[i].allele);
411 for (
auto i = 1; i <= _genes->J().max(); ++i){
412 container->addColumnName(_genes->J()[i].allele);
414 container->addDataVector(_param_vec->get_iterator(1),
415 _param_vec->get_iterator(_param_vec->eventClassSize(VJ_VAR_JOI_GEN) + 1));
416 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"v.j", Json::Value()).get(
"file", .0).asString());
422 for (
auto i = 0; i <= _genes->V().maxLength(); ++i) {
423 container->addRowName(std::to_string(i));
425 container->addColumnName(
"V deletions");
427 for (
auto i = 1; i <= _genes->V().max(); ++i) {
428 container->addColumnName(_genes->V()[i].allele);
429 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VJ_VAR_DEL, i - 1, 0)),
430 _param_vec->get_iterator(_param_vec->event_index(VJ_VAR_DEL, i - 1, 0) + _param_vec->eventFamilySize(VJ_VAR_DEL, i - 1)));
433 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"v.del", Json::Value()).get(
"file", .0).asString());
439 for (
auto i = 0; i <= _genes->J().maxLength(); ++i) {
440 container->addRowName(std::to_string(i));
442 container->addColumnName(
"J deletions");
443 for (
auto i = 1; i <= _genes->J().max(); ++i){
444 container->addColumnName(_genes->J()[i].allele);
445 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VJ_JOI_DEL, i - 1, 0)),
446 _param_vec->get_iterator(_param_vec->event_index(VJ_JOI_DEL, i - 1, 0) + _param_vec->eventFamilySize(VJ_JOI_DEL, i - 1)));
448 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"j.del", Json::Value()).get(
"file", .0).asString());
453 container->addColumnName(
"VJ ins len");
454 container->addColumnName(
"Probability");
455 for (
auto i = 0; i < _param_vec->eventClassSize(VJ_VAR_JOI_INS_LEN); ++i) {
456 container->addRowName(std::to_string(i));
458 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VJ_VAR_JOI_INS_LEN, 0, 0)),
459 _param_vec->get_iterator(_param_vec->event_index(VJ_VAR_JOI_INS_LEN, 0, 0) + _param_vec->eventFamilySize(VJ_VAR_JOI_INS_LEN, 0)));
460 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"ins.len", Json::Value()).get(
"file", .0).asString());
465 container->addColumnName(
"VJ nucs");
466 container->addColumnName(
"Probability");
467 container->addRowName(
"A"); container->addRowName(
"C"); container->addRowName(
"G"); container->addRowName(
"T");
468 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VJ_VAR_JOI_INS_NUC, 0, 0)),
469 _param_vec->get_iterator(_param_vec->event_index(VJ_VAR_JOI_INS_NUC, 0, 0) + 4));
470 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"ins.nucl", Json::Value()).get(
"file", .0).asString());
476 void save_vdj(
const string &folderpath)
const {
479 _genes->write(folderpath + _config.get(
"segments", Json::Value(
"")).get(
"variable", Json::Value(
"")).get(
"file",
"vsegments.txt").asString(),
480 folderpath + _config.get(
"segments", Json::Value(
"")).get(
"joining", Json::Value(
"")).get(
"file",
"jsegments.txt").asString(),
481 folderpath + _config.get(
"segments", Json::Value(
"")).get(
"diversity", Json::Value(
"")).get(
"file",
"dsegments.txt").asString());
485 container->addColumnName(
"V");
486 container->addColumnName(
"Probability");
487 container->addDataVector(vector<prob_t>());
488 for (
auto i = 1; i <= _genes->V().max(); ++i) {
489 container->addRowName(_genes->V()[i].allele);
490 container->addDataValue(_param_vec->event_prob(VDJ_VAR_GEN, 0, i - 1));
492 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"v", Json::Value()).get(
"file", .0).asString());
497 container->addMetadata(_genes->D().max());
498 container->addColumnName(
"D / J");
499 for (
auto i = 1; i <= _genes->J().max(); ++i) {
500 container->addRowName(_genes->J()[i].allele);
502 for (
auto i = 1; i <= _genes->D().max(); ++i) {
503 container->addColumnName(_genes->D()[i].allele);
505 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_JOI_DIV_GEN, 0, 0)),
506 _param_vec->get_iterator(_param_vec->eventClassSize(VDJ_JOI_DIV_GEN) + _param_vec->event_index(VDJ_JOI_DIV_GEN, 0, 0)));
507 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"j.d", Json::Value()).get(
"file", .0).asString());
512 for (
auto i = 0; i <= _genes->V().maxLength(); ++i) {
513 container->addRowName(std::to_string(i));
515 container->addColumnName(
"V deletions");
516 for (
auto i = 1; i <= _genes->V().max(); ++i) {
517 container->addColumnName(_genes->V()[i].allele);
518 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DEL, i - 1, 0)),
519 _param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DEL, i - 1, 0) + _param_vec->eventFamilySize(VDJ_VAR_DEL, i - 1)));
521 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"v.del", Json::Value()).get(
"file", .0).asString());
526 for (
auto i = 0; i <= _genes->J().maxLength(); ++i) {
527 container->addRowName(std::to_string(i));
529 container->addColumnName(
"J deletions");
530 for (
auto i = 1; i <= _genes->J().max(); ++i){
531 container->addColumnName(_genes->J()[i].allele);
532 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_JOI_DEL, i - 1, 0)),
533 _param_vec->get_iterator(_param_vec->event_index(VDJ_JOI_DEL, i - 1, 0) + _param_vec->eventFamilySize(VDJ_JOI_DEL, i - 1)));
535 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"j.del", Json::Value()).get(
"file", .0).asString());
540 for (
auto i = 1; i <= _genes->D().max(); ++i) {
541 container->addRowName(_genes->D()[i].allele);
542 container->addMetadata(_param_vec->n_columns(VDJ_DIV_DEL, i - 1));
543 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_DEL, i - 1, 0, 0)),
544 _param_vec->get_iterator(_param_vec->eventFamilySize(VDJ_DIV_DEL, i - 1) + _param_vec->event_index(VDJ_DIV_DEL, i - 1, 0, 0)));
546 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"d.del", Json::Value()).get(
"file", .0).asString());
551 container->addColumnName(
"Insertion length");
552 container->addColumnName(
"VD ins");
553 container->addColumnName(
"DJ ins");
554 for (
auto i = 0; i < _param_vec->eventClassSize(VDJ_VAR_DIV_INS_LEN); ++i) {
555 container->addRowName(std::to_string(i));
557 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_LEN, 0, 0)),
558 _param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_LEN, 0, 0) + _param_vec->eventFamilySize(VDJ_VAR_DIV_INS_LEN, 0)));
559 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_LEN, 0, 0)),
560 _param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_LEN, 0, 0) + _param_vec->eventFamilySize(VDJ_DIV_JOI_INS_LEN, 0)));
561 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"ins.len", Json::Value()).get(
"file", .0).asString());
566 container->addColumnName(
"Prev / Next");
567 container->addColumnName(
"VD.A"); container->addColumnName(
"VD.C"); container->addColumnName(
"VD.G"); container->addColumnName(
"VD.T");
568 container->addColumnName(
"DJ.A"); container->addColumnName(
"DJ.C"); container->addColumnName(
"DJ.G"); container->addColumnName(
"DJ.T");
569 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_NUC_A, 0, 0)),
570 _param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_NUC_A, 0, 0) + 4));
571 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_NUC_C, 0, 0)),
572 _param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_NUC_C, 0, 0) + 4));
573 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_NUC_G, 0, 0)),
574 _param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_NUC_G, 0, 0) + 4));
575 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_NUC_T, 0, 0)),
576 _param_vec->get_iterator(_param_vec->event_index(VDJ_VAR_DIV_INS_NUC_T, 0, 0) + 4));
577 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_NUC_A, 0, 0)),
578 _param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_NUC_A, 0, 0) + 4));
579 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_NUC_C, 0, 0)),
580 _param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_NUC_C, 0, 0) + 4));
581 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_NUC_G, 0, 0)),
582 _param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_NUC_G, 0, 0) + 4));
583 container->addDataVector(_param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_NUC_T, 0, 0)),
584 _param_vec->get_iterator(_param_vec->event_index(VDJ_DIV_JOI_INS_NUC_T, 0, 0) + 4));
585 container->write(folderpath + _config.get(
"probtables", Json::Value()).get(
"ins.nucl", Json::Value()).get(
"file", .0).asString());
Definition: maagbuilder.h:56
ProbabilisticAssemblingModel(const string &folderpath, ModelBehaviour behav=PREDEFINED)
Model constructor from the given folder with a model's parameters.
Definition: probabilisticassemblingmodel.h:54
bool status() const
Return if the model is in the ready-to-work state.
Definition: probabilisticassemblingmodel.h:306
Definition: textdata.h:540
Vector of gene segments.
Definition: textdata.h:138
void updateGeneUsage(const ClonesetView &cloneset)
Given the cloneset, compute its gene usage on out-of-frames and update the event probability vector w...
Definition: probabilisticassemblingmodel.h:220
Definition: genesegment.h:265
Definition: model_parser.h:574
const ModelParameterVector & event_probabilities() const
Access to vector of probabilities.
Definition: probabilisticassemblingmodel.h:185
Definition: textdata.h:28
void updateEventProbabilities(MAAGRepertoire *repertoire, bool verbose=true)
Update event probabilities in the given MAAG repertoire with new ones.
Definition: probabilisticassemblingmodel.h:143
Definition: textdata.h:401
void parseModelConfig(const string &folderpath)
Parse JSON file with model parameters.
Definition: probabilisticassemblingmodel.h:342
MAAGRepertoire buildGraphs(const ClonesetView &repertoire, MetadataMode save_metadata, ErrorMode error_mode, SequenceType sequence_type=NUCLEOTIDE, bool verbose=true) const
Build a set of MAAGs from the given cloneset.
Definition: probabilisticassemblingmodel.h:126
Definition: model_parser.h:343
ProbabilisticAssemblingModel()
Private default constructor.
Definition: probabilisticassemblingmodel.h:336
bool save(const string &folderpath) const
Save the model to the given folder.
Definition: probabilisticassemblingmodel.h:275
Class for storing parameters of assembling statistical model. Note: event with index 0 (zero) is "nul...
Definition: modelparametervector.h:68
List of std::vectors for deletions and insertions.
Definition: textdata.h:258
Definition: probabilisticassemblingmodel.h:41
const VDJRecombinationGenes & gene_segments() const
Access to the gene segments table.
Definition: probabilisticassemblingmodel.h:172
Definition: repertoire.h:171
void make_assembler()
Make ClonotypeAssembler with this model's parameters (event probabilities and gene segments)...
Definition: probabilisticassemblingmodel.h:393
Definition: repertoire.h:51
void make_builder()
Make MAAGBuilder with this model's parameters (event probabilities and gene segments).
Definition: probabilisticassemblingmodel.h:385
vector< prob_t > computeFullProbabilities(const ClonesetView &repertoire, ErrorMode error_mode, SequenceType sequence_type=NUCLEOTIDE, MAAGComputeProbAction action=SUM_PROBABILITY) const
Compute full assembling probabilities of clonotypes from the given repertoire. If some clonotypes hav...
Definition: probabilisticassemblingmodel.h:109
Cloneset generateSequences(size_t count=1, bool verbose=true) const
Generate artificial repertoire of sequences using this model.
Definition: probabilisticassemblingmodel.h:156
void updateModelParameterVector(const ModelParameterVector &vec)
Set a new event probabilities vector to this model.
Definition: probabilisticassemblingmodel.h:199
Definition: clonotypeassembler.h:17