LocARNA-1.8.11
exact_matcher.hh
1 #ifndef EXACT_MATCHER_HH
2 #define EXACT_MATCHER_HH
3 
4 #ifdef HAVE_CONFIG_H
5 # include <config.h>
6 #endif
7 
8 #include <iostream>
9 #include <sstream>
10 #include <list>
11 #include <algorithm>
12 #include <limits>
13 #include <iterator>
14 #include "trace_controller.hh"
15 #include "sparsification_mapper.hh"
16 #include "tuples.hh"
17 #include "scoring.hh"
18 #include "ext_rna_data.hh"
19 #include "aux.hh"
20 
21 extern "C" {
22 # include <ViennaRNA/fold_vars.h>
23 # include <ViennaRNA/utils.h>
24 # include <ViennaRNA/PS_dot.h>
25 # include <ViennaRNA/fold.h>
26  int PS_rna_plot(char *string, char *structure, char *file);
27  int PS_rna_plot_a(char *string, char *structure, char *file, char *pre, char *post);
28  float fold(const char *sequence, char *structure);
29 }
30 
31 namespace LocARNA {
32 
33  typedef size_t size_type;
34  typedef std::vector<unsigned int> intVec;
35  typedef std::pair<unsigned int,unsigned int> intPair;
36  typedef std::pair<intPair, intPair> intPPair;
37  typedef const intPPair* intPPairPTR;
38  typedef std::vector<intPPair>::const_iterator IntPPairCITER;
39 
40 
45  {
46  public:
47  SinglePattern(){};
48 
55  SinglePattern(const std::string& myId_,const std::string& seqId_,const intVec& mySinglePattern_)
56  :myId(myId_),seqId(seqId_),pattern(mySinglePattern_)
57  {};
58 
62  virtual ~SinglePattern() { pattern.clear(); };
63 
68  const std::string& getmyId() const { return myId; };
69 
74  const std::string& getseqId() const {return seqId; };
75 
80  const intVec& getPat() const { return pattern; };
81 
82  private:
83 
84  std::string myId;
85  std::string seqId;
86  intVec pattern;
87  };
88 
93  {
94  public:
95  PatternPair(){};
96 
105  PatternPair(const std::string& myId,const SinglePattern& myFirstPat,const SinglePattern& mySecPat, const std::string& structure_, int& score_)
106  : id(myId),first(myFirstPat),second(mySecPat), structure(structure_), EPMscore(score_)
107  {
108  if (first.getPat().size() != second.getPat().size()){
109  std::cerr << "Error! PatternPair cannot be constructed due to different sizes of SinglePatterns!" << std::endl;
110  }
111  score = EPMscore;
112  size = first.getPat().size();
113  };
114 
116  virtual ~PatternPair()
117  {
118  insideBounds.clear();
119  };
120 
125  const std::string& getId() const { return id; };
126 
131  const int& getSize() const { return size; };
132 
137  const SinglePattern& getFirstPat() const { return first; };
138 
143  const SinglePattern& getSecPat() const { return second;};
144 
146  void resetBounds();
147 
152  void setOutsideBounds(intPPair myPPair);
153 
158  const intPPair getOutsideBounds() const { return outsideBounds; };
159 
161  void addInsideBounds(intPPair myPPair);
162 
167  const std::vector<intPPair>& getInsideBounds() const { return insideBounds; };
168 
173  void setEPMScore(int myScore);
174 
179  const int getScore() const { return score; };
180 
185  const int getEPMScore() const { return EPMscore; };
186 
191  const std::string& get_struct() const{return structure;};
192 
193  private:
194  std::string id;
195  int size;
196  SinglePattern first;
197  SinglePattern second;
198 
199  std::string structure;
200  int score;
201  int EPMscore;
202  std::vector<intPPair> insideBounds;
203  intPPair outsideBounds;
204  };
205 
210  {
211  public:
214 
215  typedef std::multimap<int,SelfValuePTR,std::greater<int> > orderedMapTYPE;
216  typedef orderedMapTYPE::const_iterator orderedMapCITER;
217  typedef orderedMapTYPE::iterator orderedMapITER;
218  typedef std::list<SelfValuePTR> patListTYPE;
219  typedef patListTYPE::iterator patListITER;
220  typedef patListTYPE::const_iterator patListCITER;
222 
223 
225  PatternPairMap();
226 
229  PatternPairMap(const PatternPairMap& myPairMap)
230  :patternList(myPairMap.patternList),
231  patternOrderedMap(myPairMap.patternOrderedMap),
232  idMap(myPairMap.idMap) { minPatternSize = 100000;};
233 
235  virtual ~PatternPairMap();
236 
245  void add( const std::string& id,
246  const SinglePattern& first,
247  const SinglePattern& second,
248  const std::string& structure,
249  int score
250  );
251 
256  void add(const SelfValuePTR value);
257 
259  void makeOrderedMap();
260 
262  void updateFromMap();
263 
269  const PatternPair& getPatternPair(const std::string& id)const;
270 
276  const SelfValuePTR getPatternPairPTR(const std::string& id)const;
277 
282  const patListTYPE& getList() const;
283 
288  const orderedMapTYPE& getOrderedMap() const;
289 
294  orderedMapTYPE& getOrderedMap2();
295 
300  const int size() const;
301 
306  int getMapBases();
307 
312  int getMapEPMScore();
313 
318  const int getMinPatternSize() const { return minPatternSize; };
319 
320  private:
321 
322  patListTYPE patternList;
323  orderedMapTYPE patternOrderedMap;
324  PatternIdMapTYPE idMap;
325  int minPatternSize;
326  };
327 
334  std::ostream &operator << (std::ostream &out, const PatternPairMap::patListTYPE &pat_pair_map);
335 
339  class LCSEPM
340  {
341  public:
342 
350  LCSEPM(const Sequence& seqA_,
351  const Sequence& seqB_,
352  const PatternPairMap& myPatterns,
353  PatternPairMap& myLCSEPM
354  )
355 
356  :seqA(seqA_),
357  seqB(seqB_),
358  matchedEPMs(myLCSEPM),
359  patterns(myPatterns)
360  {};
361 
363  virtual ~LCSEPM();
364 
373  void MapToPS(const std::string& sequenceA, const std::string& sequenceB, PatternPairMap& myMap, const std::string& file1, const std::string& file2);
374 
376  void calculateLCSEPM(bool quiet);
377 
379  std::pair<SequenceAnnotation,SequenceAnnotation>
380  anchor_annotation();
381 
383  void output_locarna(const std::string& sequenceA, const std::string& sequenceB, const std::string& outfile);
384 
386  void output_clustal(const std::string& outfile_name);
387 
388  private:
389 
390  struct HoleCompare2 {
391  bool operator()(const intPPairPTR & h1, const intPPairPTR & h2) const {
392  // first compare size of holes
393  if (h1->first.second - h1->first.first-1 < h2->first.second - h2->first.first-1){
394  return true; }
395  // compare if holes are identical in both structures
396  if (h1->first.second - h1->first.first-1 == h2->first.second - h2->first.first-1){
397  if ((h1->first.first == h2->first.first) && (h1->first.second == h2->first.second) &&
398  (h1->second.first==h2->second.first) && (h1->second.second==h2->second.second))
399  { return true; }
400  }
401 
402  return false;
403  }
404  };
405 
406  typedef std::multimap<intPPairPTR,PatternPairMap::SelfValuePTR,HoleCompare2> HoleOrderingMapTYPE2;
407  typedef HoleOrderingMapTYPE2::const_iterator HoleMapCITER2;
408 
409 
410  void preProcessing ();
411  void calculateHoles3 (bool quiet);
412  void calculatePatternBoundaries (PatternPair* myPair);
413  void calculateTraceback2 (const int i,const int j,const int k,const int l,std::vector < std::vector<int> > holeVec);
414  int D_rec2 (const int& i,const int& j,const int& k,const int& l,std::vector < std::vector<int> >& D_h,const bool debug);
415 
416  int max3 (int a, int b, int c)
417  {
418  int tmp = a>b? a:b;
419  return (tmp>c? tmp:c);
420  };
421 
423  char* getStructure(PatternPairMap& myMap, bool firstSeq, int length);
424 
425  std::string intvec2str(const std::vector<unsigned int>& V, const std::string &delim){
426  std::stringstream oss;
427  copy(V.begin(), V.end(), std::ostream_iterator<unsigned int>(oss, delim.c_str()));
428  std::string tmpstr;
429  tmpstr = oss.str();
430  if (tmpstr.length()>0) tmpstr.erase(tmpstr.end()-1);
431  return tmpstr;
432  }
433 
434  std::string upperCase(const std::string &seq){
435  std::string s= "";
436  for(unsigned int i= 0; i<seq.length(); i++)
437  s+= toupper(seq[i]);
438  return s;
439  }
440 
441  std::vector< std::vector <std::vector<PatternPairMap::SelfValuePTR> > > EPM_Table2;
442  HoleOrderingMapTYPE2 holeOrdering2;
443  const Sequence& seqA;
444  const Sequence& seqB;
445  PatternPairMap& matchedEPMs;
446  const PatternPairMap& patterns;
447  };
448 
449 
454 
455  private:
456 
457  typedef SparsificationMapper::matidx_t matidx_t;
458  typedef SparsificationMapper::seq_pos_t seqpos_t;
459  typedef SparsificationMapper::index_t index_t;
460 
461  public:
462 
463  typedef std::pair<matidx_t,matidx_t> matpos_t;
464  typedef std::pair<seqpos_t,seqpos_t> pair_seqpos_t;
465 
466  private:
467  const SparsificationMapper &sparse_mapperA;
468  const SparsificationMapper &sparse_mapperB;
469 
470  public:
478  SparseTraceController(const SparsificationMapper &sparse_mapperA_,const SparsificationMapper &sparse_mapperB_,const TraceController &trace_controller_):
479  TraceController::TraceController(trace_controller_),
480  sparse_mapperA(sparse_mapperA_),
481  sparse_mapperB(sparse_mapperB_)
482 
483  {
484 
485  }
486 
487  virtual ~SparseTraceController(){};
488 
491  return sparse_mapperA;
492  }
493 
496  return sparse_mapperB;
497  }
498 
499 
509  matidx_t min_col_idx(index_t indexA, index_t indexB, matidx_t idx_i,
510  index_t left_endB = std::numeric_limits<index_t>::max()) const{
511 
512  seqpos_t i = sparse_mapperA.get_pos_in_seq_new(indexA,idx_i);
513  return sparse_mapperB.idx_geq(indexB,min_col(i),left_endB);
514  }
515 
525  matidx_t idx_after_max_col_idx(index_t indexA, index_t indexB, matidx_t idx_i,
526  index_t left_endB = std::numeric_limits<index_t>::max()) const{
527 
528  seqpos_t i = sparse_mapperA.get_pos_in_seq_new(indexA,idx_i);
529  return sparse_mapperB.idx_after_leq(indexB,max_col(i),left_endB);
530  }
531 
544  matpos_t diag_pos_bef(index_t indexA, index_t indexB,pair_seqpos_t cur_pos_seq,
545  index_t left_endA = std::numeric_limits<index_t>::max(), index_t left_endB = std::numeric_limits<index_t>::max())const{
546 
547  bool debug_valid_mat_pos = false;
548 
549  if(debug_valid_mat_pos) std::cout << "first valid mat pos before with tc " << std::endl;
550 
551  seqpos_t i = cur_pos_seq.first;
552  seqpos_t j = cur_pos_seq.second;
553 
554  matidx_t idx_after_max_col;
555 
556  // find valid matrix position based on the SparsificationMapper
557  matidx_t cur_row = sparse_mapperA.first_valid_mat_pos_before(indexA,i,left_endA);
558  matidx_t col_before = sparse_mapperB.first_valid_mat_pos_before(indexB,j,left_endB);
559 
560  //bool valid_pos_found = false;
561 
562  // find a valid position that is valid also based on the TraceController
563  // go through the rows and find an interval that includes the column col_before or lies
564  // before the column col_before
565  for(;;--cur_row){
566 
567  matidx_t min_col = min_col_idx(indexA,indexB,cur_row,left_endB);
568  idx_after_max_col = idx_after_max_col_idx(indexA,indexB,cur_row,left_endB);
569 
570  if(debug_valid_mat_pos) std::cout << "interval " << min_col << "," << idx_after_max_col << std::endl;
571 
572  // valid interval found
573  if(min_col<idx_after_max_col && min_col<=col_before){
574  //valid_pos_found=true;
575  break;
576  }
577 
578  if(cur_row==0){
579  break;
580  }
581 
582  }
583 
584  //assert(valid_pos_found);
585  assert(idx_after_max_col>0);
586 
587  matidx_t max_col = idx_after_max_col-1;
588 
589  // the column of the new position is the col_before or lies before it
590  matpos_t result = matpos_t(cur_row,std::min(max_col,col_before));
591 
592  assert(is_valid_idx_pos(indexA,indexB,result));
593 
594  return result;
595 
596  }
597 
606  pair_seqpos_t pos_in_seq(index_t idxA, index_t idxB,//const Arc &a, const Arc &b,
607  const matpos_t &cur_pos) const{
608 
609  return pair_seqpos_t(sparse_mapperA.get_pos_in_seq_new(idxA,cur_pos.first),
610  sparse_mapperB.get_pos_in_seq_new(idxB,cur_pos.second));
611  }
612 
624  bool matching_wo_gap(index_t idxA, index_t idxB,
625  matpos_t idx_pos_diag, pair_seqpos_t seq_pos_to_be_matched) const{
626  pair_seqpos_t pos_diag = pos_in_seq(idxA,idxB,idx_pos_diag);
627  return (pos_diag.first+1 == seq_pos_to_be_matched.first) && (pos_diag.second+1==seq_pos_to_be_matched.second);
628  }
629 
636  bool pos_unpaired(index_t idxA, index_t idxB,
637  matpos_t pos) const{
638  return sparse_mapperA.pos_unpaired(idxA,pos.first)
639  && sparse_mapperB.pos_unpaired(idxB,pos.second);
640  }
641 
649  bool is_valid_idx_pos(index_t idxA, index_t idxB,
650  matpos_t mat_pos) const{
651  pair_seqpos_t seq_pos = pos_in_seq(idxA,idxB,mat_pos);
652  return is_valid(seq_pos.first,seq_pos.second);
653  }
654 
655  };
656 
657 
661  class EPM{
662 
663  public:
664  typedef BasePairs__Arc Arc;
665 
670  typedef std::pair<ArcIdx,ArcIdx> PairArcIdx;
671  typedef std::vector<PairArcIdx> PairArcIdxVec;
672 
675 
676  typedef std::vector<el_pat_vec> pat_vec_t;
677 
678  private:
679 
680  pat_vec_t pat_vec;
681 
682  score_t score;
683  int state;
684  matpos_t cur_pos;
685  score_t max_tol_left;
686  bool first_insertion;
687  bool invalid;
688 
690  PairArcIdxVec am_to_do;
691 
693  struct compare_el_pat_vec {
694  public:
695  bool
696  operator () (const EPM::el_pat_vec &el1,const EPM::el_pat_vec &el2)const {
697  seqpos_t el1_pos1 = el1.first;
698  seqpos_t el1_pos2 = el1.second;
699  seqpos_t el2_pos1 = el2.first;
700  seqpos_t el2_pos2 = el2.second;
701  char el1_struc = el1.third;
702  char el2_struc = el2.third;
703  return (el1_pos1 < el2_pos1) || (el1_pos1==el2_pos1 && el1_pos2 < el2_pos2)
704  || (el1_pos1==el2_pos1 && el1_pos2 == el2_pos2 && el1_struc<el2_struc);
705  }
706 
707  };
708 
709  struct compare_el_am_to_do {
710  public:
711  bool
712  operator () (const EPM::PairArcIdx &el1, const EPM::PairArcIdx &el2)const {
713  return (el1.first<el2.first) || ((el1.first==el2.first) && el1.second < el2.second);
714  }
715  };
716 
717  public:
718 
720  EPM():
721  score(0),
722  state(0),
723  cur_pos(matpos_t(0,0)),
724  max_tol_left(0),
725  first_insertion(true),
726  invalid(false)
727  {}
728 
729  virtual ~EPM(){}
730 
731  //-----------------------------------------------------------------------
732  // getter methods
733  //-----------------------------------------------------------------------
734 
736  score_t get_score() const{return score;}
737 
739  int get_state() const{return state;}
740 
742  const matpos_t & get_cur_pos() const{return cur_pos;}
743 
745  const score_t & get_max_tol_left() const{return max_tol_left;}
746 
748  bool get_first_insertion() const{return first_insertion;}
749 
752  bool is_invalid() const{
753  return invalid;
754  }
755 
756  //-----------------------------------------------------------------------
757  // setter methods
758  //-----------------------------------------------------------------------
759 
764  void set_score(score_t score_){ score=score_;}
765 
770  void set_state(int state_){ state=state_;}
771 
776  void set_cur_pos(matpos_t cur_pos_){cur_pos = cur_pos_;}
777 
782  void set_max_tol_left(score_t tol){max_tol_left=tol;}
783 
788  void set_first_insertion(bool first_insertion_){first_insertion=first_insertion_;}
789 
791  void set_invalid(){
792  invalid = true;
793  }
794 
800  const PairArcIdx& get_am(PairArcIdxVec::size_type idx) const{
801  assert(idx<am_to_do.size());
802  return am_to_do[idx];
803  }
804 
809  PairArcIdxVec::size_type number_of_am(){return am_to_do.size();}
810 
812  void clear_am_to_do(){am_to_do.clear();}
813 
818  PairArcIdxVec::const_iterator am_begin() const {return am_to_do.begin();}
819 
824  PairArcIdxVec::const_iterator am_end() const {return am_to_do.end();}
825 
831  el_pat_vec pat_vec_at(pat_vec_t::size_type idx) const{
832  assert(idx<pat_vec.size());
833  return pat_vec[idx];
834  }
835 
840  pat_vec_t::size_type pat_vec_size() const{return pat_vec.size();}
841 
846  pat_vec_t::const_iterator begin() const{return pat_vec.begin();}
847 
852  pat_vec_t::const_iterator end() const{return pat_vec.end();}
853 
854 
859  pair_seqpos_t last_matched_pos(){
860  assert(!pat_vec.empty());
861  return pair_seqpos_t(pat_vec.back().first,pat_vec.back().second);
862  }
863 
870  void add(seqpos_t posA,seqpos_t posB,char c){pat_vec.push_back(el_pat_vec(posA,posB,c));}
871 
879  void overwrite(seqpos_t posA,seqpos_t posB,char c,pat_vec_t::size_type pos){
880  if(pat_vec.size()<=pos){pat_vec.push_back(el_pat_vec(posA,posB,c));}
881  pat_vec.at(pos)=el_pat_vec(posA,posB,c);
882  }
883 
889  void add_am(const Arc &a, const Arc &b){
890  add(a.right(),b.right(),')');
891  add(a.left(),b.left(),'(');
892  }
893 
899  void store_am(const Arc &a, const Arc &b){
900  const PairArcIdx &pair_arc_idx = PairArcIdx(a.idx(),b.idx());
901  //store the pair of arc indices in the am_to_do datastructure
902  am_to_do.push_back(pair_arc_idx);
903  }
904 
909  PairArcIdx next_arcmatch(){
910  PairArcIdx arc_idx = am_to_do.back();
911  am_to_do.pop_back();
912  return arc_idx;
913  }
914 
919  void sort_patVec(){sort(pat_vec.begin(), pat_vec.end(),compare_el_pat_vec());}
920 
925  sort(am_to_do.begin(), am_to_do.end(), compare_el_am_to_do());
926  }
927 
932  void insert_epm(const EPM &epm_to_insert){
933  pat_vec.insert(pat_vec.end(),epm_to_insert.begin(),epm_to_insert.end());
934  }
935 
942  bool includes(const EPM &epm_to_test) const{
943  assert(pat_vec_size()>=epm_to_test.pat_vec_size());
944  return std::includes(this->begin(),this->end(),
945  epm_to_test.begin(),epm_to_test.end(),
946  compare_el_pat_vec());
947  }
948 
955  bool includes_am(const EPM &epm_to_test) const{
956  return std::includes(am_begin(),am_end(),
957  epm_to_test.am_begin(),epm_to_test.am_end(),
958  compare_el_am_to_do());
959  }
960 
966  void print_epm(std::ostream &out, bool verbose) const{
967  out << "_________________________________________________" << std::endl;
968  out << "epm with score " << this->score << std::endl;
969  out << " ";
970  for(pat_vec_t::const_iterator it=pat_vec.begin();it!=pat_vec.end();++it){
971  out << it->first << ":" << it->second << " ";
972  }
973  out << std::endl;
974  out << " ";
975  for(pat_vec_t::const_iterator it=pat_vec.begin();it!=pat_vec.end();++it){
976  out << it->third;
977  }
978  out << std::endl;
979  out << "am_to_do " << am_to_do << std::endl;
980  out << "tolerance left " << this->max_tol_left << std::endl;
981  if(verbose){
982  out << "score " << score << std::endl;
983  out << "pos " << this->cur_pos.first << "," << this->cur_pos.second << std::endl;
984  out << "state " << this->state << std::endl;
985  }
986  out << "______________________________________________________" << std::endl;
987  }
988  };
989 
997  inline bool operator< (const EPM &epm1, const EPM &epm2) {
998  return epm1.get_max_tol_left()>epm2.get_max_tol_left();
999  }
1000 
1007  inline std::ostream & operator << (std::ostream &out, const EPM &epm){
1008  epm.print_epm(out,false);
1009  return out;
1010  }
1011 
1019  template <class T1>
1020  T1 max3(const T1 &first,const T1 &second, const T1 &third){
1021  return max(max(first,second),third);
1022  }
1023 
1032  template <class T1>
1033  T1 max4(const T1 &first,const T1 &second, const T1 &third, const T1 &fourth){
1034  return max(max3(first,second,third),fourth);
1035  }
1036 
1037 
1049  typedef BasePairs__Arc Arc;
1050 
1051  typedef SparsificationMapper::ArcIdx ArcIdx;
1052  typedef SparsificationMapper::ArcIdxVec ArcIdxVec;
1053  typedef SparsificationMapper::matidx_t matidx_t;
1054  typedef SparsificationMapper::seq_pos_t seqpos_t;
1055  typedef SparsificationMapper::index_t index_t;
1056  typedef SparseTraceController::matpos_t matpos_t;
1057  typedef SparseTraceController::pair_seqpos_t pair_seqpos_t;
1058 
1059  typedef EPM::PairArcIdx PairArcIdx;
1060  typedef EPM::PairArcIdxVec PairArcIdxVec;
1061 
1062  typedef std::list<EPM> epm_cont_t;
1063  typedef epm_cont_t::iterator epm_it_t;
1064  typedef std::pair<score_t,epm_cont_t > el_map_am_to_do_t;
1065 
1068  typedef unordered_map<PairArcIdx,el_map_am_to_do_t,pair_of_size_t_hash>::type map_am_to_do_t;
1069  private:
1070 
1073  //<state, max_tol, current matrix position, potential arcMatch, sequence position to be matched>
1075 
1076  //infty_score_t because of the check_poss, change to score_t!!!
1078  typedef triple<int,infty_score_t,matpos_t> poss_in_G;//<state,max_tol,current matrix position>
1079 
1080 
1081  const Sequence &seqA;
1082  const Sequence &seqB;
1083 
1084  const RnaData &rna_dataA;
1085  const RnaData &rna_dataB;
1086 
1087  const ArcMatches &arc_matches;
1088 
1089  const BasePairs &bpsA;
1090  const BasePairs &bpsB;
1091  const SparseTraceController &sparse_trace_controller;
1092  // (valid positions in the matrix)
1093  const SparsificationMapper &sparse_mapperA;
1094  const SparsificationMapper &sparse_mapperB;
1095  PatternPairMap &foundEPMs;
1096 
1097  ScoreMatrix L;
1098  ScoreMatrix G_A;
1099  ScoreMatrix G_AB;
1101  ScoreMatrix LR;
1102  ScoreMatrix F;
1103  ScoreMatrix Dmat;
1104 
1105  int alpha_1;
1106  int alpha_2;
1107  int alpha_3;
1108 
1109  int difference_to_opt_score;
1110  // worse than the optimal score are considered
1111  int min_score;
1112  long int max_number_of_EPMs;
1113  long int cur_number_of_EPMs;
1114 
1115  bool inexact_struct_match;
1116  score_t struct_mismatch_score;
1117  bool add_filter;
1118 
1119  bool verbose;
1120 
1121  pair_seqpos_t pos_of_max;
1122 
1123  enum{in_LR,in_G_A,in_G_AB,in_L,in_F};
1124 
1125  const Arc pseudo_arcA;
1126  const Arc pseudo_arcB;
1127 
1135  const
1136  infty_score_t &D(const ArcMatch &am) const {
1137  return D(am.arcA(),am.arcB());
1138  }
1139 
1147  infty_score_t &D(const ArcMatch &am) {
1148  return D(am.arcA(),am.arcB());
1149  }
1150 
1159  const
1160  infty_score_t &D(const Arc &a, const Arc &b) const {
1161  return Dmat(a.idx(),b.idx());
1162  }
1163 
1164 
1173  infty_score_t &D(const Arc &a, const Arc &b){
1174  return Dmat(a.idx(),b.idx());
1175  }
1176 
1177  bool nucleotide_match(seqpos_t pos_seqA, seqpos_t pos_seqB) const {
1178  assert(pos_seqA>=1 && pos_seqA<=seqA.length() &&
1179  pos_seqB>=1 && pos_seqB<=seqB.length()); //seqA and seqB are 1-based
1180  return (seqA[pos_seqA]==seqB[pos_seqB]);
1181  }
1182 
1183  bool seq_matching(ArcIdx idxA, ArcIdx idxB, matpos_t cur_mat_pos, pair_seqpos_t cur_seq_pos) const {
1184  seqpos_t i = cur_seq_pos.first;
1185  seqpos_t j = cur_seq_pos.second;
1186 
1187  return sparse_trace_controller.pos_unpaired(idxA,idxB,cur_mat_pos) &&
1188  nucleotide_match(i,j);
1189  }
1190 
1191  // ----------------------------------------
1192  // fill matrices
1193 
1194 
1198  void initialize_gap_matrices();
1199 
1201  void init_Fmat();
1202 
1213  void init_mat(ScoreMatrix &mat, const Arc &a, const Arc &b,
1214  infty_score_t first_entry,infty_score_t first_col, infty_score_t first_row);
1215 
1226  pair_seqpos_t compute_LGLR(const Arc &a, const Arc &b, bool suboptimal);
1227 
1240  infty_score_t compute_matrix_entry(const Arc &a, const Arc &b,matpos_t mat_pos,
1241  matpos_t mat_pos_diag, bool matrixLR, bool suboptimal);
1242 
1256  infty_score_t seq_str_matching(const Arc &a, const Arc &b, matpos_t mat_pos_diag,
1257  pair_seqpos_t seq_pos_to_be_matched, score_t add_score,bool matrixLR,
1258  bool suboptimal);
1259 
1261  void compute_F();
1262 
1263  // --------------------------------------------
1264  // helper functions
1265 
1271  score_t score_for_seq_match();
1272 
1281  infty_score_t score_for_am(const Arc &a, const Arc &b) const;
1282 
1294  score_t score_for_stacking(const Arc &a, const Arc &b,
1295  const Arc &inner_a,const Arc &inner_b);
1296 
1304  void add_foundEPM(EPM &cur_epm, bool count_EPMs);
1305 
1306  bool check_PPM(){
1307  if(this->difference_to_opt_score != -1) return true; //when we use the parameter difference_to_opt_score,
1308  //we enumerate all EPMs regardless of whether the number extends the max_number_of_EPMs
1309  if(cur_number_of_EPMs>=max_number_of_EPMs+1) return false;
1310  else return true;
1311  }
1312 
1320  void find_start_pos_for_tb(bool suboptimal, score_t difference_to_opt_score=-1, bool count_EPMs=false);
1321 
1322  bool check_num_EPMs(){
1323  double valid_deviation = 0.8;
1324  return (cur_number_of_EPMs>=max_number_of_EPMs*valid_deviation && cur_number_of_EPMs<=max_number_of_EPMs);
1325  }
1326 
1327 
1328  // --------------------------------------------
1329  // heuristic traceback
1330 
1339  void trace_F_heuristic(pos_type i, pos_type j,EPM &cur_epm);
1340 
1349  void trace_LGLR_heuristic(const Arc &a, const Arc &b, EPM &cur_epm);
1350 
1365  bool trace_seq_str_matching_heuristic(const Arc &a, const Arc &b,int &state,
1366  matpos_t &cur_mat_pos, matpos_t mat_pos_diag,
1367  pair_seqpos_t seq_pos_to_be_matched, score_t add_score);
1368 
1369 
1370  // --------------------------------------------
1371  // suboptimal traceback
1372 
1384  void trace_F_suboptimal(pos_type i,pos_type j,score_t max_tol, bool recurse, bool count_EPMs);
1385 
1393  void apply_filter(epm_cont_t &found_epms);
1394 
1407  void trace_LGLR_suboptimal(const Arc &a, const Arc &b, score_t max_tol,
1408  epm_cont_t &found_epms, bool recurse, bool count_EPMs);
1409 
1410 
1430  void trace_seq_str_matching_subopt(const Arc &a, const Arc &b,
1431  score_t score_contr, matpos_t mat_pos_diag, pair_seqpos_t seq_pos_to_be_matched,
1432  const PairArcIdx &am, poss_L_LR &poss, epm_it_t cur_epm, epm_cont_t &found_epms,
1433  map_am_to_do_t &map_am_to_do, bool count_EPMs);
1434 
1451  bool check_poss(const Arc &a, const Arc &b, const poss_L_LR &pot_new_poss,
1452  poss_L_LR &poss, epm_it_t cur_epm, epm_cont_t &found_epms,
1453  map_am_to_do_t &am_to_do_for_cur_am, bool count_EPMs);
1454 
1455 
1472  void store_new_poss(const Arc &a, const Arc &b, bool last_poss,
1473  const poss_L_LR &new_poss, poss_L_LR &poss, epm_it_t cur_epm,
1474  epm_cont_t &found_epms, map_am_to_do_t &am_to_do_for_cur_am, bool count_EPMs);
1475 
1491  void trace_G_suboptimal(const Arc &a, const Arc &b, const poss_L_LR &pot_new_poss,
1492  poss_L_LR &poss, epm_it_t cur_epm, epm_cont_t &found_epms,
1493  map_am_to_do_t &map_am_to_do, bool count_EPMs);
1494 
1505  bool is_valid_gap(const Arc &a, const Arc &b,
1506  const poss_L_LR &pot_new_poss);
1507 
1523  void preproc_fill_epm(map_am_to_do_t &am_to_do, epm_it_t cur_epm,
1524  epm_cont_t &found_epms, bool count_EPMs, score_t min_allowed_score=-1);
1525 
1544  void fill_epm(const map_am_to_do_t &map_am_to_do, size_type vec_idx,
1545  std::vector<score_t> &max_tol_left_up_to_pos, std::vector<const EPM*> &epms_to_insert,
1546  score_t min_score, epm_it_t cur_epm, epm_cont_t &found_epms,bool count_EPMs);
1547 
1548  // --------------------------------------------
1549  // debugging/testing
1550 
1551  // print the matrices in the condensed form
1552  void print_matrices(const Arc &a, const Arc &b, size_type offset_A, size_type offset_B, bool suboptimal,bool add_info);
1553 
1554  // checks whether an epm is valid, i.e. only one gap per arc match etc.
1555  bool validate_epm(const EPM &epm_to_test) const;
1556 
1557  // checks the validity of the epm list, i.e. that no epm is contained in another epm (all
1558  // epms are maximally extended)
1559  bool validate_epm_list(epm_cont_t &found_epms) const;
1560 
1561  public:
1562 
1563 
1586  ExactMatcher(const Sequence &seqA_,
1587  const Sequence &seqB_,
1588  const RnaData &rna_dataA_,
1589  const RnaData &rna_dataB_,
1590  const ArcMatches &arc_matches_,
1591  const SparseTraceController &sparse_trace_controller_,
1592  PatternPairMap &foundEPMs_,
1593  int alpha_1_,
1594  int alpha_2_,
1595  int alpha_3_,
1596  score_t difference_to_opt_score_,
1597  score_t min_score_,
1598  long int max_number_of_EPMs_,
1599  bool inexact_struct_match_,
1600  score_t struct_mismatch_score_,
1601  bool apply_filter_,
1602  bool opt_verbose_
1603  );
1604 
1605  ~ExactMatcher();
1606 
1609  void compute_arcmatch_score();
1610 
1612  void test_arcmatch_score();
1613 
1620  void trace_EPMs(bool suboptimal);
1621 
1622 
1623  };
1624 
1625 
1626 } //end namespace
1627 
1628 #endif // EXACT_MATCHER_HH
manage a set of EPMs (PatternPair)
Definition: exact_matcher.hh:209
represent sparsified data of RNA ensemble
Definition: rna_data.hh:42
matidx_t idx_after_leq(index_t index, seq_pos_t max_col, index_t left_end=std::numeric_limits< index_t >::max()) const
Definition: sparsification_mapper.hh:299
bool is_invalid() const
Definition: exact_matcher.hh:752
size_t left() const
Definition: basepairs.hh:72
score_t get_score() const
returns the score of the EPM
Definition: exact_matcher.hh:736
bool includes_am(const EPM &epm_to_test) const
Definition: exact_matcher.hh:955
const SparsificationMapper & get_sparse_mapperA() const
destructor
Definition: exact_matcher.hh:490
combines the TraceController with the Mapper for both sequences
Definition: exact_matcher.hh:453
size_t right() const
Definition: basepairs.hh:80
const int getScore() const
Definition: exact_matcher.hh:179
std::ostream & operator<<(std::ostream &out, AlignerRestriction r)
Definition: aligner_restriction.hh:113
TaintedInftyInt max(const TaintedInftyInt &x, const TaintedInftyInt &y)
Definition: infty_int.hh:617
bool includes(const EPM &epm_to_test) const
Definition: exact_matcher.hh:942
SparseTraceController::matpos_t matpos_t
a type for a position in a sparsified matrix
Definition: exact_matcher.hh:667
const int & getSize() const
Definition: exact_matcher.hh:131
EPM()
Constructor.
Definition: exact_matcher.hh:720
SparsificationMapper::ArcIdx ArcIdx
arc index
Definition: exact_matcher.hh:668
const int getEPMScore() const
Definition: exact_matcher.hh:185
SinglePattern(const std::string &myId_, const std::string &seqId_, const intVec &mySinglePattern_)
constructor
Definition: exact_matcher.hh:55
stores a Pattern in one sequence
Definition: exact_matcher.hh:44
void add_am(const Arc &a, const Arc &b)
Definition: exact_matcher.hh:889
const intVec & getPat() const
Definition: exact_matcher.hh:80
bool is_valid_idx_pos(index_t idxA, index_t idxB, matpos_t mat_pos) const
checks whether a matrix position is valid
Definition: exact_matcher.hh:649
matidx_t min_col_idx(index_t indexA, index_t indexB, matidx_t idx_i, index_t left_endB=std::numeric_limits< index_t >::max()) const
minimal column of trace in a row in the sparsified matrix
Definition: exact_matcher.hh:509
orderedMapTYPE::iterator orderedMapITER
iterator for the map
Definition: exact_matcher.hh:217
size_type pos_type
type of a sequence position
Definition: aux.hh:97
pos_type seq_pos_t
type for a sequence position
Definition: sparsification_mapper.hh:41
const SinglePattern & getFirstPat() const
Definition: exact_matcher.hh:137
pat_vec_t::size_type pat_vec_size() const
Definition: exact_matcher.hh:840
const Arc & arcA() const
Definition: arc_matches.hh:66
void set_score(score_t score_)
Definition: exact_matcher.hh:764
const score_t & get_max_tol_left() const
returns the maximal tolerance that is left for the EPM
Definition: exact_matcher.hh:745
size_t size_type
general size type
Definition: aux.hh:94
pair_seqpos_t pos_in_seq(index_t idxA, index_t idxB, const matpos_t &cur_pos) const
maps the matrix position cur_pos to the corresponding pair of positions in sequence A and B ...
Definition: exact_matcher.hh:606
std::pair< matidx_t, matidx_t > matpos_t
a type for a position in a sparsified matrix
Definition: exact_matcher.hh:463
std::pair< ArcIdx, ArcIdx > PairArcIdx
pair of arc indices
Definition: exact_matcher.hh:670
SparsificationMapper::seq_pos_t seqpos_t
a type for a sequence position
Definition: exact_matcher.hh:666
const std::vector< intPPair > & getInsideBounds() const
Definition: exact_matcher.hh:167
const matpos_t & get_cur_pos() const
returns the current matrix position of the EPM
Definition: exact_matcher.hh:742
pair_seqpos_t last_matched_pos()
Definition: exact_matcher.hh:859
virtual ~SinglePattern()
Destructor.
Definition: exact_matcher.hh:62
BasePairs__Arc Arc
arc class of BasePairs
Definition: exact_matcher.hh:664
unordered_map< std::string, SelfValuePTR >::type PatternIdMapTYPE
map type patternId -> pointer to PatternPair
Definition: exact_matcher.hh:221
bool get_first_insertion() const
returns whether it is the first insertion into the EPM
Definition: exact_matcher.hh:748
Maintains the relevant arc matches and their scores.
Definition: arc_matches.hh:112
std::list< SelfValuePTR > patListTYPE
list of patternPairs
Definition: exact_matcher.hh:218
void insert_epm(const EPM &epm_to_insert)
Definition: exact_matcher.hh:932
std::vector< PairArcIdx > PairArcIdxVec
a vector of pairs of arc indices
Definition: exact_matcher.hh:671
orderedMapTYPE::const_iterator orderedMapCITER
const iterator for the map
Definition: exact_matcher.hh:216
std::vector< ArcIdx > ArcIdxVec
vector of arc indices
Definition: sparsification_mapper.hh:39
Definition: aligner.cc:17
const int getMinPatternSize() const
Definition: exact_matcher.hh:318
PairArcIdxVec::const_iterator am_end() const
Definition: exact_matcher.hh:824
size_t index_t
type for an index
Definition: sparsification_mapper.hh:49
const std::string & getId() const
Definition: exact_matcher.hh:125
patListTYPE::iterator patListITER
iterator for the list of PatternPairs
Definition: exact_matcher.hh:219
Represents a 3-tuple.
Definition: tuples.hh:17
bool pos_unpaired(index_t idx, matidx_t pos) const
Definition: sparsification_mapper.hh:226
void overwrite(seqpos_t posA, seqpos_t posB, char c, pat_vec_t::size_type pos)
Definition: exact_matcher.hh:879
std::pair< seqpos_t, seqpos_t > pair_seqpos_t
a type for a pair of positions in the sequences
Definition: exact_matcher.hh:464
SparseTraceController::pair_seqpos_t pair_seqpos_t
pair of positions in sequence A and B
Definition: exact_matcher.hh:669
pat_vec_t::const_iterator begin() const
Definition: exact_matcher.hh:846
Represents the mapping for sparsification.
Definition: sparsification_mapper.hh:34
seq_pos_t get_pos_in_seq_new(index_t idx, matidx_t pos) const
Definition: sparsification_mapper.hh:205
virtual ~EPM()
destructor
Definition: exact_matcher.hh:729
is able to manage an EPM, consists of 2 singlepatterns, one in each RNA
Definition: exact_matcher.hh:92
PatternPair(const std::string &myId, const SinglePattern &myFirstPat, const SinglePattern &mySecPat, const std::string &structure_, int &score_)
Constructor.
Definition: exact_matcher.hh:105
const intPPair getOutsideBounds() const
Definition: exact_matcher.hh:158
void set_state(int state_)
Definition: exact_matcher.hh:770
Represents a 5-tuple.
Definition: tuples.hh:64
Definition: infty_int.hh:344
Computes exact pattern matchings (EPM) between two RNA sequences.
Definition: exact_matcher.hh:1048
pos_type matidx_t
type for a matrix position
Definition: sparsification_mapper.hh:40
bool matching_wo_gap(index_t idxA, index_t idxB, matpos_t idx_pos_diag, pair_seqpos_t seq_pos_to_be_matched) const
is a EPM without a gap in between possible
Definition: exact_matcher.hh:624
const Arc & arcB() const
Definition: arc_matches.hh:74
pos_type length() const
Length of multiple aligment.
Definition: multiple_alignment.hh:624
void set_first_insertion(bool first_insertion_)
Definition: exact_matcher.hh:788
PairArcIdx next_arcmatch()
Definition: exact_matcher.hh:909
const std::string & getmyId() const
Definition: exact_matcher.hh:68
std::vector< el_pat_vec > pat_vec_t
type for pattern vector
Definition: exact_matcher.hh:676
el_pat_vec pat_vec_at(pat_vec_t::size_type idx) const
Definition: exact_matcher.hh:831
T3 third
third value
Definition: tuples.hh:19
virtual ~PatternPair()
Destructor.
Definition: exact_matcher.hh:116
void add(seqpos_t posA, seqpos_t posB, char c)
Definition: exact_matcher.hh:870
patListTYPE::const_iterator patListCITER
const iterator for the list of PatternPairs
Definition: exact_matcher.hh:220
const SinglePattern & getSecPat() const
Definition: exact_matcher.hh:143
T1 max4(const T1 &first, const T1 &second, const T1 &third, const T1 &fourth)
Definition: exact_matcher.hh:1033
matpos_t diag_pos_bef(index_t indexA, index_t indexB, pair_seqpos_t cur_pos_seq, index_t left_endA=std::numeric_limits< index_t >::max(), index_t left_endB=std::numeric_limits< index_t >::max()) const
computes the first valid matrix position before a sequence position considering the trace controller ...
Definition: exact_matcher.hh:544
const SparsificationMapper & get_sparse_mapperB() const
returns reference to sparsification mapper for sequence B
Definition: exact_matcher.hh:495
void set_max_tol_left(score_t tol)
Definition: exact_matcher.hh:782
void store_am(const Arc &a, const Arc &b)
Definition: exact_matcher.hh:899
bool operator<(const BasePairs::LeftAdjEntry &e1, const BasePairs::LeftAdjEntry &e2)
Definition: basepairs.cc:83
T1 max3(const T1 &first, const T1 &second, const T1 &third)
Definition: exact_matcher.hh:1020
pat_vec_t::const_iterator end() const
Definition: exact_matcher.hh:852
const PairArcIdx & get_am(PairArcIdxVec::size_type idx) const
Definition: exact_matcher.hh:800
void print_epm(std::ostream &out, bool verbose) const
Definition: exact_matcher.hh:966
bool pos_unpaired(index_t idxA, index_t idxB, matpos_t pos) const
checks whether the matrix position pos can be unpaired in both sequences
Definition: exact_matcher.hh:636
PatternPairMap(const PatternPairMap &myPairMap)
Definition: exact_matcher.hh:229
Represents a base pair.
Definition: basepairs.hh:40
Controls the matrix cells valid for traces.
Definition: trace_controller.hh:177
computes the best chain of EPMs, the LCS-EPM
Definition: exact_matcher.hh:339
const std::string & getseqId() const
Definition: exact_matcher.hh:74
size_t idx() const
Definition: basepairs.hh:88
matidx_t idx_after_max_col_idx(index_t indexA, index_t indexB, matidx_t idx_i, index_t left_endB=std::numeric_limits< index_t >::max()) const
index after maximal column of trace in a row in the sparsified matrix
Definition: exact_matcher.hh:525
size_t ArcIdx
type of arc index
Definition: sparsification_mapper.hh:38
Describes sequence and structure ensemble of an RNA.
Definition: basepairs.hh:107
triple< seqpos_t, seqpos_t, char > el_pat_vec
type for elements of the pattern vector, <position in seq A, position in sequence B...
Definition: exact_matcher.hh:674
LCSEPM(const Sequence &seqA_, const Sequence &seqB_, const PatternPairMap &myPatterns, PatternPairMap &myLCSEPM)
Definition: exact_matcher.hh:350
Definition: aux.hh:51
void set_invalid()
sets the flag invalid for the EPM
Definition: exact_matcher.hh:791
PatternPair selfValueTYPE
PatternPair.
Definition: exact_matcher.hh:212
long int score_t
type of the locarna score as defined by the class Scoring
Definition: scoring_fwd.hh:13
PatternPair * SelfValuePTR
pointer to PatternPair
Definition: exact_matcher.hh:213
int get_state() const
return the current matrix state of the EPM
Definition: exact_matcher.hh:739
const std::string & get_struct() const
Definition: exact_matcher.hh:191
PairArcIdxVec::size_type number_of_am()
Definition: exact_matcher.hh:809
void set_cur_pos(matpos_t cur_pos_)
Definition: exact_matcher.hh:776
"Sequence View" of multiple alignment as array of column vectors
Definition: sequence.hh:29
Represents a match of two base pairs (arc match)
Definition: arc_matches.hh:35
PairArcIdxVec::const_iterator am_begin() const
Definition: exact_matcher.hh:818
a class for the representation of exact pattern matches (EPM)
Definition: exact_matcher.hh:661
std::multimap< int, SelfValuePTR, std::greater< int > > orderedMapTYPE
ordered map type
Definition: exact_matcher.hh:215
void clear_am_to_do()
deletes the list am_to_do
Definition: exact_matcher.hh:812
void sort_am_to_do()
Definition: exact_matcher.hh:924
matidx_t first_valid_mat_pos_before(index_t index, seq_pos_t pos, index_t left_end=std::numeric_limits< index_t >::max()) const
Definition: sparsification_mapper.hh:191
matidx_t idx_geq(index_t index, seq_pos_t min_col, index_t left_end=std::numeric_limits< index_t >::max()) const
Definition: sparsification_mapper.hh:268
void sort_patVec()
Definition: exact_matcher.hh:919
SparseTraceController(const SparsificationMapper &sparse_mapperA_, const SparsificationMapper &sparse_mapperB_, const TraceController &trace_controller_)
constructor
Definition: exact_matcher.hh:478