LocARNA-1.8.11
multiple_alignment.hh
1 #ifndef LOCARNA_MULTIPLE_ALIGNMENT_HH
2 #define LOCARNA_MULTIPLE_ALIGNMENT_HH
3 
4 #ifdef HAVE_CONFIG_H
5 # include <config.h>
6 #endif
7 
8 #include <iosfwd>
9 #include <string>
10 #include <vector>
11 #include <map>
12 
13 #include "aux.hh"
14 #include "string1.hh"
15 #include "scoring_fwd.hh"
16 #include "sequence_annotation.hh"
17 
18 #include <assert.h>
19 
20 #include <iostream>
21 
22 
23 namespace LocARNA {
24 
//! Forward declarations. The declarations visible in this header use
//! Alignment, AlignmentEdges, Alphabet and Sequence only through
//! references; BasePairs and Scoring are not referenced in the visible
//! part of the header.
25  class Alignment;
26  class AlignmentEdges;
27  template<class T> class Alphabet;
28  class BasePairs;
29  class Scoring;
30  class Sequence;
31 
66 
67 public:
//! size type (alias of size_t) used for row and column counts
68  typedef size_t size_type;
69 
//! File format type for multiple alignments.
//!
//! NOTE(review): this listing shows only the PP enumerator, while
//! FormatType::CLUSTAL is used as a default argument further down and
//! size() reports 4. Enumerator lines appear to have been dropped from
//! the listing; confirm against the full header.
73  struct FormatType {
//! inner type
75  enum type {
77  PP,
80  };
81 
//! size of enum (hard-coded to 4)
83  static size_t
84  size() {return 4;}
85  };
86 
//! Type of sequence annotation; enumerates the legal annotation types.
//!
//! NOTE(review): only the last enumerator (anchors) is visible in this
//! listing while size() reports 4; the other enumerator lines appear to
//! have been dropped. Confirm against the full header.
89  struct AnnoType {
//! inner type
91  enum type {
101  anchors
102  };
103 
//! size of enum (hard-coded to 4)
105  static size_t
106  size() {return 4;}
107  };
108 
109 private:
//! Type of the annotation tag table: a vector of vectors of strings.
114  typedef std::vector<
115  std::vector<std::string>
116  > annotation_tags_t;
117 
//! Static annotation tag table shared by all instances. Only declared
//! here; its definition is not part of this header listing.
118  static
119  annotation_tags_t annotation_tags;
120 
//! Static helper that, judging by its name, initializes annotation_tags.
//! Implementation is not visible here (TODO confirm when it is called).
122  static
123  void
124  init_annotation_tags();
125 
126 
127 public:
//! Number of annotation types, reported as the size of annotation_tags.
//!
//! NOTE(review): the line carrying the function name and opening brace
//! (header line 132, num_of_annotypes()) is missing from this listing.
130  static
131  size_t
133  return annotation_tags.size();
134  }
135 
//! @brief A row in a multiple alignment.
//!
//! Stores a name, a description and the alignment string of one
//! sequence; the alignment string is held as a string1.
//!
//! NOTE(review): header lines 146 (size_type typedef) and 253 (signature
//! of reverse()) are missing from this listing.
144  class SeqEntry {
145  public:
147 
//! pair of positions
148  typedef std::pair<pos_type,pos_type> pos_pair_t;
149 
150  private:
151  std::string name_; //!< name of the sequence
152  std::string description_; //!< description of the sequence
153  string1 seq_; //!< alignment string of the sequence
154 
155  public:
156 
//! Construct from strings name and seq.
//! The description is set to the empty string; seq is converted to
//! string1.
164  SeqEntry(const std::string &name,
165  const std::string &seq)
166  : name_(name), description_(""), seq_((string1)seq)
167  {}
168 
//! Construct from string name and 1-based string seq.
//! The description is set to the empty string.
176  SeqEntry(const std::string &name, const string1 &seq)
177  : name_(name), description_(""), seq_(seq)
178  {}
179 
//! Construct from strings name, description and seq.
//! seq is converted to string1.
187  SeqEntry(const std::string &name,
188  const std::string &description,
189  const std::string &seq)
190  : name_(name), description_(description), seq_((string1)seq)
191  {}
192 
//! Construct from strings name, description and 1-based string seq.
200  SeqEntry(const std::string &name,
201  const std::string &description,
202  const string1 &seq)
203  : name_(name), description_(description), seq_(seq)
204  {}
205 
206  // access
207 
//! (read-only) access to name
209  const std::string &
210  name() const {return name_;}
211 
//! (read-only) access to description
213  const std::string &
214  description() const {return description_;}
215 
//! (read-only) access to seq
217  const string1 &
218  seq() const {return seq_;}
219 
//! Length of the sequence; by its name presumably counted without gap
//! symbols (implementation not visible here, TODO confirm).
221  size_type
222  length_wogaps() const;
223 
224  //****************************************
225  // projections
226 
//! Map a position to a column (declaration only).
//! NOTE(review): presumably a sequence position to an alignment column;
//! confirm indexing base and gap handling in the implementation.
234  pos_type
235  pos_to_col(pos_type pos) const;
236 
//! Map a column to a pair of positions (declaration only).
//! NOTE(review): the meaning of the two positions in the returned pair
//! is not visible here; confirm in the implementation.
245  pos_pair_t
246  col_to_pos(pos_type col) const;
247 
//! reverse sequence (delegates to string1::reverse on seq_)
252  void
254  seq_.reverse();
255  }
256 
//! append character to sequence
261  void
262  push_back(char c) {
263  seq_.push_back(c);
264  }
265 
//! write access to seq (replaces the stored alignment string)
267  void
268  set_seq(const string1 &seq) {seq_=seq;}
269  };
270 
276  class AliColumn {
277  const MultipleAlignment &ma_;
278  size_type col_index_;
279  public:
286  AliColumn(const MultipleAlignment &ma,size_type col_index): ma_(ma),col_index_(col_index) {
287  assert(1<=col_index);
288  assert(col_index<=ma.length());
289  }
290 
298  const char &
299  operator [](size_type row_index) const {
300  return ma_.seqentry(row_index).seq()[col_index_];
301  }
302 
307  size_type
308  size() const {
309  return ma_.num_of_rows();
310  }
311 
319  bool
320  operator ==(const AliColumn &ac) const {
321  bool ret = this->size()==ac.size();
322  for (size_type i=0; ret && i<size(); i++) {
323  ret = ( this->ma_.seqentry(i).seq()[this->col_index_]
324  ==
325  ac.ma_.seqentry(i).seq()[ac.col_index_] );
326  }
327  return ret;
328  }
329 
337  bool
338  operator !=(const AliColumn &ac) const {
339  return !(*this == ac);
340  }
341 
342  };
343 
344 private:
345 
//! map from a string to an index (used for name2idx_)
347  typedef std::map<std::string,size_type> str2idx_map_t;
348 
//! map from a size_t key to a sequence annotation; set_annotation() and
//! has_annotation() key it by annotation type
350  typedef std::map<size_t,SequenceAnnotation> annotation_map_t;
351 
352  //************************************************************
353  // attributes of MultipleAlignment
354 
//! the rows (sequence entries) of the alignment
356  std::vector<SeqEntry> alig_;
357 
//! annotations stored for this alignment
359  annotation_map_t annotations_;
360 
//! lookup table from sequence name to index; index() reads it
365  str2idx_map_t name2idx_;
366 
367  // end attributes
368  //************************************************************
369 
//! Build the name-to-index map (declaration only).
//! NOTE(review): presumably fills name2idx_ from alig_; confirm in the
//! implementation file.
371  void
372  create_name2idx_map();
373 
374 
//! Read an alignment in a clustal-like format from a stream.
//! @param in input stream
//! @param format file format selector
//! NOTE(review): which formats are accepted is not visible here.
383  void
384  read_clustallike(std::istream &in, FormatType::type format);
385 
//! Read an alignment in stockholm format from a stream (declaration only).
//! @param in input stream
394  void
395  read_stockholm(std::istream &in);
396 
//! Read an alignment in clustalw format from a stream (declaration only).
//! @param in input stream
408  void
409  read_clustalw(std::istream &in);
410 
//! Read an alignment in fasta format from a stream (declaration only).
//! @param in input stream
429  void
430  read_fasta(std::istream &in);
431 
432 public:
433 
//! const iterator of sequence entries
435  typedef std::vector<SeqEntry>::const_iterator const_iterator;
436 
//! NOTE(review): the default constructor declaration (header line 438,
//! "Construct empty") is missing from this listing.
439 
//! Construct from a file name and a file format.
//! @param file file name
//! @param format file format, CLUSTAL by default
//! NOTE(review): presumably reads the alignment from the named file;
//! error behaviour is not visible here.
448  MultipleAlignment(const std::string &file, FormatType::type format=FormatType::CLUSTAL);
449 
//! Construct from an input stream and a file format.
//! @param in input stream
//! @param format file format, CLUSTAL by default
457  MultipleAlignment(std::istream &in, FormatType::type format=FormatType::CLUSTAL);
458 
//! Construct from a name and a sequence string.
//! @param name sequence name
//! @param sequence sequence string
464  MultipleAlignment(const std::string &name,
465  const std::string &sequence);
466 
//! Construct from two names and two alignment strings.
//! @param nameA name of the first sequence
//! @param nameB name of the second sequence
//! @param alistringA alignment string of the first sequence
//! @param alistringB alignment string of the second sequence
476  MultipleAlignment(const std::string &nameA,
477  const std::string &nameB,
478  const std::string &alistringA,
479  const std::string &alistringB);
480 
//! Construct from an Alignment object.
//! @param alignment the alignment
//! @param only_local flag, false by default (semantics defined in the
//!        implementation; TODO confirm)
//! @param special_gap_symbols flag, false by default (TODO confirm
//!        which symbols are meant)
494  MultipleAlignment(const Alignment &alignment,
495  bool only_local=false,
496  bool special_gap_symbols=false);
497 
//! Construct from alignment edges and the two sequences they refer to.
//! @param edges alignment edges
//! @param seqA first sequence
//! @param seqB second sequence
509  MultipleAlignment(const AlignmentEdges &edges,
510  const Sequence &seqA,
511  const Sequence &seqB);
512 
513 protected:
//! Initialize from alignment edges and sequences.
//! @param edges alignment edges
//! @param seqA first sequence
//! @param seqB second sequence
//! @param special_gap_symbols flag; its exact effect is defined in the
//!        implementation (TODO confirm)
522  void
523  init(const AlignmentEdges &edges,
524  const Sequence &seqA,
525  const Sequence &seqB,
526  bool special_gap_symbols);
527 public:
528 
//! NOTE(review): header line 533 is missing from this listing;
//! presumably it declares the virtual destructor ~MultipleAlignment().
532  virtual
534 
//! "cast" multiple alignment to sequence
//! @return const reference to a Sequence (how the reference is obtained
//!         is not visible here)
541  const Sequence & as_sequence() const;
542 
//! NOTE(review): header line 551 is missing from this listing;
//! presumably it declares normalize_rna_symbols().
550  void
552 
557  size_type
558  num_of_rows() const {
559  return alig_.size();
560  }
561 
570  bool
571  empty() const {
572  return alig_.empty();
573  }
574 
//! Read access of annotation.
//! @param annotype annotation type
//! @return const reference to the sequence annotation
//! NOTE(review): behaviour when no annotation of that type is stored is
//! not visible here; check has_annotation() first or confirm in the
//! implementation.
581  const SequenceAnnotation &
582  annotation(const AnnoType::type &annotype) const;
583 
//! Write access to annotation: stores annotation under the key annotype,
//! after asserting that annotype is within 0..num_of_annotypes()-1.
//!
//! NOTE(review): header line 592, which carries the function name and
//! the first parameter (set_annotation(const AnnoType::type &annotype,),
//! is missing from this listing.
591  void
593  const SequenceAnnotation &annotation) {
594  assert(0<=annotype && annotype<num_of_annotypes());
595  annotations_[(size_t)annotype] = annotation;
596  }
597 
603  bool
604  has_annotation(const AnnoType::type &annotype) const {
605  assert(0<=annotype && annotype<num_of_annotypes());
606  return annotations_.find(annotype)!=annotations_.end();
607  }
608 
//! Test whether alignment is proper.
//! NOTE(review): the definition of "proper" lives in the implementation
//! and is not visible in this header.
613  bool
614  is_proper() const;
615 
623  pos_type
624  length() const { return alig_.empty() ? 0 : alig_[0].seq().length(); }
625 
630  const_iterator
631  begin() const {
632  return alig_.begin();
633  }
634 
639  const_iterator
640  end() const {
641  return alig_.end();
642  }
643 
//! Test whether name exists.
//! @param name sequence name (taken by value)
//! @return presumably true iff a row with that name is present;
//!         the lookup itself is in the implementation file
649  bool
650  contains(std::string name) const;
651 
652  /* index access saves time over access by sequence name */
653 
661  size_type
662  index(const std::string &name) const {
663  str2idx_map_t::const_iterator it = name2idx_.find(name);
664  assert(it!=name2idx_.end());
665  return it->second;
666  }
667 
675  const SeqEntry &
676  seqentry(size_type index) const {
677  return alig_[index];
678  }
679 
686  const SeqEntry &
687  seqentry(const std::string &name) const {
688  return alig_[index(name)];
689  }
690 
691 
//! Deviation of a multiple alignment from a reference alignment.
//! @param ma the other multiple alignment
//! NOTE(review): which of *this and ma acts as the reference is not
//! visible here; confirm in the implementation.
702  size_type
703  deviation(const MultipleAlignment &ma) const;
704 
//! Sum-of-pairs score between a multiple alignment and a reference
//! alignment.
//! @param ma the other multiple alignment
//! @param compalign flag, true by default (semantics defined in the
//!        implementation; TODO confirm)
719  double
720  sps(const MultipleAlignment &ma, bool compalign=true) const;
721 
//! NOTE(review): header line 737 is missing from this listing;
//! presumably it declares cmfinder_realignment_score(const
//! MultipleAlignment &ma) const.
736  double
738 
//! Average deviation score.
//! @param ma the other multiple alignment
751  double
752  avg_deviation_score(const MultipleAlignment &ma) const;
753 
754 
//! Consensus sequence of multiple alignment.
//! @return the consensus as a std::string (how the consensus is chosen
//!         is defined in the implementation)
763  std::string
764  consensus_sequence() const;
765 
773  AliColumn
774  column(size_type col_index) const {
775  return AliColumn(*this,col_index);
776  }
777 
//! Append sequence entry.
//! @param seqentry the entry to append
785  void
786  append(const SeqEntry &seqentry);
787 
//! Prepend sequence entry.
//! @param seqentry the entry to prepend
798  void
799  prepend(const SeqEntry &seqentry);
800 
//! Append a column.
//! @param c the column to append
806  void
807  operator += (const AliColumn &c);
808 
//! Append a character.
//! NOTE(review): presumably c is appended to every row; confirm in the
//! implementation.
814  void
815  operator += (char c);
816 
//! reverse the multiple alignment
820  void
821  reverse();
822 
823 
824  // ------------------------------------------------------------
825  // output
826 
//! Write alignment to stream.
//! @param out output stream
//! @param format file format, CLUSTAL by default
//! @return an output stream reference (presumably out)
839  std::ostream &
840  write(std::ostream &out, FormatType::type format=MultipleAlignment::FormatType::CLUSTAL) const;
841 
//! Write alignment to stream, with an additional width parameter.
//! @param out output stream
//! @param width width value (presumably the line width used for
//!        wrapping; TODO confirm)
//! @param format file format, CLUSTAL by default
855  std::ostream &
856  write(std::ostream &out, size_t width, FormatType::type format=MultipleAlignment::FormatType::CLUSTAL) const;
857 
//! Write formatted line of name and sequence.
//! @param out output stream
//! @param name name string
//! @param sequence sequence string
//! @param namewidth width reserved for the name (TODO confirm padding
//!        rule in the implementation)
870  std::ostream &
871  write_name_sequence_line(std::ostream &out,
872  const std::string &name,
873  const std::string &sequence,
874  size_t namewidth) const;
875 
//! Write a range of the alignment, given by start and end, to a stream.
//! NOTE(review): header line 893, the last line of this declaration, is
//! missing from this listing, so the full parameter list is not
//! visible. Whether start/end are inclusive is also not visible here.
889  std::ostream &
890  write(std::ostream &out,
891  size_type start,
892  size_type end,
894 
//! check character constraints
//! @param alphabet alphabet to check against
//! @return presumably true iff all characters are allowed by alphabet
905  bool
906  checkAlphabet(const Alphabet<char> &alphabet) const;
907 
908 private:
909 
//! Static helper computing a deviation value from two alignment strings
//! and two reference alignment strings (declaration only).
//! NOTE(review): the exact definition is in the implementation file.
919  static
920  size_type
921  deviation2(const string1 &a1,
922  const string1 &a2,
923  const string1 &ref1,
924  const string1 &ref2
925  );
926 
927 
//! Static helper computing a pairwise match score for two rows against
//! two reference rows (declaration only).
//! @param score_common_gaps flag; by its name presumably controls
//!        whether gaps shared by both rows are scored (TODO confirm)
941  static
942  double
943  pairwise_match_score(const SeqEntry &a1,
944  const SeqEntry &a2,
945  const SeqEntry &ref1,
946  const SeqEntry &ref2,
947  bool score_common_gaps
948  );
949 
//! Static helper returning a vector of ints computed from two alignment
//! strings (declaration only).
//! NOTE(review): the meaning of the vector entries is not visible here.
960  static
961  std::vector<int>
962  match_vector(const string1 &s,
963  const string1 &t);
964 
//! Variant of match_vector with the same signature (declaration only).
//! NOTE(review): how it differs from match_vector is not visible here.
975  static
976  std::vector<int>
977  match_vector2(const string1 &s,
978  const string1 &t);
979 
//! Static helper counting matches between two rows (declaration only).
988  static
989  size_t
990  count_matches(const SeqEntry &a1,
991  const SeqEntry &a2);
992 
//! Static helper counting exclusive matches of two rows with respect to
//! two reference rows (declaration only).
//! NOTE(review): the definition of "exclusive" is in the implementation.
1005  static
1006  size_t
1007  count_exclusive_matches(const SeqEntry &a1,
1008  const SeqEntry &a2,
1009  const SeqEntry &ref1,
1010  const SeqEntry &ref2
1011  );
1012 
//! Static helper computing a pairwise deviation score of two rows with
//! respect to two reference rows (declaration only).
1030  static
1031  double
1032  pairwise_deviation_score(const SeqEntry &a1,
1033  const SeqEntry &a2,
1034  const SeqEntry &ref1,
1035  const SeqEntry &ref2
1036  );
1037 
1038 public:
1039 
//! Print contents of object to stream.
//! @param out output stream, std::cout by default
1044  void
1045  write_debug(std::ostream &out=std::cout) const;
1046 };
1047 
//! Stream output operator for multiple alignments.
//! @param out output stream
//! @param ma multiple alignment to write
//! @return an output stream reference (presumably out; the output
//!         format is chosen in the implementation)
1054  std::ostream &
1055  operator << (std::ostream &out, const MultipleAlignment &ma);
1056 
1057 } // end namespace
1058 
1059 #endif // LOCARNA_MULTIPLE_ALIGNMENT_HH
void reverse()
reverse string
Definition: string1.hh:117
size_type size() const
Size / Number of rows.
Definition: multiple_alignment.hh:308
AliColumn(const MultipleAlignment &ma, size_type col_index)
Construct from multiple alignment column.
Definition: multiple_alignment.hh:286
void operator+=(const AliColumn &c)
Append a column.
Definition: multiple_alignment.cc:1173
static size_t num_of_annotypes()
number of annotation types
Definition: multiple_alignment.hh:132
std::ostream & operator<<(std::ostream &out, AlignerRestriction r)
Definition: aligner_restriction.hh:113
bool has_annotation(const AnnoType::type &annotype) const
Definition: multiple_alignment.hh:604
size_type num_of_rows() const
Number of rows of multiple alignment.
Definition: multiple_alignment.hh:558
pair of vector of alignment edges
Definition: alignment.hh:63
file format type for multiple alignments
Definition: multiple_alignment.hh:73
void prepend(const SeqEntry &seqentry)
Prepend sequence entry.
Definition: multiple_alignment.cc:1165
void normalize_rna_symbols()
normalize rna symbols
Definition: multiple_alignment.cc:585
bool operator==(const TaintedInftyInt &x, const TaintedInftyInt &y)
Definition: infty_int.hh:598
Definition: multiple_alignment.hh:96
static size_t size()
size of enum
Definition: multiple_alignment.hh:106
const SeqEntry & seqentry(const std::string &name) const
Access name/sequence pair by name.
Definition: multiple_alignment.hh:687
size_type pos_type
type of a sequence position
Definition: aux.hh:97
const std::string & description() const
(read-only) access to description
Definition: multiple_alignment.hh:214
std::string consensus_sequence() const
Consensus sequence of multiple alignment.
Definition: multiple_alignment.cc:990
type
inner type
Definition: multiple_alignment.hh:75
A row in a multiple alignment.
Definition: multiple_alignment.hh:144
bool is_proper() const
Test whether alignment is proper.
Definition: multiple_alignment.cc:608
bool contains(std::string name) const
Test whether name exists.
Definition: multiple_alignment.cc:622
SeqEntry(const std::string &name, const std::string &description, const std::string &seq)
Construct from strings name, description and seq.
Definition: multiple_alignment.hh:187
AliColumn column(size_type col_index) const
Access alignment column.
Definition: multiple_alignment.hh:774
fasta file format
Definition: multiple_alignment.hh:79
virtual ~MultipleAlignment()
virtual destructor
Definition: multiple_alignment.cc:271
void set_seq(const string1 &seq)
write access to seq
Definition: multiple_alignment.hh:268
MultipleAlignment()
Construct empty.
Definition: multiple_alignment.cc:54
(extended) clustal file format
Definition: multiple_alignment.hh:78
double avg_deviation_score(const MultipleAlignment &ma) const
Average deviation score.
Definition: multiple_alignment.cc:745
const SequenceAnnotation & annotation(const AnnoType::type &annotype) const
Read access of annotation by prefix.
Definition: multiple_alignment.cc:595
Definition: aligner.cc:17
bool checkAlphabet(const Alphabet< char > &alphabet) const
check character constraints
Definition: multiple_alignment.cc:1148
void init(const AlignmentEdges &edges, const Sequence &seqA, const Sequence &seqB, bool special_gap_symbols)
Initialize from alignment edges and sequences.
Definition: multiple_alignment.cc:209
size_type index(const std::string &name) const
Access index by name.
Definition: multiple_alignment.hh:662
double cmfinder_realignment_score(const MultipleAlignment &ma) const
Cmfinder realignment score of a multiple alignment to a reference alignment.
Definition: multiple_alignment.cc:829
std::vector< SeqEntry >::const_iterator const_iterator
const iterator of sequence entries
Definition: multiple_alignment.hh:435
void write_debug(std::ostream &out=std::cout) const
Print contents of object to stream.
Definition: multiple_alignment.cc:982
double sps(const MultipleAlignment &ma, bool compalign=true) const
Sum-of-pairs score between a multiple alignment and a reference alignment.
Definition: multiple_alignment.cc:699
type
inner type
Definition: multiple_alignment.hh:91
A simple 1-based string.
Definition: string1.hh:22
size_t size_type
size type
Definition: multiple_alignment.hh:68
size_type deviation(const MultipleAlignment &ma) const
Deviation of a multiple alignment from a reference alignment.
Definition: multiple_alignment.cc:675
std::ostream & write_name_sequence_line(std::ostream &out, const std::string &name, const std::string &sequence, size_t namewidth) const
Write formatted line of name and sequence.
Definition: multiple_alignment.cc:1020
SeqEntry(const std::string &name, const string1 &seq)
Construct from strings name and 1-based string seq.
Definition: multiple_alignment.hh:176
read only proxy class representing a column of the alignment
Definition: multiple_alignment.hh:276
const Sequence & as_sequence() const
"cast" multiple alignment to sequence
Definition: multiple_alignment.cc:274
void set_annotation(const AnnoType::type &annotype, const SequenceAnnotation &annotation)
Write access to annotation.
Definition: multiple_alignment.hh:592
std::ostream & write(std::ostream &out, FormatType::type format=MultipleAlignment::FormatType::CLUSTAL) const
Write alignment to stream.
Definition: multiple_alignment.cc:1134
const_iterator begin() const
Begin for read-only traversal of name/sequence pairs.
Definition: multiple_alignment.hh:631
const string1 & seq() const
(read-only) access to seq
Definition: multiple_alignment.hh:218
Definition: multiple_alignment.hh:99
pos_type length() const
Length of multiple alignment.
Definition: multiple_alignment.hh:624
void append(const SeqEntry &seqentry)
Append sequence entry.
Definition: multiple_alignment.cc:1160
MultipleAlignment::size_type size_type
size type
Definition: multiple_alignment.hh:146
static size_t size()
size of enum
Definition: multiple_alignment.hh:84
const std::string & name() const
(read-only) access to name
Definition: multiple_alignment.hh:210
void reverse()
reverse sequence
Definition: multiple_alignment.hh:253
void reverse()
reverse the multiple alignment
Definition: multiple_alignment.cc:1141
void push_back(char c)
push back character
Definition: string1.hh:127
type of sequence annotation. enumerates legal annotation types
Definition: multiple_alignment.hh:89
Represents a structure-annotated sequence alignment.
Definition: alignment.hh:87
consensus structure annotation (consensus structure)
Definition: multiple_alignment.hh:93
const SeqEntry & seqentry(size_type index) const
Access name/sequence pair by index.
Definition: multiple_alignment.hh:676
Annotation of a sequence.
Definition: sequence_annotation.hh:24
pp format
Definition: multiple_alignment.hh:77
SeqEntry(const std::string &name, const std::string &description, const string1 &seq)
Construct from strings name, description and 1-based string seq.
Definition: multiple_alignment.hh:200
Represents a multiple alignment.
Definition: multiple_alignment.hh:65
bool empty() const
Emptiness check.
Definition: multiple_alignment.hh:571
std::pair< pos_type, pos_type > pos_pair_t
pair of positions
Definition: multiple_alignment.hh:148
stockholm file format
Definition: multiple_alignment.hh:76
"Sequence View" of multiple alignment as array of column vectors
Definition: sequence.hh:29
SeqEntry(const std::string &name, const std::string &seq)
Construct from strings name and seq.
Definition: multiple_alignment.hh:164
void push_back(char c)
append character to sequence
Definition: multiple_alignment.hh:262
const_iterator end() const
End for read-only traversal of name/sequence pairs.
Definition: multiple_alignment.hh:640