LocARNA-1.8.11
sparsification_mapper.hh
1 #ifndef SPARSIFICATION_MAPPER_HH
2 #define SPARSIFICATION_MAPPER_HH
3 
4 #ifdef HAVE_CONFIG_H
5 # include <config.h>
6 #endif
7 
8 #include <iostream>
9 #include <limits>
10 
11 #include "aux.hh"
12 #include "basepairs.hh"
13 #include "ext_rna_data.hh"
14 
15 // use for type safe index_t
16 // #include "type_wrapper.hh"
17 
18 
19 namespace LocARNA {
20 
21  // print a pair
22  template <class T1, class T2>
23  std::ostream& operator << (std::ostream& out, const std::pair<T1,T2>& pair) {
24  return out << "(" << pair.first << "," << pair.second << ") ";
25  }
26 
27 
35 
36 public:
37  typedef BasePairs__Arc Arc;
38  typedef size_t ArcIdx;
39  typedef std::vector<ArcIdx> ArcIdxVec;
40  typedef pos_type matidx_t;
41  typedef pos_type seq_pos_t;
42 
43  // note: the type safe index_t breaks current code, since
44  // casts to size_t need to be explicite (using index_t's val()-method)
45  // //! type-safe index type this is useful to distinguish index type
46  // //! from other types that are defined as unsigned int
47  // typedef type_wrapper<size_t> index_t;
48 
49  typedef size_t index_t;
50 
52  struct info_for_pos{
53  seq_pos_t seq_pos;
54  bool unpaired;
55  ArcIdxVec valid_arcs;
56 
58  void reset(){
59  this->seq_pos=0;
60  this->valid_arcs.clear();
61  this->unpaired=false;
62  }
63  };
64 
65  typedef std::vector<info_for_pos> InfoForPosVec;
66 
67 private:
68 
69  const BasePairs &bps;
70  const ExtRnaData &rnadata;
71  const double prob_unpaired_in_loop_threshold;
72  const double prob_basepair_in_loop_threshold;
73  size_type max_info_vec_size;
74  std::vector<InfoForPosVec> info_valid_seq_pos_vecs;
77 
80  std::vector<std::vector<matidx_t> > valid_mat_pos_vecs_before_eq;
81 
84  std::vector<std::vector<ArcIdxVec > > left_adj_vec;
85 
87  void compute_mapping_idx_arcs();
88 
90  void compute_mapping_idx_left_ends();
91 
94  void iterate_left_adj_list(pos_type cur_left_end,
95  pos_type cur_pos,
96  const Arc *inner_arc,
97  info_for_pos &struct_pos);
98 
99  void valid_pos_external(pos_type cur_pos,const Arc *inner_arc, info_for_pos &struct_pos);
100 
101 
102 
103 public:
115  const ExtRnaData &rnadata_,
116  double prob_unpaired_in_loop_threshold_,
117  double prob_basepair_in_loop_threshold_,
118  bool index_left_ends):
119  bps(bps_),
120  rnadata(rnadata_),
121  prob_unpaired_in_loop_threshold(prob_unpaired_in_loop_threshold_),
122  prob_basepair_in_loop_threshold(prob_basepair_in_loop_threshold_),
123  max_info_vec_size(0)
124  {
125  if(index_left_ends){
126  compute_mapping_idx_left_ends();
127  }
128  else{
129  compute_mapping_idx_arcs();
130  }
131  }
132 
137  {
138  return max_info_vec_size;
139  }
140 
146  const InfoForPosVec &
147  valid_seq_positions(index_t idx) const{
148  return info_valid_seq_pos_vecs.at(idx);
149  }
150 
157  const ArcIdxVec &
158  valid_arcs_right_adj(index_t idx, matidx_t pos) const {
159  return info_valid_seq_pos_vecs.at(idx).at(pos).valid_arcs;
160  }
161 
171  matidx_t
172  first_valid_mat_pos_before_eq(index_t index, seq_pos_t pos,
173  index_t left_end =
174  std::numeric_limits<index_t>::max()) const {
175  if (left_end == std::numeric_limits<index_t>::max())
176  left_end = index;
177  assert (pos >= left_end); //tocheck
178  return valid_mat_pos_vecs_before_eq.at(index).at(pos-left_end);
179  }
180 
190  inline
191  matidx_t first_valid_mat_pos_before(index_t index, seq_pos_t pos,
192  index_t left_end =
193  std::numeric_limits<index_t>::max()) const {
194  // if (left_end == std::numeric_limits<index_t>::max()) assert (pos > index);
195  return first_valid_mat_pos_before_eq(index, pos-1, left_end);
196  }
197 
204  inline
205  seq_pos_t get_pos_in_seq_new(index_t idx, matidx_t pos) const{
206  assert(pos<number_of_valid_mat_pos(idx));
207  return (info_valid_seq_pos_vecs.at(idx).at(pos).seq_pos);//+arc.left();
208  }
209 
215  size_type number_of_valid_mat_pos(index_t idx) const{
216  return info_valid_seq_pos_vecs.at(idx).size();
217  }
218 
226  bool pos_unpaired(index_t idx,matidx_t pos)const{
227  return info_valid_seq_pos_vecs.at(idx).at(pos).unpaired;
228  }
229 
241  bool matching_without_gap_possible(const Arc &arc, seq_pos_t pos)const{
242  const pos_type &mat_pos = first_valid_mat_pos_before(arc.idx(),pos,arc.left());
243  return get_pos_in_seq_new(arc.idx(),mat_pos)==pos-1;
244  }
245 
252  const ArcIdxVec &
253  valid_arcs_left_adj(const Arc &arc, seq_pos_t pos) const{
254  return left_adj_vec.at(arc.idx()).at(pos-arc.left());
255  }
256 
259  }
260 
268  matidx_t idx_geq(index_t index, seq_pos_t min_col, index_t left_end = std::numeric_limits<index_t>::max()) const{
269 
270  if (left_end == std::numeric_limits<index_t>::max())
271  left_end = index;
272 
273  size_t num_pos = number_of_valid_mat_pos(index);
274  seq_pos_t last_mapped_seq_pos = get_pos_in_seq_new(index,num_pos-1);
275 
276  // if the position min_col is smaller than or equal to the left end (first mapped position), return 0
277  if(min_col<=left_end) return 0;
278 
279  // if the position min_col is greater than the last mapped sequence position, the position does not exists
280  // return the number of valid positions
281  if(min_col>last_mapped_seq_pos) return num_pos;
282 
283  // the matrix position after the first valid position before the position min_col (first matrix position that
284  // stores a sequence position that is greater than or equal to min_col-1)
285  matidx_t idx_geq = first_valid_mat_pos_before_eq(index,min_col-1,left_end)+1;
286 
287  assert(get_pos_in_seq_new(index,idx_geq)>=min_col && !(get_pos_in_seq_new(index,idx_geq-1)>=min_col));
288 
289  return idx_geq;
290  }
291 
299  matidx_t idx_after_leq(index_t index, seq_pos_t max_col, index_t left_end = std::numeric_limits<index_t>::max()) const{
300 
301  if (left_end == std::numeric_limits<index_t>::max())
302  left_end = index;
303 
304  size_t num_pos = number_of_valid_mat_pos(index);
305  seq_pos_t last_mapped_seq_pos = get_pos_in_seq_new(index,num_pos-1);
306 
307  // if the position max_col is smaller than the left_end (first mapped position), return 0
308  if(max_col<left_end) return 0;
309 
310  // if the position max_col is greater than or equal to the last mapped sequence position,
311  // return the number of positions
312  if(max_col>=last_mapped_seq_pos) return num_pos;
313 
314  // the matrix position after the first matrix position before or equal the sequence position max_col
315  // (last matrix position that stores a sequence position that is lower than or equal max_col)
316  matidx_t idx_after_leq = first_valid_mat_pos_before_eq(index,max_col,left_end)+1;
317 
318  assert(get_pos_in_seq_new(index,idx_after_leq-1)<=max_col && !(get_pos_in_seq_new(index,idx_after_leq)<=max_col));
319 
320  return idx_after_leq;
321  }
322 
323 private:
324 
333  bool is_valid_arc(const Arc &inner_arc,const Arc &arc)const{
334  assert(inner_arc.left()>arc.left() && inner_arc.right()<arc.right());
335  return rnadata.arc_in_loop_prob(inner_arc.left(),inner_arc.right(),arc.left(),arc.right())>=prob_basepair_in_loop_threshold;
336  }
337 
345  bool is_valid_arc_external(const Arc &inner_arc)const{
346  return rnadata.arc_external_prob(inner_arc.left(),inner_arc.right())>=prob_basepair_in_loop_threshold;
347  }
348 
349 public:
358  bool is_valid_pos(const Arc &arc,seq_pos_t pos) const{
359  assert(arc.left()<pos && pos<arc.right());
360  return rnadata.unpaired_in_loop_prob(pos,arc.left(),arc.right())>=prob_unpaired_in_loop_threshold; //todo: additional variable for external case
361  }
362 
368  bool is_valid_pos_external(seq_pos_t pos) const{
369  return rnadata.unpaired_external_prob(pos)>=prob_unpaired_in_loop_threshold; //todo: additional variable for external case
370  }
371 };
372 
379 template <class T>
380 std::ostream& operator<<(std::ostream& out, const std::vector<T>& vec){
381  for(typename std::vector<T>::const_iterator it = vec.begin();it!=vec.end();it++){
382  out << *it << " ";
383  }
384  return out;
385 }
386 
393 std::ostream &operator << (std::ostream &out, const std::vector<SparsificationMapper::InfoForPosVec> &pos_vecs_);
394 
401 std::ostream &operator << (std::ostream &out, const SparsificationMapper::InfoForPosVec &pos_vec_);
402 
403 } //end namespace
404 
405 #endif // SPARSIFICATION_MAPPER_HH
ArcIdxVec valid_arcs
a vector of arcs with common right end from the sequence position
Definition: sparsification_mapper.hh:55
virtual ~SparsificationMapper()
class destructor
Definition: sparsification_mapper.hh:258
matidx_t idx_after_leq(index_t index, seq_pos_t max_col, index_t left_end=std::numeric_limits< index_t >::max()) const
Definition: sparsification_mapper.hh:299
size_t left() const
Definition: basepairs.hh:72
size_t right() const
Definition: basepairs.hh:80
double arc_external_prob(pos_type i, pos_type j) const
Get base pair in loop probability.
Definition: rna_data.cc:807
std::ostream & operator<<(std::ostream &out, AlignerRestriction r)
Definition: aligner_restriction.hh:113
SparsificationMapper(const BasePairs &bps_, const ExtRnaData &rnadata_, double prob_unpaired_in_loop_threshold_, double prob_basepair_in_loop_threshold_, bool index_left_ends)
Definition: sparsification_mapper.hh:114
size_type pos_type
type of a sequence position
Definition: aux.hh:97
pos_type seq_pos_t
type for a sequence position
Definition: sparsification_mapper.hh:41
represent sparsified data of RNA ensemble extended by in loop probabilities
Definition: ext_rna_data.hh:34
size_t size_type
general size type
Definition: aux.hh:94
size_type get_max_info_vec_size() const
Definition: sparsification_mapper.hh:136
const InfoForPosVec & valid_seq_positions(index_t idx) const
Definition: sparsification_mapper.hh:147
double arc_in_loop_prob(pos_type i, pos_type j, pos_type p, pos_type q) const
Get base pair in loop probability.
Definition: rna_data.cc:801
bool matching_without_gap_possible(const Arc &arc, seq_pos_t pos) const
Definition: sparsification_mapper.hh:241
std::vector< ArcIdx > ArcIdxVec
vector of arc indices
Definition: sparsification_mapper.hh:39
Definition: aligner.cc:17
bool unpaired
if true, the sequence position can occur unpaired
Definition: sparsification_mapper.hh:54
size_t index_t
type for an index
Definition: sparsification_mapper.hh:49
const ArcIdxVec & valid_arcs_right_adj(index_t idx, matidx_t pos) const
Definition: sparsification_mapper.hh:158
matidx_t first_valid_mat_pos_before_eq(index_t index, seq_pos_t pos, index_t left_end=std::numeric_limits< index_t >::max()) const
Definition: sparsification_mapper.hh:172
BasePairs__Arc Arc
type of arc
Definition: sparsification_mapper.hh:37
double unpaired_external_prob(pos_type k) const
Get base pair in loop probability.
Definition: rna_data.cc:824
bool pos_unpaired(index_t idx, matidx_t pos) const
Definition: sparsification_mapper.hh:226
Represents the mapping for sparsification.
Definition: sparsification_mapper.hh:34
seq_pos_t get_pos_in_seq_new(index_t idx, matidx_t pos) const
Definition: sparsification_mapper.hh:205
size_type number_of_valid_mat_pos(index_t idx) const
Definition: sparsification_mapper.hh:215
pos_type matidx_t
type for a matrix position
Definition: sparsification_mapper.hh:40
const ArcIdxVec & valid_arcs_left_adj(const Arc &arc, seq_pos_t pos) const
Definition: sparsification_mapper.hh:253
void reset()
resets the content of the struct
Definition: sparsification_mapper.hh:58
a struct to represent all necessary information for all valid sequence positions
Definition: sparsification_mapper.hh:52
Represents a base pair.
Definition: basepairs.hh:40
std::vector< info_for_pos > InfoForPosVec
vector of struct info_for_pos that is assigned to the index (either common left end or arc index) ...
Definition: sparsification_mapper.hh:65
size_t idx() const
Definition: basepairs.hh:88
bool is_valid_pos_external(seq_pos_t pos) const
Definition: sparsification_mapper.hh:368
size_t ArcIdx
type of arc index
Definition: sparsification_mapper.hh:38
Describes sequence and structure ensemble of an RNA.
Definition: basepairs.hh:107
bool is_valid_pos(const Arc &arc, seq_pos_t pos) const
Definition: sparsification_mapper.hh:358
seq_pos_t seq_pos
the sequence position
Definition: sparsification_mapper.hh:53
matidx_t first_valid_mat_pos_before(index_t index, seq_pos_t pos, index_t left_end=std::numeric_limits< index_t >::max()) const
Definition: sparsification_mapper.hh:191
matidx_t idx_geq(index_t index, seq_pos_t min_col, index_t left_end=std::numeric_limits< index_t >::max()) const
Definition: sparsification_mapper.hh:268
double unpaired_in_loop_prob(pos_type k, pos_type p, pos_type q) const
Get base pair in loop probability.
Definition: rna_data.cc:818