% Rose et al. (2011), Bioinformatics 27(14): splice-site-based prediction of
% coding and non-coding human transcripts.  "user" and "pmid" are local
% bookkeeping fields that standard styles ignore; "eissn" holds the
% electronic ISSN so that "issn" is defined only once.
@Article{Rose:Hiller:Schutt:Compu_disco_human:2011,
  author   = {Rose, Dominic and Hiller, Michael and Schutt, Katharina and
              Hackerm{\"u}ller, J{\"o}rg and Backofen, Rolf and
              Stadler, Peter F.},
  title    = {Computational discovery of human coding and non-coding
              transcripts with conserved splice sites},
  journal  = {Bioinformatics},
  year     = {2011},
  volume   = {27},
  number   = {14},
  pages    = {1894--1900},
  user     = {dominic},
  pmid     = {21622663},
  doi      = {10.1093/bioinformatics/btr314},
  issn     = {1367-4803},
  eissn    = {1367-4811},
  abstract = {MOTIVATION: Long non-coding RNAs (lncRNAs) resemble
              protein-coding mRNAs but do not encode proteins. Most
              lncRNAs are under lower sequence constraints than
              protein-coding genes and lack conserved secondary
              structures, making it hard to predict them computationally.
              RESULTS: We introduce an approach to predict spliced lncRNAs
              in vertebrate genomes combining comparative genomics and
              machine learning. It is based on detecting signatures of
              characteristic splice site evolution in vertebrate whole
              genome alignments. First, we predict individual splice
              sites, then assemble compatible sites into exon candidates,
              and finally predict multi-exon transcripts. Using a novel
              method to evaluate typical splice site substitution patterns
              that explicitly takes the species phylogeny into account, we
              show that individual splice sites can be accurately
              predicted. Since our approach relies only on predicted
              splice sites, it can uncover both coding and non-coding
              exons. We show that our predicted exons and partial
              transcripts are mostly non-coding and lack conserved
              secondary structures. These exons are of particular
              interest, since existing computational approaches cannot
              detect them. Transcriptome sequencing data indicate
              tissue-specific expression patterns of predicted exons and
              there is evidence that increasing sequencing depth and
              breadth will validate additional predictions. We also found
              a significant enrichment of predicted exons that form
              multi-exon transcript parts, and we experimentally validate
              such a novel multi-exon gene. Overall, we obtain 336 novel
              multi-exon transcript predictions from human intergenic
              regions. Our results indicate the existence of novel human
              transcripts that are conserved in evolution and our approach
              contributes to the completion of the human transcript
              catalog. AVAILABILITY AND IMPLEMENTATION: Predicted human
              splice sites, exons and gene structures together with a Perl
              implementation of the tree-based log-odds scoring and a
              supplementary PDF file containing additional figures and
              tables are available at:
              http://www.bioinf.uni-leipzig.de/publications/supplements/10-010.
              The five experimentally confirmed partial transcript
              isoforms have been deposited in GenBank under accession
              numbers HM587422-HM587426.}
}