% Journal article entry. The citation key is kept unchanged so existing
% citations keep resolving. NOTE(review): the key contains 2008 while the
% year field says 2009 -- confirm which is intended before renaming anything.
% "user" is not a standard BibTeX field; standard styles ignore it.
@article{pudimat:schukat:2008:ijprai,
  author   = {Pudimat, Rainer and Backofen, Rolf and Schukat-Talamazzini, Ernst-G{\"u}nter},
  title    = {Fast Feature Subset Selection in Biological Sequence Analysis},
  journal  = {International Journal of Pattern Recognition and Artificial Intelligence},
  year     = {2009},
  volume   = {23},
  number   = {2},
  pages    = {191--207},
  abstract = {Motivation: Biological research produces a wealth of measured data. Neither it is easy for biologists to postulate hypotheses about the behaviour or structure of the observed entity because the relevant properties measured are not seen in the ocean of measurements. Nor it is easy to design machine learning algorithms to classify or cluster the data items for the same reason. Algorithms for automatically selecting a highly predictive subset of the measured features can help to overcome these difficulties. Results: We present an efficient feature selection strategy which can be applied to arbitrary feature selection problems. The core technique is a new method for estimating the quality of subsets from previously calculated qualities for smaller subsets by minimising the mean standard error of estimated values with an approach common to support vector machines. This method can be integrated in many feature subset search algorithms. We have applied it with sequential search algorithms and have been able to reduce the number of quality calculations for finding accurate feature subsets by about $70\%$. We show these improvements by applying our approach to the problem of finding highly predictive feature subsets for transcription factor binding sites.},
  user     = {rpudimat},
}