% PhD thesis by Kousik Kundu (University of Freiburg, April 2015).
% The author is given in "Last, First" form and the month as the predefined
% macro (unbraced) so the bibliography style controls how both are rendered.
% NOTE(review): `abstract` and `user` are not standard thesis fields; standard
% styles are expected to ignore them -- confirm against the style in use.
@PhdThesis{Kundu_PhDThesis-2015,
  author   = {Kundu, Kousik},
  title    = {In Silico Prediction of Modular Domain-Peptide Interactions},
  school   = {University of Freiburg},
  year     = {2015},
  month    = apr,
  abstract = { Protein-protein interactions (PPIs) are one of the most essential cellular processes in eukaryotes that control many important biological activities, such as signal transduction, differentiation, growth, cell polarity, apoptosis etc. Many PPIs in cellular signaling are mediated by modular protein domains. Peptide recognition modules (PRMs) are an important subclass of modular protein domains that specifically recognize short linear peptides to facilitate their biological functions. Hence, it is important to understand the intriguing mechanisms by which hundreds of modular domains specifically bind to their target peptides in a complex cellular environment. In recent years, an unprecedented progress has been made in high-throughput technologies to describe the binding specificities of a number of modular protein domain families. Therefore, given the high binding specificity of PRMs, in silico prediction of their cognate partners is of great interest. In the first part of this thesis, we describe the main high-throughput technologies (microarray, phage display etc.) that are widely used for defining the binding specificity of PRMs. Currently, several computational methods have been published for the prediction of domain-peptide interactions. Here, we provide a comprehensive review on these methods and their applications. We also describe the major drawbacks (e.g., linearity problem, peptide alignment problem, data-imbalance problem etc.) of these existing tools that are successfully addressed in our study. In the second part of this thesis, we present three methods for predicting domain-peptide interactions mediated by three diverse PRM families (i.e., SH2, SH3, and PDZ domain). 
In order to circumvent the linearity problem, our methods use efficient kernel functions, which exploit higher-order dependencies between amino acid positions. For the prediction of SH2-peptide interactions, polynomial kernels are used to train the classifiers. In addition, we show how to handle the data-imbalance problem by using an efficient semi-supervised technique. For the prediction of SH3-peptide interactions, graph kernels are used for training the classifiers. Graph kernel feature representation allows us to include the physico-chemical properties of each amino acid in the peptides, which increases the generalization capacity of the classifier. By using this kernel function, we were able to eliminate the need of an initial peptide alignment, since the alignment of proline-rich peptides targeted by SH3 domains is a hard task and an error-prone alignment can severely affect the predictive performance of the classifier. Moreover, we developed a generative approach for refining the confidence negative data. In the case of PDZ-peptide interactions, we cluster hundreds of PDZ domains from different organisms, i.e., human, mouse, fly, and worm, based on their binding specificity, and build a single comprehensive model for a set of multiple PDZ domains. In this way, we show that the domain coverage can be increased by using an accurate clustering technique. For training the classifier, a Gaussian kernel function is used. Similar to SH2-peptide interactions, a semi-supervised technique was applied to generate high-confidence negative data. In the third part of this thesis, we describe the applications and performance evaluations of our methods. We compared our methods with several other existing tools and achieved a much higher performance, which was measured by sensitivity, specificity, precision, AUC PR, and AUC ROC. 
Our methods were further evaluated on various experimentally verified datasets and as a predictive result, they outperformed the state-of-the-art approaches. To uncover the novel and biologically relevant interactions, we performed a genome-wide prediction. Furthermore, a term-centric enrichment analysis has been performed to unveil the novel functionalities of the predicted interactions. In the last part of this thesis, we introduce a new and efficient web server, which contains three tools (i.e., SH2PepInt, SH3PepInt, and PDZPepInt), for the prediction of modular domain-peptide interactions. Currently, we offer 51 and 69 single domain models for SH2 and SH3 domains, respectively, and 43 multiple domain models, which cover 227 domains, for PDZ domains across several organisms. In summary, this thesis presents machine learning methods for predicting the binding peptides of three diverse PRM families where the training data was derived from various high-throughput experiments. Most importantly, this thesis addresses the major computational challenges in the field of modular domain-peptide interactions. We offer the largest set of models to date for the prediction of modular domain mediated interactions. },
  user     = {kousik}
}