% Rebholz-Schuhmann et al. 2013, J Biomed Semantics 4(1):28.
% Journal article comparing gene/protein name taggers against gold
% standard corpora and lexical resources (see the abstract field).
@article{Rebholz-Schuhmann_Kafkas_Kim-Evalu_gold_stand-2013,
  author   = {Rebholz-Schuhmann, Dietrich and Kafkas, Senay and Kim,
              Jee-Hyub and Li, Chen and Jimeno Yepes, Antonio and
              Hoehndorf, Robert and Backofen, Rolf and Lewin, Ian},
  title    = {Evaluating gold standard corpora against gene/protein
              tagging solutions and lexical resources},
  journal  = {J Biomed Semantics},
  year     = {2013},
  volume   = {4},
  number   = {1},
  pages    = {28},
  user     = {backofen},
  pmid     = {24112383},
  doi      = {10.1186/2041-1480-4-28},
  issn     = {2041-1480},
  abstract = {MOTIVATION: The identification of protein and gene names
              (PGNs) from the scientific literature requires semantic
              resources: Terminological and lexical resources deliver the
              term candidates into PGN tagging solutions and the gold
              standard corpora (GSC) train them to identify term
              parameters and contextual features. Ideally all three
              resources, i.e. corpora, lexica and taggers, cover the same
              domain knowledge, and thus support identification of the
              same types of PGNs and cover all of them. Unfortunately,
              none of the three serves as a predominant standard and for
              this reason it is worth exploring, how these three resources
              comply with each other. We systematically compare different
              PGN taggers against publicly available corpora and analyze
              the impact of the included lexical resource in their
              performance. In particular, we determine the performance
              gains through false positive filtering, which contributes to
              the disambiguation of identified PGNs. RESULTS: In general,
              machine learning approaches (ML-Tag) for PGN tagging show
              higher F1-measure performance against the BioCreative-II and
              Jnlpba GSCs (exact matching), whereas the lexicon based
              approaches (LexTag) in combination with disambiguation
              methods show better results on FsuPrge and PennBio. The
              ML-Tag solutions balance precision and recall, whereas the
              LexTag solutions have different precision and recall
              profiles at the same F1-measure across all corpora. Higher
              recall is achieved with larger lexical resources, which also
              introduce more noise (false positive results). The ML-Tag
              solutions certainly perform best, if the test corpus is from
              the same GSC as the training corpus. As expected, the false
              negative errors characterize the test corpora and - on the
              other hand - the profiles of the false positive mistakes
              characterize the tagging solutions. Lex-Tag solutions that
              are based on a large terminological resource in combination
              with false positive filtering produce better results, which,
              in addition, provide concept identifiers from a knowledge
              source in contrast to ML-Tag solutions. CONCLUSION: The
              standard ML-Tag solutions achieve high performance, but not
              across all corpora, and thus should be trained using several
              different corpora to reduce possible biases. The LexTag
              solutions have different profiles for their precision and
              recall performance, but with similar F1-measure. This result
              is surprising and suggests that they cover a portion of the
              most common naming standards, but cope differently with the
              term variability across the corpora. The false positive
              filtering applied to LexTag solutions does improve the
              results by increasing their precision without compromising
              significantly their recall. The harmonisation of the
              annotation schemes in combination with standardized lexical
              resources in the tagging solutions will enable their
              comparability and will pave the way for a shared standard.}
}