% CLAUS-Report no. 78. MONTH uses the standard BibTeX month macro (unquoted),
% and the proper names in TITLE are brace-protected so that styles applying
% sentence casing keep their capital letters.
@TechReport{Samuelsson:1996_2,
AUTHOR = {Samuelsson, Christer},
TITLE = {Relating {Turing's} Formula and {Zipf's} Law},
YEAR = {1996},
MONTH = jun,
NUMBER = {78},
ADDRESS = {Saarbrücken},
TYPE = {CLAUS-Report},
INSTITUTION = {Universität des Saarlandes},
URL = {ftp://ftp.coli.uni-sb.de/pub/coli/claus/claus78.ps},
ABSTRACT = {A general, practical method for handling sparse data that avoids held-out data and iterative reestimation is derived from first principles. It has been tested on a part-of-speech tagging task and outperformed linear (deleted) interpolation even when the latter used a globally optimal parameter setting determined a posteriori. An asymptote is derived from Turing's local reestimation formula for population frequencies, and a local reestimation formula is derived from Zipf's law for the asymptotic behavior of population frequencies. The two are shown to be qualitatively different asymptotically, but nevertheless to be instances of a common class of reestimation-formula-asymptote pairs, in which they constitute the upper and lower bounds of the convergence region of the cumulative of the frequency function, as rank tends to infinity. The results demonstrate that Turing's formula is qualitatively different from the various extensions to Zipf's law, and suggest that it smooths the frequency estimates towards a geometric distribution.},
ANNOTE = {COLIURL : Samuelsson:1996:RTFb.pdf Samuelsson:1996:RTFb.ps}
}
|