Další formáty:
BibTeX
LaTeX
RIS
@comment{Journal article record 827749. Issue and place use the standard BibTeX fields (number, address); pages and publisher are taken from the RIS export of the same record.}
@article{827749,
  author    = {Pomikálek, Jan and Rychlý, Pavel and Kilgarriff, Adam},
  title     = {Scaling to Billion-plus Word Corpora},
  journal   = {Advances in Computational Linguistics},
  volume    = {41},
  number    = {zima 2009},
  pages     = {3--13},
  publisher = {Instituto Politécnico Nacional},
  address   = {Mexiko},
  year      = {2009},
  issn      = {1870-4069},
  language  = {eng},
  keywords  = {word corpora; web as corpus; duplicate detection},
}
TY  - JOUR
ID  - 827749
AU  - Pomikálek, Jan
AU  - Rychlý, Pavel
AU  - Kilgarriff, Adam
PY  - 2009
TI  - Scaling to Billion-plus Word Corpora
JF  - Advances in Computational Linguistics
VL  - 41
IS  - zima 2009
SP  - 3
EP  - 13
PB  - Instituto Politécnico Nacional
SN  - 18704069
KW  - word corpora
KW  - web as corpus
KW  - duplicate detection
N2  - Most phenomena in natural languages are distributed in accordance with Zipf's law, so many words, phrases and other items occur rarely and we need very large corpora to provide evidence about them. Previous work shows that it is possible to create very large (multi-billion word) corpora from the web. The usability of such corpora is often limited by duplicate contents and a lack of efficient query tools. This paper describes BiWeC, a Big Web Corpus of English texts currently comprising 5.5b words fully processed, and with a target size of 20b. We present a method for detecting near-duplicate text documents in multi-billion-word text collections and describe how one corpus query tool, the Sketch Engine, has been re-engineered to efficiently encode, process and query such corpora on low-cost hardware.
ER  - 
POMIKÁLEK, Jan, Pavel RYCHLÝ a Adam KILGARRIFF. Scaling to Billion-plus Word Corpora. \textit{Advances in Computational Linguistics}. Mexiko: Instituto Politécnico Nacional, 2009, roč.~41, zima 2009, s.~3--13, 14 s. ISSN~1870-4069.
|