Other formats:
BibTeX
LaTeX
RIS
% Conference paper from the WAC7 workshop proceedings.
% The numeric citation key is kept as exported so existing \cite{991660} commands still resolve.
% Editors are written as separate names joined by "and"; a comma between them
% would be parsed as a single person in "Last, First" form.
% howpublished is retained verbatim from the export; it is not a standard field
% for this entry type, so most styles will presumably ignore it.
@inproceedings{991660,
  author       = {Suchomel, Vít and Pomikálek, Jan},
  title        = {Efficient Web Crawling for Large Text Corpora},
  editor       = {Kilgarriff, Adam and Sharoff, Serge},
  booktitle    = {Proceedings of the seventh Web as Corpus Workshop ({WAC7})},
  address      = {Lyon},
  pages        = {39--43},
  year         = {2012},
  url          = {http://sigwac.org.uk/raw-attachment/wiki/WAC7/wac7-proc.pdf},
  keywords     = {crawler, web crawling, corpus, web corpus, text corpus},
  language     = {english},
  howpublished = {elektronická verze "online"},
}
TY  - JOUR
ID  - 991660
AU  - Suchomel, Vít
AU  - Pomikálek, Jan
PY  - 2012
TI  - Efficient Web Crawling for Large Text Corpora
CY  - Lyon
KW  - crawler
KW  - web crawling
KW  - corpus
KW  - web corpus
KW  - text corpus
UR  - http://sigwac.org.uk/raw-attachment/wiki/WAC7/wac7-proc.pdf
N2  - Many researchers use texts from the web, an easy source of linguistic data in a great variety of languages. Building both large and good quality text corpora is the challenge we face nowadays. We describe how to deal with inefficient data downloading and how to focus crawling on text rich web domains. We present efficiency figures from crawling texts in American Spanish, Czech, Japanese, Russian, Tajik Persian, Turkish and the sizes of the resulting corpora. The idea has been successfully applied for building billions of words scale corpora in six languages. Texts in the Russian corpus, consisting of 20.2 billions tokens, were downloaded in just 13 days.
ER  - 
SUCHOMEL, Vít and Jan POMIKÁLEK. Efficient Web Crawling for Large Text Corpora. Online. In Adam Kilgarriff, Serge Sharoff. \textit{Proceedings of the seventh Web as Corpus Workshop (WAC7)}. Lyon, 2012, p.~39--43.
|