Další formáty:
BibTeX
LaTeX
RIS
% Conference paper in the LREC'12 proceedings.
% Editors are stored in "Last, First" form so each name parses unambiguously;
% role annotations such as "Conference Chair" are not part of a name and are omitted.
% The conference place is kept in "address" (as exported) and mirrored in "venue"
% rather than in "location", which biblatex treats as an alias of "address".
@inproceedings{991165,
  author       = {Pomikálek, Jan and Rychlý, Pavel and Jakubíček, Miloš},
  title        = {Building a 70 billion word corpus of {English} from {ClueWeb}},
  booktitle    = {Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC'12})},
  editor       = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Dogan, Mehmet Ugur and Maegaard, Bente and Mariani, Joseph and Odijk, Jan and Piperidis, Stelios},
  publisher    = {European Language Resources Association (ELRA)},
  address      = {Istanbul, Turkey},
  venue        = {Istanbul, Turkey},
  year         = {2012},
  pages        = {502--506},
  isbn         = {978-2-9517408-7-7},
  url          = {http://nlp.fi.muni.cz/publications/lrec2012_xpomikal_pary_xjakub/lrec2012.pdf},
  language     = {eng},
  keywords     = {corpus; clueweb; English; encoding; word sketch},
  howpublished = {tištěná verze "print"},
}
TY  - CONF
ID  - 991165
AU  - Pomikálek, Jan
AU  - Rychlý, Pavel
AU  - Jakubíček, Miloš
PY  - 2012
TI  - Building a 70 billion word corpus of English from ClueWeb
PB  - European Language Resources Association (ELRA)
CY  - Istanbul, Turkey
SN  - 9782951740877
KW  - corpus
KW  - clueweb
KW  - English
KW  - encoding
KW  - word sketch
UR  - http://nlp.fi.muni.cz/publications/lrec2012_xpomikal_pary_xjakub/lrec2012.pdf
L2  - http://nlp.fi.muni.cz/publications/lrec2012_xpomikal_pary_xjakub/lrec2012.pdf
N2  - This work describes the process of creation of a 70 billion word text corpus of English. We used an existing language resource, namely the ClueWeb09 dataset, as source for the corpus data. Processing such a vast amount of data presented several challenges, mainly associated with pre-processing (boilerplate cleaning, text de-duplication) and post-processing (indexing for efficient corpus querying using the CQL – Corpus Query Language) steps. In this paper we explain how we tackled them: we describe the tools used for boilerplate cleaning (jusText) and for de-duplication (onion) that was performed not only on full (document-level) duplicates but also on the level of near-duplicate texts. Moreover we show the impact of each of the performed pre-processing steps on the final corpus size. Furthermore we show how effective parallelization of the corpus indexation procedure was employed within the Manatee corpus management system and during computation of word sketches (one-page, automatic, corpus-derived summaries of a word’s grammatical and collocational behaviour) from the resulting corpus.
ER  - 
POMIKÁLEK, Jan, Pavel RYCHLÝ a Miloš JAKUBÍČEK. Building a 70 billion word corpus of English from ClueWeb. In Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Mehmet Ugur Dogan and Bente Maegaard and Joseph Mariani and Jan Odijk and Stelios Piperidis. \textit{Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12)}. Istanbul, Turkey: European Language Resources Association (ELRA), 2012, s.~502--506. ISBN~978-2-9517408-7-7.
|