Další formáty:
BibTeX
LaTeX
RIS
% Conference paper record (citation key 829305).
% The DOI is stored bare (no resolver prefix) and the page range uses "--";
% braces in title/booktitle protect words whose capitalisation must survive
% style recasing.
@inproceedings{829305,
  author       = {Řehůřek, Radim and Kolkus, Milan},
  title        = {Language Identification on the {Web}: Extending the Dictionary Method},
  booktitle    = {Computational Linguistics and Intelligent Text Processing, 10th International Conference, {CICLing} 2009, Proceedings},
  edition      = {první},
  publisher    = {Springer-Verlag},
  address      = {Mexico City, Mexico},
  location     = {Mexico City, Mexico},
  year         = {2009},
  pages        = {357--368},
  isbn         = {978-3-642-00381-3},
  doi          = {10.1007/978-3-642-00382-0_29},
  url          = {http://www.cicling.org/2009/},
  keywords     = {machine learning; language segmentation; language identification},
  howpublished = {tištěná verze "print"},
  language     = {eng},
}
TY - CONF ID - 829305 AU - Řehůřek, Radim AU - Kolkus, Milan PY - 2009 TI - Language Identification on the Web: Extending the Dictionary Method PB - Springer-Verlag CY - Mexico City, Mexico SN - 9783642003813 KW - machine learning KW - language segmentation KW - language identification UR - http://www.cicling.org/2009/ N2 - Automated language identification of written text is a well-established research domain that has received considerable attention in the past. By now, efficient and effective algorithms based on character $n$-grams are in use, mainly with identification based on Markov Processes or on character $n$-gram profiles. In this paper we investigate the limitations of these approaches when applied to real-world web pages. The challenges to be overcome include language identification on very short texts, correctly handling texts of unknown language and texts comprised of multiple languages. We propose and evaluate a new method, which constructs language models based on word relevance and addresses these limitations. We also extend our method to allow us to efficiently and automatically segment the input text into blocks of individual languages, in case of multiple-language documents. ER -
ŘEHŮŘEK, Radim a Milan KOLKUS. Language Identification on the Web: Extending the Dictionary Method. In \textit{Computational Linguistics and Intelligent Text Processing, 10th International Conference, CICLing 2009, Proceedings}. první. Mexico City, Mexico: Springer-Verlag, 2009, s.~357--368. ISBN~978-3-642-00381-3. Dostupné z: https://dx.doi.org/10.1007/978-3-642-00382-0\_{}29.
|