Další formáty:
BibTeX
LaTeX
RIS
@inproceedings{2227162,
  author    = {Signoroni, Edoardo and Rychlý, Pavel},
  editor    = {Ojha, Atul Kr. and Liu, Chao-Hong and Vylomova, Ekaterina and Abbott, Jade and Washington, Jonathan and Oco, Nathaniel and Pirinen, Tommi A and Malykh, Valentin and Logacheva, Varvara and Zhao, Xiaobing},
  title     = {{HFT}: High Frequency Tokens for Low-Resource {NMT}},
  booktitle = {Proceedings of the Fifth Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2022)},
  publisher = {Association for Computational Linguistics},
  address   = {Gyeongju, Republic of Korea},
  year      = {2022},
  pages     = {56--63},
  url       = {https://aclanthology.org/2022.loresmt-1.8},
  language  = {eng},
  keywords  = {Machine Translation; Tokenization},
}
TY - CPAPER ID - 2227162 AU - Signoroni, Edoardo AU - Rychlý, Pavel PY - 2022 TI - HFT: High Frequency Tokens for Low-Resource NMT PB - Association for Computational Linguistics CY - Gyeongju, Republic of Korea SP - 56 EP - 63 KW - Machine Translation KW - Tokenization UR - https://aclanthology.org/2022.loresmt-1.8 N2 - Tokenization has been shown to impact the quality of downstream tasks, such as Neural Machine Translation (NMT), which is susceptible to out-of-vocabulary words and low frequency training data. Current state-of-the-art algorithms have been helpful in addressing the issues of out-of-vocabulary words, bigger vocabulary sizes and token frequency by implementing subword segmentation. We argue, however, that there is still room for improvement, in particular regarding low-frequency tokens in the training data. In this paper, we present “High Frequency Tokenizer”, or HFT, a new language-independent subword segmentation algorithm that addresses this issue. We also propose a new metric to measure the frequency coverage of a tokenizer’s vocabulary, based on a frequency rank weighted average of the frequency values of its items. We experiment with a diverse set of language corpora, vocabulary sizes, and writing systems and report improvements on both frequency statistics and on the average length of the output. We also observe a positive impact on downstream NMT. ER -
SIGNORONI, Edoardo a Pavel RYCHLÝ. HFT: High Frequency Tokens for Low-Resource NMT. Online. In Atul Kr. Ojha, Chao-Hong Liu, Ekaterina Vylomova, Jade Abbott, Jonathan Washington, Nathaniel Oco, Tommi A Pirinen, Valentin Malykh, Varvara Logacheva, Xiaobing Zhao. \textit{Proceedings of the Fifth Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2022)}. Gyeongju, Republic of Korea: Association for Computational Linguistics, 2022, s.~56--63. ISSN~2951-2093.
|