Other formats:
BibTeX
LaTeX
RIS
% Journal article: Nevěřilová (2018), Computación y Sistemas, vol. 22, issue 3.
% The numeric citation key is kept as-is so existing citations keep resolving.
% Issue number, page range and publisher agree with the RIS and formatted
% citation records that accompany this entry.
@article{1533903,
  author    = {Nevěřilová, Zuzana},
  title     = {Discovering Continuous Multi-word Expressions in {Czech}},
  journal   = {Computación y Sistemas},
  volume    = {22},
  number    = {3},
  pages     = {845--852},
  year      = {2018},
  publisher = {Centro de Investigación en Computación},
  address   = {Mexico},
  issn      = {1405-5546},
  doi       = {10.13053/CyS-22-3-3022},
  url       = {http://doi.org/10.13053/CyS-22-3-3022},
  keywords  = {Multiword expression; Multi-word expression; MWE; MWE discovery; inter-lingual homographs},
  language  = {eng},
}
TY  - JOUR
ID  - 1533903
AU  - Nevěřilová, Zuzana
PY  - 2018
TI  - Discovering Continuous Multi-word Expressions in Czech
JF  - Computación y Sistemas
VL  - 22
IS  - 3
SP  - 845
EP  - 852
PB  - Centro de Investigación en Computación
SN  - 14055546
KW  - Multiword expression
KW  - Multi-word expression
KW  - MWE
KW  - MWE discovery
KW  - inter-lingual homographs
UR  - http://doi.org/10.13053/CyS-22-3-3022
L2  - http://doi.org/10.13053/CyS-22-3-3022
N2  - Multi-word expressions frequently cause incorrect annotations in corpora, since they often contain foreign words or syntactic anomalies. In case of foreign material, the annotation quality depends on whether the correct language of the sequence is detected. In case of inter-lingual homographs, this problem becomes difficult. In the previous work, we created a dataset of Czech continuous multi-word expressions (MWEs). The candidates were discovered automatically from Czech web corpus considering their orthographic variability. The candidates were classified and annotated manually. Afterwards, the dataset was extended automatically by generating all word forms of those MWEs that were annotated as nouns. In this work, we used the dataset as positive examples, we filtered out negative examples from the MWE candidates. We trained a classifier with mean accuracy 92.7%. We have shown that the combined approach slightly outperforms approaches concerning only association measures mainly on MWEs containing inter-lingual homographs and out-of-vocabulary words. The discovery methods can be applied to other languages which encounter orthographic variability in web corpora.
ER  - 
NEVĚŘILOVÁ, Zuzana. Discovering Continuous Multi-word Expressions in Czech. \textit{Computación y Sistemas}. Mexico: Centro de Investigación en Computación, 2018, vol.~22, No~3, p.~845--852. ISSN~1405-5546. Available from: https://dx.doi.org/10.13053/CyS-22-3-3022.
|