Other formats:
BibTeX
LaTeX
RIS
% Conference paper in the RASLAN 2012 workshop proceedings.
% Editors are listed in "Last, First" form and joined with " and ";
% percent signs in the title are escaped so they survive LaTeX processing.
@inproceedings{1075316,
  author       = {Sojka, Petr},
  title        = {Segmentation from 97\% to 100\%: Is It Time for Some Linguistics?},
  booktitle    = {Sixth Workshop on Recent Advances in Slavonic Natural Languages Processing, RASLAN 2012},
  editor       = {Horák, Aleš and Rychlý, Pavel},
  edition      = {první},
  publisher    = {Tribun EU},
  address      = {Brno},
  location     = {Brno},
  year         = {2012},
  pages        = {121--131},
  isbn         = {978-80-263-0313-8},
  language     = {eng},
  howpublished = {tištěná verze "print"},
  keywords     = {competing patterns;segmentation;hyphenation;NP problems;pattern generation;patgen;context-sensitive patterns;machine learning;natural language engineering;EuDML},
  url          = {http://www.fi.muni.cz/usr/sojka/presentations/sojka-raslan-pres2012.pdf},
}
TY  - JOUR
ID  - 1075316
AU  - Sojka, Petr
PY  - 2012
TI  - Segmentation from 97% to 100%: Is It Time for Some Linguistics?
PB  - Tribun EU
CY  - Brno
SN  - 9788026303138
KW  - competing patterns;segmentation;hyphenation;NP problems;pattern generation;patgen;context-sensitive patterns;machine learning;natural language engineering;EuDML
UR  - http://www.fi.muni.cz/usr/sojka/presentations/sojka-raslan-pres2012.pdf
L2  - http://www.fi.muni.cz/usr/sojka/papers/sojka-raslan2012.pdf
N2  - Many tasks in natural language processing (NLP) require \emph{segmentation} algorithms: segmentation of paragraph into sentences, segmentation of sentences into words is needed in languages like Chinese or Thai, segmentation of words into syllables (\emph{hyphenation}) or into morphological parts (e.g.\ getting word stem for indexing), and many other tasks (e.g.\ tagging) could be formulated as segmentation problems. We evaluate methodology of using \emph{competing patterns} for these tasks and decide on the complexity of creation of space-optimal (minimal) patterns that completely (100\,\%) implement the segmentation task. We formally define this task and prove that it is in the class of \emph{non-polynomial} optimization problems. However, finding space-efficient competing patterns for real NLP tasks is feasible and gives efficient scalable solutions of segmentation task: segmentation is done in \emph{constant} time with respect to the size of segmented dictionary. Constant time of access to segmentations makes competing patterns attractive data structure for many NLP tasks.
ER  - 
SOJKA, Petr. Segmentation from 97\%{} to 100\%{}: Is It Time for Some Linguistics? In Aleš Horák, Pavel Rychlý. \textit{Sixth Workshop on Recent Advances in Slavonic Natural Languages Processing, RASLAN 2012}. první. Brno: Tribun EU, 2012, p.~121--131. ISBN~978-80-263-0313-8.
|