Other formats:
BibTeX
LaTeX
RIS
@inproceedings{2273418,
  author    = {Messina, Nicola and Sedmidubský, Jan and Falchi, Fabrizio and Rebok, Tomáš},
  title     = {Text-to-Motion Retrieval: Towards Joint Understanding of Human Motion Data and Natural Language},
  booktitle = {46th International Conference on Research and Development in Information Retrieval (SIGIR)},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2023},
  pages     = {2420--2425},
  isbn      = {978-1-4503-9408-6},
  doi       = {10.1145/3539618.3592069},
  url       = {https://doi.org/10.1145/3539618.3592069},
  keywords  = {human motion data;skeleton sequences;CLIP;BERT;deep language models;ViViT;motion retrieval;cross-modal retrieval},
  note      = {Best Short Paper Award Honorable Mention},
  language  = {eng},
}
TY  - CONF
ID  - 2273418
AU  - Messina, Nicola
AU  - Sedmidubský, Jan
AU  - Falchi, Fabrizio
AU  - Rebok, Tomáš
PY  - 2023
TI  - Text-to-Motion Retrieval: Towards Joint Understanding of Human Motion Data and Natural Language
T2  - 46th International Conference on Research and Development in Information Retrieval (SIGIR)
PB  - Association for Computing Machinery
CY  - New York, NY, USA
SP  - 2420
EP  - 2425
SN  - 9781450394086
DO  - 10.1145/3539618.3592069
N1  - Best Short Paper Award Honorable Mention
KW  - human motion data
KW  - skeleton sequences
KW  - CLIP
KW  - BERT
KW  - deep language models
KW  - ViViT
KW  - motion retrieval
KW  - cross-modal retrieval
UR  - https://doi.org/10.1145/3539618.3592069
N2  - Due to recent advances in pose-estimation methods, human motion can be extracted from a common video in the form of 3D skeleton sequences. Despite wonderful application opportunities, effective and efficient content-based access to large volumes of such spatio-temporal skeleton data still remains a challenging problem. In this paper, we propose a novel content-based text-to-motion retrieval task, which aims at retrieving relevant motions based on a specified natural-language textual description. To define baselines for this uncharted task, we employ the BERT and CLIP language representations to encode the text modality and successful spatio-temporal models to encode the motion modality. We additionally introduce our transformer-based approach, called Motion Transformer (MoT), which employs divided space-time attention to effectively aggregate the different skeleton joints in space and time. Inspired by the recent progress in text-to-image/video matching, we experiment with two widely-adopted metric-learning loss functions. Finally, we set up a common evaluation protocol by defining qualitative metrics for assessing the quality of the retrieved motions, targeting the two recently-introduced KIT Motion-Language and HumanML3D datasets. The code for reproducing our results is available here: https://github.com/mesnico/text-to-motion-retrieval.
ER  - 
MESSINA, Nicola, Jan SEDMIDUBSKÝ, Fabrizio FALCHI and Tomáš REBOK. Text-to-Motion Retrieval: Towards Joint Understanding of Human Motion Data and Natural Language. Online. In \textit{46th International Conference on Research and Development in Information Retrieval (SIGIR)}. New York, NY, USA: Association for Computing Machinery, 2023, p.~2420-2425. ISBN~978-1-4503-9408-6. Available from: https://dx.doi.org/10.1145/3539618.3592069.
|