% pubman genre = conference-paper
% Workshop paper in the MMSports '21 proceedings (see doi).
% `address` holds the conference location, as exported; `series` carries the
% proceedings short name. The abstract text is unchanged apart from removal of
% HTML line-break residue left by the export.
@inproceedings{item_3376883,
  author    = {Martin, Pierre-Etienne and Benois-Pineau, Jenny and P{\'e}teri, Renaud and Morlier, Julien},
  title     = {Three-stream {3D/1D} {CNN} for fine-grained action classification and segmentation in table tennis},
  booktitle = {{MMSports'21}: Proceedings of the 4th International Workshop on Multimedia Content Analysis in Sports},
  series    = {{MMSports} '21},
  pages     = {35--41},
  publisher = {Association for Computing Machinery},
  address   = {Chengdu, China (Online)},
  year      = {2021},
  doi       = {10.1145/3475722.3482793},
  language  = {english},
  abstract  = {This paper proposes a fusion method of modalities extracted from video
               through a three-stream network with spatio-temporal and temporal convolutions
               for fine-grained action classification in sport. It is applied to TTStroke-21
               dataset which consists of untrimmed videos of table tennis games. The goal is
               to detect and classify table tennis strokes in the videos, the first step of a
               bigger scheme aiming at giving feedback to the players for improving their
               performance. The three modalities are raw RGB data, the computed optical flow
               and the estimated pose of the player. The network consists of three branches
               with attention blocks. Features are fused at the latest stage of the network
               using bilinear layers. Compared to previous approaches, the use of three
               modalities allows faster convergence and better performances on both tasks:
               classification of strokes with known temporal boundaries and joint segmentation
               and classification. The pose is also further investigated in order to offer
               richer feedback to the athletes.},
}