% pubman genre = conference-paper
% NOTE(review): `address` below carries the conference venue as exported, not a
% publisher location -- confirm whether the target style expects `venue` instead.
% NOTE(review): no `publisher` field was present in the export -- add one if known.
@inproceedings{item_3376883,
  author    = {Martin, Pierre-Etienne and Benois-Pineau, Jenny and P{\'e}teri, Renaud and Morlier, Julien},
  title     = {Three-stream {3D/1D} {CNN} for fine-grained action classification and segmentation in table tennis},
  booktitle = {{MMSports}'21: Proceedings of the 4th International Workshop on Multimedia Content Analysis in Sports},
  pages     = {35--41},
  address   = {Chengdu, China (Online)},
  year      = {2021},
  doi       = {10.1145/3475722.3482793},
  language  = {english},
  note      = {MMSports '21},
  abstract  = {This paper proposes a fusion method of modalities extracted from video
               through a three-stream network with spatio-temporal and temporal convolutions
               for fine-grained action classification in sport. It is applied to TTStroke-21
               dataset which consists of untrimmed videos of table tennis games. The goal is
               to detect and classify table tennis strokes in the videos, the first step of a
               bigger scheme aiming at giving feedback to the players for improving their
               performance. The three modalities are raw RGB data, the computed optical flow
               and the estimated pose of the player. The network consists of three branches
               with attention blocks. Features are fused at the latest stage of the network
               using bilinear layers. Compared to previous approaches, the use of three
               modalities allows faster convergence and better performances on both tasks:
               classification of strokes with known temporal boundaries and joint segmentation
               and classification. The pose is also further investigated in order to offer
               richer feedback to the athletes.},
}