<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Towards Italian Sign Language Generation for digital humans</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Emanuele</forename><surname>Colonna</surname></persName>
							<email>emanuele.colonna@uniba.it</email>
							<affiliation key="aff0">
								<orgName type="department">Department of Computer Science</orgName>
								<orgName type="institution">University of Bari Aldo Moro</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Alessandro</forename><surname>Arezzo</surname></persName>
							<email>arezzo@quest-it.com</email>
							<affiliation key="aff1">
								<orgName type="institution">QuestIT S.r.l</orgName>
								<address>
									<settlement>Siena</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Domenico</forename><surname>Roberto</surname></persName>
							<email>d.roberto8@studenti.uniba.it</email>
							<affiliation key="aff0">
								<orgName type="department">Department of Computer Science</orgName>
								<orgName type="institution">University of Bari Aldo Moro</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">David</forename><surname>Landi</surname></persName>
							<email>d.landi@quest-it.com</email>
							<affiliation key="aff1">
								<orgName type="institution">QuestIT S.r.l</orgName>
								<address>
									<settlement>Siena</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Felice</forename><surname>Vitulano</surname></persName>
							<email>felice.vitulano@quest-it.com</email>
							<affiliation key="aff1">
								<orgName type="institution">QuestIT S.r.l</orgName>
								<address>
									<settlement>Siena</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Gennaro</forename><surname>Vessio</surname></persName>
							<email>gennaro.vessio@uniba.it</email>
							<affiliation key="aff0">
								<orgName type="department">Department of Computer Science</orgName>
								<orgName type="institution">University of Bari Aldo Moro</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Giovanna</forename><surname>Castellano</surname></persName>
							<email>giovanna.castellano@uniba.it</email>
							<affiliation key="aff0">
								<orgName type="department">Department of Computer Science</orgName>
								<orgName type="institution">University of Bari Aldo Moro</orgName>
								<address>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">Towards Italian Sign Language Generation for digital humans</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">3CB97FB638E705C625A61D763B8A5BEF</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T16:37+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>Sign language generation</term>
					<term>Human pose estimation</term>
					<term>Digital humans</term>
					<term>Inclusive technology</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>In the rapidly evolving field of human-computer interaction, the need for inclusive and accessible communication methods has become increasingly vital. This paper introduces an early exploration of Text-to-LIS, a new model designed to generate contextually accurate Italian Sign Language (LIS) gestures for digital humans. Our approach addresses the importance of non-verbal communication in virtual environments, focusing on enhancing interaction for the deaf and hard-of-hearing community. The core contribution of this work is developing an iterative framework that leverages a comprehensive multimodal dataset, integrating textual and audio inputs with visual data. Utilizing state-of-the-art deep learning algorithms and advanced human pose estimation techniques, the framework enables the progressive refinement of generated gestures, ensuring realism and contextual relevance. The potential applications of the Text-to-LIS model are wide-ranging, from improving accessibility in digital environments to supporting educational tools and promoting LIS in the digital age. The code is publicly available at: https://github.com/CarpiDiem98/text-to-lis/.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>The advancement of graphics and robotics technology has significantly contributed to the rise of virtual and socially intelligent agents, making them increasingly popular for human interaction. This progress has enabled the development of artificial agents with either virtual or physical embodiments, such as avatars or robots, capable of interacting with humans across diverse settings. Among these, digital humans are particularly impactful, replicating human form and behavior within virtual environments <ref type="bibr" target="#b1">[2]</ref>.</p><p>A key component of effective interaction with digital humans is nonverbal communication, which includes facial expressions, gestures, and body language <ref type="bibr" target="#b2">[3]</ref>. Gestures, especially co-speech gestures that accompany verbal communication, enhance these agents' realism and engagement. However, automatically generating natural and synchronized gestures remains a significant challenge due to the complexity and diversity of human nonverbal communication <ref type="bibr" target="#b3">[4]</ref>.</p><p>In this context, sign languages such as Italian Sign Language (LIS) introduce an even more complex dimension of nonverbal communication. Sign languages are not simply gestures but fully developed languages that serve as the primary means of communication for the deaf and hard-of-hearing community. This paper addresses the challenge of generating realistic LIS gestures for digital human agents, recognizing sign languages' critical role in communication and the unique needs of the deaf community.</p><p>Specifically, we propose a novel approach that employs an iterative refinement process, training a model on a comprehensive dataset of text and image pairs representing LIS signs (Fig. <ref type="figure" target="#fig_1">1</ref>). Our approach integrates textual descriptions and visual data to generate accurate and expressive LIS gestures. The  </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Text-to-LIS Pipeline</head><p>Step Iteration model has five main parts: a text encoder, which uses Transformers; a pose encoder, which handles poses; a pose-text encoder, which combines the two; a step encoder, which makes refinements at each step; and a projection module, which produces the final poses. This model captures linguistic and visual aspects of LIS signs. The iterative process begins with an initial generic pose and progresses through multiple steps. Advanced human pose estimation techniques serve as the ground truth for our dataset, allowing for the precise capture and translation of human body movements into 3D animations for virtual models. We present a robust solution for creating natural and coherent LIS gestures by combining textual and visual data. By advancing technology for LIS gesture generation, we aim to achieve several important goals. First, we seek to improve accessibility by generating accurate and natural LIS gestures, which can enhance communication tools for the deaf community, making digital content and interactions more accessible. Additionally, our work promotes LIS, a minority language facing challenges in preservation and promotion, by contributing to its digital representation and documentation, thus supporting its importance in the digital era. Another significant aim is to enhance education; accurate LIS gesture generation can serve as a valuable resource for educational tools, helping deaf individuals learn written Italian and aiding hearing individuals in acquiring LIS. Moreover, as virtual and augmented reality technologies become prominent, it is crucial to ensure that LIS users can fully participate in these digital environments, fostering inclusivity. Lastly, our model and dataset offer valuable resources for linguistic research, particularly for scholars studying the structure and patterns of LIS, thereby contributing to the broader understanding of sign languages.</p><p>The rest of this paper is structured as follows. Section 2 reviews the existing literature. Section 3 introduces the proposed dataset. Section 4 details the proposed Text-to-LIS model. Section 5 presents preliminary results and discusses future work directions.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Related work</head><p>Our Text-to-LIS model builds on several areas of research, including pose extraction, sign language datasets, gesture generation, and Italian Sign Language research. This section provides an overview of the relevant work in these fields.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Pose extraction</head><p>Pose extraction is essential for creating realistic digital humans, as it captures and translates human body movements into 3D animations. Using computer vision techniques, pose estimation methods infer human poses from images without requiring markers. These methods are typically categorized into whole-body and single-part estimations, each with specific challenges.</p><p>Several models have emerged to reconstruct human body posture from a single image. PIXIE <ref type="bibr" target="#b4">[5]</ref> generates complete 3D models even with challenging poses or incomplete body information. Hand4Whole <ref type="bibr" target="#b5">[6]</ref> simultaneously estimates both the full-body and hand poses, outperforming prior methods such as FrankMocap <ref type="bibr" target="#b6">[7]</ref> and PIXIE. PyMAF-X <ref type="bibr" target="#b7">[8]</ref> improves accuracy and speed, estimating SMPL-X parameters with detailed joint rotation and depth information. SMPL-X (Skinned Multi-Person Linear model with eXpressive hands and face) is a comprehensive 3D human body model that integrates detailed representations of the body, face, and hands <ref type="bibr" target="#b8">[9]</ref>. SMPL-X parameters are the values used to configure this model, including joint angles, body shape coefficients, and facial expression parameters.</p><p>Hand pose estimation has also advanced significantly. Early approaches like those by Baek et al. <ref type="bibr" target="#b9">[10]</ref> and Boukhayma et al. <ref type="bibr" target="#b10">[11]</ref> used parametric hand models such as MANO <ref type="bibr" target="#b11">[12]</ref> to match hand shapes to images. Later methods, such as the already mentioned PyMAF-X <ref type="bibr" target="#b7">[8]</ref>, moved away from predefined models, directly predicting the 3D shape of the hand point by point, allowing for greater detail and flexibility.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Sign language datasets</head><p>While several datasets exist for various sign languages, there remains a need for more extensive and diverse resources, especially for the Italian language. Notable sign language datasets include:</p><p>• RWTH-Phoenix-2014T: A German Sign Language (DGS) dataset with approximately 11 hours of content <ref type="bibr" target="#b12">[13]</ref>. • Boston104: An American Sign Language (ASL) dataset with about 9 hours of video <ref type="bibr" target="#b13">[14]</ref>. • How2Sign: A large-scale multimodal ASL dataset with 79 hours of content <ref type="bibr" target="#b14">[15]</ref>. • TGLIS-227: A LIS dataset with approximately 19 hours of video <ref type="bibr" target="#b15">[16]</ref>.</p><p>Other LIS datasets, such as those in <ref type="bibr" target="#b16">[17,</ref><ref type="bibr" target="#b17">18]</ref>, are private or partially accessible. Our work aims to complement and extend these existing resources by providing a novel and comprehensive multimodal dataset for LIS, including video, audio, text, and extracted key points.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Gesture generation</head><p>Recent advancements in gesture generation have focused on creating more natural and context-aware movements. Yoon et al. <ref type="bibr" target="#b2">[3]</ref> proposed generating speech gestures using trimodal context, incorporating text, audio, and speaker identity. Their approach highlights the importance of considering multiple modalities for realistic gesture synthesis. Similarly, Yang et al. <ref type="bibr" target="#b3">[4]</ref> introduced DiffuseStyleGesture, a diffusion-based model for generating stylized co-speech gestures, demonstrating the potential of advanced generative models for creating diverse and expressive movements.</p><p>In the context of sign language generation, Shi et al. <ref type="bibr" target="#b18">[19]</ref> developed an open-domain sign language translation system learned from online videos, showcasing the feasibility of generating sign language from large-scale web data. Our Text-to-LIS model builds on these advancements by incorporating iterative refinement and utilizing textual and visual information to generate accurate and expressive LIS gestures.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.4.">Italian Sign Language research</head><p>Research on LIS is growing, but there is still a need for more comprehensive studies and resources. Marchisio et al. <ref type="bibr" target="#b16">[17]</ref> introduced deep learning techniques with data augmentation for LIS recognition. Fagiani et al. <ref type="bibr" target="#b17">[18]</ref> contributed by creating a new LIS database, adding to the resources available for LIS research. Bertoldi et al. <ref type="bibr" target="#b15">[16]</ref> developed a large-scale Italian-LIS parallel corpus, which has been valuable for machine translation and linguistic studies. However, their work primarily focused on text-based representations rather than visual gesture generation.</p><p>Our research extends these efforts by creating a more comprehensive LIS dataset and developing a model specifically designed for generating realistic LIS gestures from textual input. This work bridges the gap between textual representations and visual sign language production, contributing to computational linguistics and assistive technology.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Proposed dataset</head><p>Our proposed dataset is a comprehensive, multimodal collection designed to advance research and application development in Italian Sign Language. It addresses the scarcity of publicly available LIS data and supports various applications, including human movement analysis, nonverbal communication recognition, and understanding human behavior in digital environments.</p><p>The dataset includes approximately 37 hours of LIS content:</p><p>• Video: High-quality video recordings of signers performing LIS during TV news broadcasts, segmented to align with spoken phrases. • Audio: Corresponding audio recordings, including the signer's voice and ambient news sounds.</p><p>• Text: Transcriptions of the spoken content, initially generated using Whisper <ref type="bibr" target="#b19">[20]</ref> and manually corrected for accuracy. • Key points: Body and hand joint positions, stored in pickle file format for each frame of the videos.</p><p>The segmented videos were generated based on transcriptions produced by Whisper. To streamline the automated process, no preprocessing was applied to the transcription output. As noted qualitatively, glossary extraction techniques, common in many datasets, were not applied as they can potentially decrease the deaf community's understanding of the movement. In this dataset, a whole sentence is considered text. We utilized a fully automated web scraping mechanism to gather LIS news broadcast videos from multiple platforms, primarily YouTube, while ensuring compliance with privacy regulations. This approach allowed us to collect diverse signers and contexts, enhancing the dataset's diversity and representativeness. For key point extraction, we employed two state-of-the-art techniques:</p><p>• Hybrik-X <ref type="bibr" target="#b20">[21]</ref>: Known for its accuracy and robustness, Hybrik-X is optimized for real-time execution on mobile devices and performs well in high-detail scenarios. • HaMeR <ref type="bibr" target="#b21">[22]</ref>: HaMeR reconstructs a 3D hand mesh from a single RGB image, utilizing a Vision Transformer (ViT) <ref type="bibr" target="#b22">[23]</ref> for detailed hand pose estimation.</p><p>The extracted key points were normalized using the SMPL-X model <ref type="bibr" target="#b8">[9]</ref>, ensuring consistency between the body and hand models. Figure <ref type="figure" target="#fig_2">2</ref> shows an overview of the multiple modalities collected in our dataset.</p><p>Compared to current state-of-the-art sign language datasets, our LIS dataset stands out in its multimodal nature and substantial duration (see Table <ref type="table">1</ref>). While other LIS datasets exist, they are often either private or limited in accessibility <ref type="bibr" target="#b16">[17,</ref><ref type="bibr" target="#b17">18]</ref>. We aim to continually expand this dataset to enhance its utility for LIS research.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Speech Signal Italian Transcription</head><p>Il comitato scientifico che si occuperà della promozione e valorizzazione del monumento... </p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Sign Video</head><note type="other">Pose Keypoints</note></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 1</head><p>Overview of publicly available sign language datasets, including ours.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Dataset Language Duration (h) Modalities</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Multiview Transcription Gloss Pose Depth Speech</head><formula xml:id="formula_0">RWTH-Phoenix-2014T DGS 11 ✗ ✓ ✗ ✓ ✓ ✗ Boston104 ASL ≈ 9 ✗ ✓ ✓ ✗ ✗ ✗ How2Sign ASL 79 ✓ ✓ ✓ ✓ ✓ ✓ TGLIS-227 LIS ≈ 19 ✗ ✓ ✗ ✓ ✗ ✓ LIS (ours) LIS ≈ 37 ✗ ✓ ✗ ✓ ✗ ✓</formula></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Proposed method</head><p>This section presents our Text-to-LIS model for automatic gesture generation based on textual descriptions. Our approach builds on the work of Zhang et al. <ref type="bibr" target="#b23">[24]</ref>, employing an iterative refinement process to generate a sequence of poses from textual input generated by automatic transcription <ref type="bibr" target="#b19">[20]</ref>. The key innovation of our method lies in its ability to progressively enhance pose quality through multiple refinement steps, leveraging both textual and positional information.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.">Model architecture</head><p>The core components of our Text-to-LIS model, shown in Fig. <ref type="figure" target="#fig_1">1</ref>, include:</p><p>• The process is iterated, with each iteration taking the output from the previous step as its new input. This iterative approach attempts to translate sentences into fluid and accurate movements effectively.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2.">Iterative refinement process</head><p>The iterative refinement process is the heart of our Text-to-LIS model. It works similarly to an artist creating a painting, starting with a rough sketch and gradually refining it through multiple steps until a detailed work of art emerges. The process begins with a textual input (a description of a gesture in LIS) and an initial generic pose, which serves as a foundation. From this starting point, the model iterates through a series of refinements, progressively improving the pose. At each iteration, the text and current pose are processed by their respective encoders, allowing the model to "understand" both the description and the current pose. The step encoder keeps track of the progress made so far, integrating information from previous refinements. Based on this understanding, the model outputs an improved pose version. This process repeats over several iterations, with each cycle producing a more accurate and detailed representation of the LIS gesture described in the text. This gradual refinement allows the model to capture subtle nuances and correct errors step by step, leading to more natural and expressive gesture generation.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.3.">Training procedure</head><p>Training our Text-to-LIS model involves strategies to facilitate effective learning, creativity, and precision. Two key techniques used during training are:</p><p>• Teacher forcing <ref type="bibr" target="#b24">[25]</ref>: Similar to guiding an apprentice artist, this technique alternates between allowing the model to make its predictions and providing the correct pose for the next step. This approach enables the model to learn from independent attempts and supervised guidance, improving its ability to generate accurate poses. • Controlled noise injection: To improve robustness and flexibility, we introduce random variations (or "noise") into the poses during training. This involves adding small Gaussian noise to joint positions. This is akin to practicing under different conditions-such as using different brushes or lighting in art-which helps prevent overfitting and encourages the model to learn the underlying structure of LIS gestures.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Preliminary results and future work</head><p>We conducted exploratory experiments training the model for 200 epochs with a batch size of 16. Our analyses were performed on a subset of the dataset, consisting of approximately six thousand videos, each with an average duration of ten seconds. Table <ref type="table" target="#tab_1">2</ref> summarizes the hyperparameters used during model training and evaluation. Two loss functions were employed to evaluate the model's performance: the MPJPE (Mean per Joint Position Error) and a refined loss specifically designed to account for the model's confidence in each predicted pose point. To define this loss function, first, the squared error between the ground truth pose 𝑃 𝑗 𝑖 and the predicted pose 𝑃 ^𝑗 𝑖 is calculated:</p><formula xml:id="formula_1">𝐸 𝑗 𝑖 = ‖𝑃 ^𝑗 𝑖 − 𝑃 𝑗 𝑖 ‖ 2 . (<label>1</label></formula><formula xml:id="formula_2">)</formula><p>This error is then weighted by a confidence vector 𝐶 𝑗 𝑖 that represents the model's certainty about each predicted joint position, leading to the loss function:</p><formula xml:id="formula_3">𝐿 𝑗 𝑖 = 𝐶 𝑗 𝑖 ‖𝑃 ^𝑗 𝑖 − 𝑃 𝑗 𝑖 ‖ 2 . (<label>2</label></formula><formula xml:id="formula_4">)</formula><p>This loss function enables the model to prioritize joints with higher confidence while assigning less weight to uncertain predictions. Finally, the mean weighted error is calculated and normalized, yielding the final refined loss: where 𝑁 represents the number of the samples in the batch multiplied by the number of the joints in each pose 𝐽. A key feature of this loss is the normalization based on the number of model steps 𝑆, computed with the logarithmic function log(𝑆 + 1). The MPJPE is a widely used metric for assessing the accuracy of 3D pose estimation. It quantifies the average discrepancy between predicted and actual joint positions across all samples:</p><formula xml:id="formula_5">𝐿 refined = 1 𝑁 • 𝐽 𝑁 ∑︁ 𝑖=1 𝐽 ∑︁ 𝑗=1 𝐿 𝑗 𝑖 • log(𝑆 + 1),<label>(3)</label></formula><formula xml:id="formula_6">MPJPE = 1 𝑁 • 𝐽 𝑁 ∑︁ 𝑖=1 𝐽 ∑︁ 𝑗=1 ‖𝑃 ^𝑗 𝑖 − 𝑃 𝑗 𝑖 ‖ 2 ,<label>(4)</label></formula><p>where 𝑃 ^𝑗 𝑖 represents the predicted 3D coordinate for joint 𝑗 of sample 𝑖, 𝑃 𝑗 𝑖 is the corresponding ground truth, 𝑁 is the number of samples, and 𝐽 is the number of joints per sample.</p><p>We conducted experiments comparing different configurations to determine the optimal number of refinement steps. As shown in Table <ref type="table" target="#tab_2">3</ref>, increasing the number of refinement steps significantly improves the quality of the generated poses. The improvement was most pronounced up to ten refinement passes, after which further increases produced diminishing returns and significantly increased the generation time. Specifically, with ten refinement passes, the optimal balance between the generated poses' accuracy and the model's computational demands was observed. Each refined step took approximately a few seconds when training the model on a GeForce RTX 4090 graphics card.</p><p>The preliminary results demonstrate the effectiveness of the Text-to-LIS model in generating realistic LIS poses from textual descriptions. The model's iterative refinement approach produces high-quality poses, as evidenced by qualitative evaluation. These results (Fig. 
<ref type="figure" target="#fig_5">3</ref>) indicate the model's potential as a valuable tool for enhancing digital human interactions, virtual reality environments, and nonverbal communication systems.</p><p>While the results are promising, several avenues for further research and development remain. Expanding the dataset with more diverse signers, gestures, and contexts is essential to improve the model's generalization capabilities. On the technical side, investigating advanced attention mechanisms and temporal modules may help the model better capture long-term dependencies and subtle nuances in gestures. Real-time sign language generation is another critical goal for practical applications,  and techniques like model pruning and quantization could reduce computational complexity without sacrificing accuracy. Given that sign language communication is inherently multimodal, future work should also focus on integrating hand gestures, facial expressions, and body language into a unified model to generate more natural and expressive LIS gestures. Moreover, although the current focus is on LIS, the techniques developed in this research could be adapted to other sign languages or nonverbal communication systems, broadening the scope and impact of this work.</p><p>Finally, collaboration with the deaf community, linguists, and technologists will be essential to ensure that our advancements are both technically sound and socially impactful.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: Pipeline of the Text-to-LIS framework. The red section shows the human pose estimation system for generating pseudo-ground truth. The gray section depicts the Text-to-LIS model for generating LIS motion from text. The orange section indicates the metrics for calculating the loss and improving pose quality and accuracy.</figDesc><graphic coords="2,355.86,259.96,67.59,90.12" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: Overview of our LIS dataset, comprising approximately 37 hours of LIS videos, including multiple modalities such as video, audio, text, and key points.</figDesc><graphic coords="5,431.67,90.79,72.35,95.74" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head></head><label></label><figDesc>Text encoder: A Transformer-based encoder that processes text embeddings to generate a dense representation of the input text. It uses multi-head attention mechanisms and feed-forward neural networks to capture the contextual relationships between tokens. The text encoder receives the corresponding phrase of the LIS gesture as its input. • Pose encoder: A Transformer-based encoder designed to handle the sequence of poses. This encoder applies attention mechanisms to the current state of the gesture (which, in the initial iteration, is a generic starting pose) to represent the directional matrices. • Pose-text encoder: This component combines and processes the joint information from text and pose data. • Step encoder: A small neural network representing the current iterative process step. It refines this representation with embedding layers, integrating information from previous steps to inform subsequent pose adjustments. • Projection module: This module transforms hidden representations into final poses, mapping the refined poses back into the appropriate output space.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5"><head>Figure 3 :</head><label>3</label><figDesc>Figure 3:The example illustrates the generation of LIS poses from textual input. The input is taken from the text "in maniera gravissima un altro ragazzo, è avvenuto martedì sera di fronte al santuario di Fosciandora, il dolore di tutta la comunità" (in english: "very serious way another boy, happened Tuesday night in front of the sanctuary of Fosciandora, the pain of the whole community"). The Text-to-LIS model was employed to generate the LIS poses.</figDesc><graphic coords="8,129.35,458.00,83.46,110.44" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 2</head><label>2</label><figDesc>Hyperparameters used in model training.</figDesc><table><row><cell cols="2">Category Hyperparameter</cell><cell>Value</cell></row><row><cell>Generic</cell><cell>Seed Batch size</cell><cell>42 16</cell></row><row><cell></cell><cell>Max sequence size</cell><cell>10000</cell></row><row><cell>Sequence</cell><cell>Noise epsilon</cell><cell>1e-4</cell></row><row><cell></cell><cell>Sequence length weight in loss calculation</cell><cell>2e-5</cell></row><row><cell></cell><cell>Dimension of hidden encoder</cell><cell>128</cell></row><row><cell></cell><cell># Text encoder layers</cell><cell>2</cell></row><row><cell>Model</cell><cell># Pose encoder layers</cell><cell>4</cell></row><row><cell></cell><cell># Pose refinement steps</cell><cell>10</cell></row><row><cell></cell><cell>Encoder feed-forward size</cell><cell>2048</cell></row><row><cell cols="2">Optimizer Adam learning rate</cell><cell>1e-3</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_2"><head>Table 3</head><label>3</label><figDesc>Comparison of the number of refinement steps and the quality of the generated poses.</figDesc><table><row><cell cols="3"># Steps Refined loss (Train) MPJPE (Test)</cell></row><row><cell>1</cell><cell>0.07</cell><cell>0.20</cell></row><row><cell>10</cell><cell>0.12</cell><cell>0.10</cell></row></table></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>This research was supported by a PhD fellowship awarded to Emanuele Colonna, funded under the Italian National Recovery and Resilience Plan (D.M. n. 117/23), Mission 4, Component 2, Investment 3.3. The PhD project, titled "Study of AI Techniques for Efficient Generation of Digital Humans and 3D Environments" (CUP H91I23000690007), is co-funded by QuestIT S.r.l. Additionally, this research was partially supported by the UNIBA-MAML (Microsoft Azure Machine Learning) agreement.</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Preface to the Eighth Workshop on Natural Language for Artificial Intelligence (NL4AI)</title>
		<author>
			<persName><forename type="first">G</forename><surname>Bonetta</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">D</forename><surname>Hromei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Siciliani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">A</forename><surname>Stranisci</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Eighth Workshop on Natural Language for Artificial Intelligence (NL4AI 2024) co-located with 23th International Conference of the Italian Association for Artificial Intelligence (AI*IA 2024)</title>
				<meeting>the Eighth Workshop on Natural Language for Artificial Intelligence (NL4AI 2024) co-located with 23th International Conference of the Italian Association for Artificial Intelligence (AI*IA 2024)</meeting>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">A systematic review on digital human models in assembly process planning</title>
		<author>
			<persName><forename type="first">M.-Y</forename><surname>Yin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J.-G</forename><surname>Li</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">The International Journal of Advanced Manufacturing Technology</title>
		<imprint>
			<biblScope unit="volume">125</biblScope>
			<biblScope unit="page" from="1037" to="1059" />
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Speech gesture generation from the trimodal context of text, audio, and speaker identity</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Yoon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Cha</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J.-H</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Jang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Lee</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">ACM Transactions on Graphics (TOG)</title>
		<imprint>
			<biblScope unit="volume">39</biblScope>
			<biblScope unit="page" from="1" to="16" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">Diffusestylegesture: Stylized audio-driven co-speech gesture generation with diffusion models</title>
		<author>
			<persName><forename type="first">S</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Hao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Bao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Cheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Xiao</surname></persName>
		</author>
		<idno type="DOI">10.24963/ijcai.2023/650</idno>
		<ptr target="https://doi.org/10.24963/ijcai.2023/650.doi:10.24963/ijcai.2023/650" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, IJCAI-23, International Joint Conferences on Artificial Intelligence Organization</title>
				<meeting>the Thirty-Second International Joint Conference on Artificial Intelligence, IJCAI-23, International Joint Conferences on Artificial Intelligence Organization</meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="5860" to="5868" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Collaborative Regression of Expressive Bodies using Moderation</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Feng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Choutas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Bolkart</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Tzionas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">J</forename><surname>Black</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">International Conference on 3D Vision (3DV)</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">Accurate 3D Hand Pose Estimation for Whole-Body 3D Human Mesh Estimation</title>
		<author>
			<persName><forename type="first">G</forename><surname>Moon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Choi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">M</forename><surname>Lee</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Computer Vision and Pattern Recognition Workshop (CVPRW)</title>
				<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">FrankMocap: A Monocular 3D Whole-Body Pose Estimation System via Regression and Integration</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Rong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Shiratori</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Joo</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">IEEE International Conference on Computer Vision Workshops</title>
				<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Pymaf-x: Towards well-aligned full-body model regression from monocular images</title>
		<author>
			<persName><forename type="first">H</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Tian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>An</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Liu</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Transactions on Pattern Analysis and Machine Intelligence</title>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Expressive Body Capture: 3D Hands, Face, and Body from a Single Image</title>
		<author>
			<persName><forename type="first">G</forename><surname>Pavlakos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Choutas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Ghorbani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Bolkart</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">A A</forename><surname>Osman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Tzionas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">J</forename><surname>Black</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings IEEE Conf. on Computer Vision and Pattern Recognition (CVPR)</title>
				<meeting>IEEE Conf. on Computer Vision and Pattern Recognition (CVPR)</meeting>
		<imprint>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">Pushing the envelope for rgb-based dense 3d hand pose estimation via neural rendering</title>
		<author>
			<persName><forename type="first">S</forename><surname>Baek</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><forename type="middle">I</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T.-K</forename><surname>Kim</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</title>
				<meeting>the IEEE/CVF Conference on Computer Vision and Pattern Recognition</meeting>
		<imprint>
			<date type="published" when="2019">2019</date>
			<biblScope unit="page" from="1067" to="1076" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">3d hand shape and pose from images in the wild</title>
		<author>
			<persName><forename type="first">A</forename><surname>Boukhayma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">D</forename><surname>Bem</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><forename type="middle">H</forename><surname>Torr</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</title>
				<meeting>the IEEE/CVF Conference on Computer Vision and Pattern Recognition</meeting>
		<imprint>
			<date type="published" when="2019">2019</date>
			<biblScope unit="page" from="10843" to="10852" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Embodied hands: modeling and capturing hands and bodies together</title>
		<author>
			<persName><forename type="first">J</forename><surname>Romero</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Tzionas</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">J</forename><surname>Black</surname></persName>
		</author>
		<idno type="DOI">10.1145/3130800.3130883</idno>
		<ptr target="https://doi.org/10.1145/3130800.3130883" />
	</analytic>
	<monogr>
		<title level="j">ACM Trans. Graph</title>
		<imprint>
			<biblScope unit="volume">36</biblScope>
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<monogr>
		<title level="m" type="main">Extensions of the Sign Language Recognition and Translation Corpus RWTH-PHOENIX-Weather</title>
		<author>
			<persName><forename type="first">J</forename><surname>Forster</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Schmidt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Koller</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Bellgardt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Ney</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2014">2014</date>
			<publisher>LREC</publisher>
			<biblScope unit="page" from="1911" to="1916" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">Speech recognition techniques for a sign language recognition system</title>
		<author>
			<persName><forename type="first">P</forename><surname>Dreuw</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Rybach</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Deselaers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Zahedi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Ney</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Interspeech</title>
		<imprint>
			<date type="published" when="2007">2007</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">How2sign: a large-scale multimodal dataset for continuous american sign language</title>
		<author>
			<persName><forename type="first">A</forename><surname>Duarte</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Palaskar</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Ventura</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Ghadiyaram</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Dehaan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Metze</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Torres</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Giro-I Nieto</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</title>
				<meeting>the IEEE/CVF conference on computer vision and pattern recognition</meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="2735" to="2744" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">On the creation and the annotation of a large-scale italian-lis parallel corpus</title>
		<author>
			<persName><forename type="first">N</forename><surname>Bertoldi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Tiotto</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Prinetto</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Piccolo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Nunnari</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Lombardo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mazzei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Damiano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Lesmo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Principe</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">International Conference on Language Resources and Evaluation</title>
				<imprint>
			<date type="published" when="2010">2010</date>
			<biblScope unit="page" from="19" to="22" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">Introducing Deep Learning with Data Augmentation and Corpus Construction for LIS</title>
		<author>
			<persName><forename type="first">M</forename><surname>Marchisio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mazzei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Sammaruga</surname></persName>
		</author>
		<ptr target="https://api.semanticscholar.org/CorpusID:266726316" />
	</analytic>
	<monogr>
		<title level="m">Italian Conference on Computational Linguistics</title>
				<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<title level="m" type="main">A New Italian Sign Language Database</title>
		<author>
			<persName><forename type="first">M</forename><surname>Fagiani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Squartini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Principi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Piazza</surname></persName>
		</author>
		<idno type="DOI">10.1007/978-3-642-31561-9_18</idno>
		<imprint>
			<date type="published" when="2012">2012</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">Open-Domain Sign Language Translation Learned from Online Video</title>
		<author>
			<persName><forename type="first">B</forename><surname>Shi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Brentari</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Shakhnarovich</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Livescu</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">EMNLP</title>
				<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Radford</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">W</forename><surname>Kim</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Brockman</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Mcleavey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Sutskever</surname></persName>
		</author>
		<title level="m">Robust speech recognition via Large-Scale Weak Supervision</title>
				<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<analytic>
		<title level="a" type="main">Hybrik: A hybrid analytical-neural inverse kinematics solution for 3d human pose and shape estimation</title>
		<author>
			<persName><forename type="first">J</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Bian</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Lu</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</title>
				<meeting>the IEEE/CVF Conference on Computer Vision and Pattern Recognition</meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="3383" to="3393" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">Reconstructing hands in 3d with transformers</title>
		<author>
			<persName><forename type="first">G</forename><surname>Pavlakos</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Shan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">I</forename><surname>Radosavovic</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Kanazawa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Fouhey</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Malik</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</title>
				<meeting>the IEEE/CVF Conference on Computer Vision and Pattern Recognition</meeting>
		<imprint>
			<date type="published" when="2024">2024</date>
			<biblScope unit="page" from="9826" to="9836" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<monogr>
		<title level="m" type="main">An image is worth 16x16 words: Transformers for image recognition at scale</title>
		<author>
			<persName><forename type="first">A</forename><surname>Kolesnikov</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Dosovitskiy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Weissenborn</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Heigold</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Uszkoreit</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Beyer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Minderer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Dehghani</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Houlsby</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Gelly</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Unterthiner</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Zhai</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<monogr>
		<author>
			<persName><forename type="first">M</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Cai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Pan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Hong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Guo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2208.15001</idno>
		<title level="m">Motiondiffuse: Text-driven human motion generation with diffusion model</title>
				<imprint>
			<date type="published" when="2022">2022</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b24">
	<analytic>
		<title level="a" type="main">Scheduled sampling for sequence prediction with recurrent Neural networks</title>
		<author>
			<persName><forename type="first">S</forename><surname>Bengio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Vinyals</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Jaitly</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Shazeer</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 28th International Conference on Neural Information Processing Systems -Volume 1, NIPS&apos;15</title>
				<meeting>the 28th International Conference on Neural Information Processing Systems -Volume 1, NIPS&apos;15<address><addrLine>Cambridge, MA, USA</addrLine></address></meeting>
		<imprint>
			<publisher>MIT Press</publisher>
			<date type="published" when="2015">2015</date>
			<biblScope unit="page" from="1171" to="1179" />
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
