<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Recurrent Networks are (Linguistically) Better? An Experiment on Small-LM Training on Child-Directed Speech in Italian</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Achille</forename><surname>Fusco</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">NeTS Lab</orgName>
								<orgName type="institution">IUSS Pavia</orgName>
								<address>
									<addrLine>P.zza Vittoria 15</addrLine>
									<postCode>27100</postCode>
									<settlement>Pavia</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Matilde</forename><surname>Barbini</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">NeTS Lab</orgName>
								<orgName type="institution">IUSS Pavia</orgName>
								<address>
									<addrLine>P.zza Vittoria 15</addrLine>
									<postCode>27100</postCode>
									<settlement>Pavia</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Maria Letizia</forename><surname>Piccini Bianchessi</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">NeTS Lab</orgName>
								<orgName type="institution">IUSS Pavia</orgName>
								<address>
									<addrLine>P.zza Vittoria 15</addrLine>
									<postCode>27100</postCode>
									<settlement>Pavia</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Veronica</forename><surname>Bressan</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">NeTS Lab</orgName>
								<orgName type="institution">IUSS Pavia</orgName>
								<address>
									<addrLine>P.zza Vittoria 15</addrLine>
									<postCode>27100</postCode>
									<settlement>Pavia</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Sofia</forename><surname>Neri</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">NeTS Lab</orgName>
								<orgName type="institution">IUSS Pavia</orgName>
								<address>
									<addrLine>P.zza Vittoria 15</addrLine>
									<postCode>27100</postCode>
									<settlement>Pavia</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Sarah</forename><surname>Rossi</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">NeTS Lab</orgName>
								<orgName type="institution">IUSS Pavia</orgName>
								<address>
									<addrLine>P.zza Vittoria 15</addrLine>
									<postCode>27100</postCode>
									<settlement>Pavia</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Tommaso</forename><surname>Sgrizzi</surname></persName>
							<affiliation key="aff0">
								<orgName type="laboratory">NeTS Lab</orgName>
								<orgName type="institution">IUSS Pavia</orgName>
								<address>
									<addrLine>P.zza Vittoria 15</addrLine>
									<postCode>27100</postCode>
									<settlement>Pavia</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<author role="corresp">
							<persName><forename type="first">Cristiano</forename><surname>Chesi</surname></persName>
							<email>cristiano.chesi@iusspavia.it</email>
							<affiliation key="aff0">
								<orgName type="laboratory">NeTS Lab</orgName>
								<orgName type="institution">IUSS Pavia</orgName>
								<address>
									<addrLine>P.zza Vittoria 15</addrLine>
									<postCode>27100</postCode>
									<settlement>Pavia</settlement>
									<country key="IT">Italy</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">Recurrent Networks are (Linguistically) Better? An Experiment on Small-LM Training on Child-Directed Speech in Italian</title>
					</analytic>
					<monogr>
						<idno type="ISSN">1613-0073</idno>
					</monogr>
					<idno type="MD5">A72235413C076F278FAC7B4CA2DAE1A9</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2025-04-23T17:37+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>LSTM</term>
					<term>Transformers</term>
					<term>Small Language Models (SLM)</term>
					<term>tokenization</term>
					<term>cell state control</term>
					<term>LM evaluation</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Here we discuss strategies and results of a small-sized training program based on Italian child-directed speech (less than 3M tokens) for various network architectures. The rationale behind these experiments <ref type="bibr" target="#b0">[1]</ref> lies in the attempt to understand the effect of this naturalistic training diet on different model architectures. Preliminary findings lead us to conclude that: (i) different tokenization strategies produce mildly significant improvements overall, although segmentation aligns more closely with linguistic intuitions in some cases but not in others; (ii) modified LSTM networks (the eMG-RNN variant) with a single layer and a structurally more controlled cell state perform slightly worse in training loss (compared to standard one- and two-layered LSTM models) but better on linguistically critical contrasts. This suggests that standard loss/accuracy metrics in autoregressive training procedures are linguistically irrelevant and, more generally, misleading, since the best-trained models produce poorer linguistic predictions ([2], pace [3]). Overall, the performance of these models remains significantly lower than that of 7-year-old native-speaker children on the relevant linguistic contrasts we considered <ref type="bibr" target="#b3">[4]</ref>.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>According to the mainstream LLM development pipeline, Transformer-based architectures <ref type="bibr" target="#b4">[5]</ref> outperform sequential models, like LSTMs <ref type="bibr" target="#b5">[6]</ref>, in various NLP tasks. When only small-sized training data are available, optimization becomes necessary <ref type="bibr" target="#b6">[7]</ref>, <ref type="bibr" target="#b7">[8]</ref>, but common optimization techniques neglect three linguistically relevant facts: these models (i) conflate semantic/world knowledge with morpho-syntactic competence, (ii) require unreasonably large amounts of training data compared to the input children receive during language acquisition, and (iii) yield diminishing cognitive/linguistic returns as their performance increases <ref type="bibr" target="#b8">[9]</ref>. In this paper we address these three issues, starting from the observation that while world knowledge exploits all the training data available, the more the better, structural (morpho-syntactic and compositional semantic) knowledge might require a much smaller dataset (from 10 to 100 million words, according to <ref type="bibr" target="#b9">[10]</ref>). We explore this intuition further and, based on a rich literature from the '80s showing that typical child errors are structurally sensitive and never random <ref type="bibr" target="#b10">[11]</ref>, we shape the networks' architecture to bias learning towards plausible structural configurations, possibly preventing these "small" language models (SLMs) from producing wrong linguistic generalizations. We started from a mild revision of the LM training and evaluation pipeline for Italian, including alternative approaches to tokenization based on pseudo-morphological decomposition ( §2.2); we then approached a more structurally-driven update of the cell state in LSTM networks, which we will call eMG-RNN variants ( §2.3); we finally adopted a precise testing benchmark for specific linguistic contrasts in Italian following the BLiMP design <ref type="bibr" target="#b11">[12]</ref> ( §2.4). We will first set the stage ( §2), discuss an alternative tokenization strategy (MorPiece), and propose a simple modification of the LSTM gating system that mimics certain linguistic constraints. We will then describe the relevant experiments we have run ( §3) and draw some conclusions from the observed results ( §4). A general discussion with a description of the next steps will conclude the paper ( §5).</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Revisiting the LM training pipeline</head><p>The LM training pipeline is relatively rigid: after corpus cleaning (i), the data are prepared/optimized for tokenization (ii), then the tokenized input is batched for training autoregressive models (iii), mostly feeding Transformer-based architectures (iv). Once the models are trained, the evaluation step requires assessing them on some standard tasks (v). In the next sub-sections, we identify various criticalities in this pipeline, propose strategies to mitigate these problems and, in the end, train linguistically more informative SLMs.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1.">Corpus creation and cleaning</head><p>The primary data we collected for Italian replicate the plausible linguistic input children may be exposed to during acquisition, in line with <ref type="bibr" target="#b0">[1]</ref>. The corpus consists of about 3M tokens divided into child-directed speech (CHILDES, Italian section), child movie subtitles (from OpenSubtitles), child songs (from the Zecchino D'Oro repository), telephone conversations (VoLIP corpus, <ref type="bibr" target="#b12">[13]</ref>), and fairy tales (all from copyright-expired sources). Simple cleaning consisted of removing children's productions from the CHILDES files as well as any other metalinguistic annotation (speaker identification, headers, time stamps, tags, links, etc.). The size and a rough measure of lexical richness (Type-Token Ratio, TTR) of each section are reported in Table <ref type="table">1</ref>, before and after the cleaning procedure.</p></div>
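<div xmlns="http://www.tei-c.org/ns/1.0"><p>For illustration, the sketch below shows the kind of filtering applied to the CHILDES files. It is a minimal sketch assuming standard CHAT conventions (@ headers, %-prefixed dependent tiers, *SPK: utterance lines); the regular expressions are illustrative, not our exact cleaning script.</p><code>
import re

def clean_chat_file(text):
    """Keep adult utterances; drop the child's productions (*CHI:),
    headers, dependent tiers, and inline annotation."""
    kept = []
    for line in text.splitlines():
        if line.startswith("@") or line.startswith("%"):
            continue  # @Begin/@ID headers, %mor/%gra tiers, etc.
        m = re.match(r"\*([A-Z0-9]{3}):\s*(.*)", line)
        if not m:
            continue
        speaker, utterance = m.groups()
        if speaker == "CHI":
            continue  # remove the child's own productions
        utterance = re.sub(r"\x15[^\x15]*\x15", "", utterance)  # time stamps
        utterance = re.sub(r"\[[^\]]*\]", "", utterance)        # bracketed codes
        kept.append(" ".join(utterance.split()))
    return kept
</code></div>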
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 1</head><p>Corpus profiling before (bc) and after (ac) cleaning.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2.">Tokenization: MorPiece (MoP)</head><p>Popular LLMs use either Byte-Pair Encoding (BPE) <ref type="bibr" target="#b13">[14]</ref>, <ref type="bibr" target="#b14">[15]</ref> or (fast)WordPiece (fWP) <ref type="bibr" target="#b15">[16]</ref> algorithms for tokenization. The simplicity and computational efficiency of these approaches contrast with the limited morphological analysis they provide. In richly inflected languages (e.g., Italian) and agglutinative languages (e.g., Finnish), this might induce linguistically unsound generalizations. Here, we explore a more morphologically informed strategy, inspired by the Tolerance Principle (TP) and Sufficiency Principle (SP) <ref type="bibr" target="#b16">[17]</ref>, aiming to break words into potentially relevant morphemes without relying on morpheme tables <ref type="bibr" target="#b17">[18]</ref>. The experiments we conduct compare the impact of different strategies when integrated into various network architectures. We refer to MorPiece (MoP) as a TP/SP-based strategy, which can be algorithmically described as follows: each token is traversed from left to right to create a "root trie", and from right to left to create an "inflectional trie" <ref type="bibr" target="#b18">[19]</ref>. Each time a node N of the trie is traversed (corresponding to the current character path in the word), the frequency counter associated with this node (N_c) is updated (+1). Nodes corresponding to token endings (characters before white spaces or punctuation) are flagged. Once both tries are created, the optimization procedure explores each descendant and, for every daughter node D_k, its frequency k is compared to H_N, the approximation of the harmonic number for N used both in TP and SP <ref type="bibr" target="#b16">[17]</ref>, where c is the frequency of the mother node N_c:</p><formula xml:id="formula_0">H_N = c / ln(c) (F1)</formula><p>If k &gt; H_N and c ≠ k, a productive boundary break is postulated (based on the inference that, since there are different continuations and some of them are productive, i.e. sufficiently frequent according to SP, those might be real independent morphemes). We can then check whether this break respects H_D for the relevant nodes D_j and N_i in the "inflectional trie": there must exist a path where the frequency i of the daughter node N_i (in the "inflectional trie" the dependency between D and N is reversed) is lower than j/ln(j), where j is the frequency of the mother node D_j. If this is the case, the continuation is not considered "an exception" in the sense of TP <ref type="bibr" target="#b16">[17]</ref>, suggesting that the continuation is, in fact, a productive independent morpheme. A "++" root node is then activated, the node D_k linked to it, and so on recursively, following the FastWordPiece tokenization strategy <ref type="bibr" target="#b19">[20]</ref>.</p><p>During recognition, the LinMaxMatch identification approach is adopted, as in FastWordPiece. Figure <ref type="figure" target="#fig_0">1</ref> illustrates the relevant morpheme breaks (indicated as "||") obtained by applying this morpheme-breaking procedure to the root and infl trie fragments. 
Various parametric controls have been considered to tune this procedure: (i) a branching factor (bf) parameter that excludes nodes with an excessively high number (&gt; bf) of continuations (the rationale being that when too many continuations are present, they are unlikely to correspond to inflections; this often happens near the root of each trie); (ii) a cutoff parameter indicating the lower frequency boundary for a mother node (this is necessary to ensure a minimum number of observations; for example, if cutoff = 5, we exclude from the "root" trie any branching daughter with a frequency &lt; 5). As in BPE, a minimum frequency control for tokens is also implemented to exclude infrequent dictionary entries. Consider the word "cerca" ("to search for") represented in the "root" trie. In the last "c-a" segment, the relation between H_c and the frequency of "a" indicates that a break might exist between the nodes "c" (frequency = 1813) and "a" (frequency = 1307), since H_c = 1813/ln(1813) ≈ 241.6 and 1307 &gt; H_c. This hypothesis is confirmed by the check at the relevant "infl" "a-c" segment ("a" frequency = 10121, "c" frequency = 466619): 10121 &lt; 466619/ln(466619) ≈ 35756. If H_c had been greater than the frequency of "a", no segmentation advantage would have been observable.</p><p>The proposed algorithm has linear time complexity (O(2n), i.e., O(n)), as each trie must be explored deterministically exactly once to evaluate the H_N/H_D frequency relations. The best linguistic results (relatively linguistically coherent segmentations) for our Italian corpus were obtained with cutoff = 100 and bf = 10. We found it unnecessary to filter the proposed inflectional breaks with the infl trie double check (TP), since the LinMaxMatch strategy already efficiently filtered out initially overestimated breaks. However, as an anonymous reviewer correctly pointed out, this strategy does not guarantee full coverage of every token in our training corpus (in contrast to BPE, for instance). We acknowledge this limitation, but we emphasize that our goal was to produce a smaller, potentially more efficient lexicon. In our experiments, while BPE generated a lexicon of 96,028 tokens (67,169 when the minimum lexical frequency was set to 2), MoP produced a lexicon of just 55,049 tokens (cutoff = 100, bf = 10).</p></div>
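<div xmlns="http://www.tei-c.org/ns/1.0"><p>A minimal sketch of the two tries and of the (F1) productivity test follows, using the "cerca" frequencies above as a usage check; the cutoff/bf controls, the "++" root activation, and LinMaxMatch recognition are omitted, and all names are ours.</p><code>
import math

def build_trie(tokens, reverse=False):
    # Each node stores its traversal frequency (N_c) and an end-of-token flag;
    # reverse=True builds the right-to-left "inflectional" trie.
    root = {"count": 0, "children": {}, "end": False}
    for tok in tokens:
        node = root
        for ch in (tok[::-1] if reverse else tok):
            node = node["children"].setdefault(
                ch, {"count": 0, "children": {}, "end": False})
            node["count"] += 1
        node["end"] = True
    return root

def productive_break(c, k):
    # F1: postulate a boundary when the daughter frequency k exceeds
    # H_N = c / ln(c), with c the mother frequency and c != k.
    return c &gt; 1 and k != c and k &gt; c / math.log(c)

# "cerca" example from the text: the root trie suggests a break before "a" ...
assert productive_break(1813, 1307)        # 1307 &gt; 1813/ln(1813) ≈ 241.6
# ... and the "infl" check does not treat the continuation as an exception:
assert 10121 &lt; 466619 / math.log(466619)   # ≈ 35756, as reported above
</code></div>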
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3.">Revisiting the LSTM architecture</head><p>Despite the many variants of the standard LSTM architecture, notably Gated Recurrent Units <ref type="bibr" target="#b20">[21]</ref> and LSTMs augmented with peephole connections <ref type="bibr" target="#b21">[22]</ref>, and the discouraging equivalence results for these variations <ref type="bibr" target="#b22">[23]</ref>, we observe a recent revival of RNN-based model architectures <ref type="bibr" target="#b23">[24]</ref>. We believe, in fact, that the core intuition behind the LSTM architecture may be linguistically relevant and worth exploring further, although generally more performant models (for instance, in terms of the GLUE benchmark, <ref type="bibr" target="#b24">[25]</ref>) are usually preferred <ref type="bibr" target="#b25">[26]</ref>. The linguistic intuition is that the "long-term memory" (cell state C in Figure <ref type="figure" target="#fig_3">2</ref>) in LSTM networks could effectively model various types of non-local dependencies using a single mechanism. Linguistically speaking, filler-gap dependencies (1) and co-referential dependencies (2) are both "non-local dependencies", but they are subject to non-identical locality conditions: (2) a. [il panino]_i, chi credi che lo_i abbia mangiato? the sandwich, who (you) believe it has eaten? b. *[il panino]_i, chi credi che _i abbia mangiato? the sandwich, who (you) believe has eaten? 'the sandwich, who do you believe has eaten *(it)?'</p><p>While both dependencies require C(onstituent)-command generalizations to be captured <ref type="bibr" target="#b26">[27]</ref>, the adjunct island in (1), <ref type="bibr" target="#b27">[28]</ref>, but not the clitic left-dislocation in (2), <ref type="bibr" target="#b28">[29]</ref>, can, for instance, be licensed by a(n extra) gap, (1)b'. Aware of these differences, we decided to simply alter the gating system to allow the LSTM to create distinct pathways: one to "merge" new tokens, the other to decide whether a long-distance dependency is necessary and, subsequently, to "move" the relevant items <ref type="bibr" target="#b29">[30]</ref>. The processing implementation of these operations is inspired by the expectation-based Minimalist Grammars formalism, eMG <ref type="bibr" target="#b30">[31]</ref>, hence the name eMG-RNN. Following this implementation, merge applies incrementally, token by token, and move means "retain in memory". In more detail, the cell of an eMG-RNN network performs the forward processing described in the computational graph in Figure <ref type="figure" target="#fig_3">2</ref>: (i) the input at time t (x_t) is linearly transformed to a lower-dimension vector (E, loosely used for "embedding"), then concatenated (C) with the previous hidden state/output, if any (h_{t-1}). Two pathways, both transformed using a sigmoid function (σ), lead to the move gate on the one hand and to the merge gate on the other. In the first case, the result of the sigmoid transformation is multiplied (⊙, the Hadamard product) with the input (this either erases or allows some component of the original vector to be added (+) to the previous, if any, context/cell state (c_{t-1}), as in the LSTM forget gate). The merge gate, in the other direction, will privilege the new token if the result of the sigmoid combination of the incoming token and the previous hidden state is low; otherwise (1 - this activation, as in the GRU update gate), it will favor the items in the context/cell state (transformed through a tanh function to simulate memory decay). 
This architecture proved the most performant among the various alternatives tested for the BabyLM 2024 challenge <ref type="bibr" target="#b32">[32]</ref>.</p></div>
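<div xmlns="http://www.tei-c.org/ns/1.0"><p>A minimal PyTorch sketch of the cell's forward pass, as described above, is given below; the exact dimensions and the extra projection aligning E with the cell state are implementation choices not fixed by the prose, and all parameter names are ours.</p><code>
import torch
import torch.nn as nn

class EMGRNNCell(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size):
        super().__init__()
        self.embed = nn.Linear(input_size, embed_size)   # linear map from x_t to E
        self.move  = nn.Linear(embed_size + hidden_size, hidden_size)
        self.merge = nn.Linear(embed_size + hidden_size, hidden_size)
        self.proj  = nn.Linear(embed_size, hidden_size)  # align E with c/h

    def forward(self, x_t, h_prev, c_prev):
        e = self.embed(x_t)                  # lower-dimensional embedding E
        z = torch.cat([e, h_prev], dim=-1)   # concatenation with h_{t-1}
        mv = torch.sigmoid(self.move(z))     # "move" gate
        mg = torch.sigmoid(self.merge(z))    # "merge" gate
        # move: erase/keep components of the input, add them to the cell state
        c_t = c_prev + mv * self.proj(e)
        # merge: a low gate favors the new token, a high gate the decayed context
        h_t = (1 - mg) * self.proj(e) + mg * torch.tanh(c_t)
        return h_t, c_t
</code></div>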
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.4.">A linguistically informed evaluation</head><p>The last step in the pipeline requires a linguistically advanced set of oppositions to verify that structural generalizations are captured coherently. We adopted the lm-eval package <ref type="bibr" target="#b33">[33]</ref> and included a specific task modeled on the English BLiMP <ref type="bibr" target="#b11">[12]</ref>. Most of the contrasts are derived from the COnVERSA test <ref type="bibr" target="#b3">[4]</ref>. They consist of minimal pairs ordered by an increasing complexity metric that considers the number of operations necessary to establish a dependency and the locality of such a dependency. The examples below illustrate this point by comparing a local agreement dependency with, (3)b, or without, (3)a, a (linear) intervener, and a more complex dependency that requires processing an object relative clause (4): Quello che gli studenti ascoltano ("The one who the students listen to") vs. Quello che ascolta gli studenti ("The one who listens to the students"). Four kinds of dependency (agreement, thematic role assignment, pronominal form usage, question formation and answering) are considered for a set of 32 distinct syntactic configurations (a total of 344 minimal pairs to be judged, <ref type="bibr" target="#b3">[4]</ref>).</p></div>
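<div xmlns="http://www.tei-c.org/ns/1.0"><p>Operationally, each minimal pair is judged by comparing sentence-level log-likelihoods, as in BLiMP. Below is a minimal sketch assuming a HuggingFace-style causal-LM interface (the wrapper format we use for lm-eval compatibility, see §3.1).</p><code>
import torch

def judge_pair(model, tokenizer, good, bad):
    """Return True when the model prefers the grammatical sentence,
    i.e. assigns it a higher mean token log-likelihood."""
    def logprob(sentence):
        ids = tokenizer(sentence, return_tensors="pt").input_ids
        with torch.no_grad():
            out = model(input_ids=ids, labels=ids)  # causal-LM loss
        return -out.loss.item()
    return logprob(good) &gt; logprob(bad)

# e.g. pair (3)a: judge_pair(model, tok, "Il piatto è pieno.", "Il piatto è piena.")
</code></div>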
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Materials and Methods</head><p>We trained our models on the IUSS High-Performance Cluster with 2 GPU nodes, each with 4 NVIDIA A100 devices and 1 TB of RAM. Each network was trained on the full corpus using various batching strategies. CUDA drivers v12.4 were used. The most relevant configurations tested are discussed in the next section.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.">Configurations tested</head><p>Three different tokenization strategies (BPE, FastWordPiece, and MorPiece) are compared using the best-performing LSTM network <ref type="bibr" target="#b35">[35]</ref>, which consists of 650 units for the embedding layer and 650 nodes for each of the two hidden layers. Five different network architectures are compared, with the GroNLP GPT-2 small pretrained model <ref type="bibr" target="#b36">[36]</ref> constituting our "top LLM performer". This model was re-adapted to Italian from the English GPT-2 model, which was originally trained on a corpus of approximately 10 billion tokens, that is, several orders of magnitude larger than ours. We then trained on our corpus a comparable bidirectional Transformer (BERT), two LSTM networks with 1 and 2 LSTM layers respectively, and a one-layer eMG-RNN network (Table <ref type="table">2</ref>), as described in §2.3.</p></div>
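<div xmlns="http://www.tei-c.org/ns/1.0"><p>For concreteness, the LSTM configurations just described reduce to the following sketch; the vocabulary size depends on the chosen tokenizer (the BPE figure from §2.2 is used here purely as an example).</p><code>
import torch.nn as nn

EMB, HID = 650, 650   # embedding units and nodes per hidden layer, as above

class LSTMLm(nn.Module):
    def __init__(self, vocab_size, num_layers):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, EMB)
        self.rnn = nn.LSTM(EMB, HID, num_layers=num_layers, batch_first=True)
        self.out = nn.Linear(HID, vocab_size)

    def forward(self, ids):
        h, _ = self.rnn(self.emb(ids))
        return self.out(h)   # next-token logits

lstm_x1 = LSTMLm(vocab_size=67169, num_layers=1)  # e.g. the BPE lexicon (min freq 2)
lstm_x2 = LSTMLm(vocab_size=67169, num_layers=2)
</code></div>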
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 2</head><p>Network architectures</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Results</head><p>Comparing BERT and LSTM architectures, LSTMx1 qualifies as the most performant configuration (both in training and in minimal-pair judgments). Considering training, the only batching regimen performing sufficiently well is the fixed sequence length one (loss = 0.8877 with LSTMx1 vs. conversational loss = 4.0240 or naturalistic loss = 4.5884). All networks reached a learning plateau around 10-12 epochs. Comparing performances on COnVERSA, we realized that the results do not improve after 3 epochs of the fixed-sequence-length (60 tokens) training regimen (a result compatible with the overfitting hypothesis, <ref type="bibr" target="#b37">[37]</ref>). Focusing on tokenizer training results with LSTMx1, we observed that BPE and FastWordPiece have comparable performance. MorPiece performs slightly worse, even though its tokenization seems linguistically more coherent (e.g., "farlo", "to do it", is tokenized by both BPE and fWP as a single token, while MorPiece splits it in two: "far" "+lo") and its training is faster (Table <ref type="table">3</ref>). This, however, only marginally impacts the minimal-pair contrast judgments, with slightly better overall performance just in certain agreement cases.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Table 3 Impact of the tokenization strategy on LSTM training</head><p>We then adopted the BPE tokenizer for the architectural comparisons. Network training performances are summarized in Table <ref type="table" target="#tab_1">4</ref> and graphically represented in Figure <ref type="figure" target="#fig_6">3</ref> for the comparison across linguistic dimensions.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Discussion</head><p>Overall, LSTM networks significantly outperform bidirectional Transformers in this minimal-pair test on Italian. This finding is consistent with results previously discussed in the literature; it suggests a clear advantage of recurrent, sequential model architectures (e.g., LSTM) over bidirectional Transformers in terms of linguistic generalizations <ref type="bibr" target="#b38">[38]</ref> and partially justifies the renewed interest in RNN networks observed in the last couple of years <ref type="bibr" target="#b23">[24]</ref>, <ref type="bibr" target="#b25">[26]</ref>. As far as the tokenization procedure is concerned, it is somewhat premature to draw definitive conclusions from our experiments, as MorPiece has not yet been fully optimized or tested. Specifically, the optimal cutoff threshold and minimum branching factor have not been systematically evaluated. Nevertheless, a more morphologically coherent segmentation is expected to enhance sensitivity to certain minimal contrasts.</p><p>Similarly, the eMG-RNN architecture could be further explored and optimized, particularly with respect to specific contrasts, which may help determine whether our linguistic modeling is on the right track. Evidence to the contrary comes from the judgments of sentences with missing thematic roles, which are often incorrectly preferred by most models, including our eMG-RNN.</p><p>In the end, our results suggest that the Loss/Accuracy performance registered during training is not a significant predictor of performance on the COnVERSA test or, more generally, of the linguistic coherence of the trained LM. Likewise, model size is not a clear predictor either: Transformers trained on the same small dataset perform randomly (around 50% in all dimensions), while eMG-RNN, which has a number of parameters similar to LSTMx2, outperforms both LSTMx2 and LSTMx1 (half the size of eMG-RNN). The training size remains strikingly different from the input received by children: this difference of one order of magnitude suggests that the biases considered in eMG-RNN are not yet satisfactory and that our Language Acquisition Device is still more efficient; in this sense, the Poverty of Stimulus Hypothesis remains unrefuted <ref type="bibr" target="#b39">[39]</ref> by these results. Next steps will include extending the training corpus to 10M tokens (to match the English counterpart <ref type="bibr" target="#b0">[1]</ref>) and further exploring the effects of optimized tokenization procedures and other minimal modifications and optimizations <ref type="bibr" target="#b23">[24]</ref> of recurrent neural networks.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: Visualization of a fragment of the "root" and the "infl(ectional)" trie created by MorPiece on our corpus (cutoff=100, bf=10).</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>( 1 )</head><label>1</label><figDesc>a. cosa_i credi che abbia riposto _i? what (you) believe that (he) shelved? what do you believe he shelved? b. *cosa_i credi che abbia riposto il libro [AdvP senza leggere _i]? b'. cosa_i credi che abbia riposto _i [AdvP senza leggere _i]? what do you believe he shelved (*the book) without reading?</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: eMG-RNN cell computational graph.</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4"><head>( 3 )</head><label>3</label><figDesc>a. Il piatto è pieno. vs. Il piatto è piena. the dish.S.M is full.S.M … full.S.F b. Il muro della casa è rosso. the wall.S.M of the house is red.S.M vs. Il muro della casa è rossa. the wall.S.M of the house is red.S.F (4) Ci sono due maestri. Uno insegna ed è ascoltato dagli studenti, l'altro si riposa. Quale maestro insegna? There are two teachers. One teaches and is listened to by the students; the other rests. Which one teaches? Quello che gli studenti ascoltano. (The one who the students listen to.)</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5"><head></head><label></label><figDesc>(i) Naturalistic: line-by-line, single exposure to each sentence in the corpus (each epoch corresponds to an exposure of about 3M tokens); (ii) Conversational: two sequential lines are used for the input, that is, [line 1, line 2], [line 2, line 3], etc. are batched; this guarantees that a minimal conversational context is provided for each sentence. In this case, each epoch corresponds to an exposure of 6M tokens; (iii) Fixed sequence length: considering the average sentence length of 54 words per sentence, a window of 60 tokens is used, that is, [tok_1, tok_2 … tok_60], [tok_2, tok_3 … tok_61], … are batched; with this regimen, each epoch corresponds to an exposure of 180M tokens. Roughly speaking, the raw amount of data processed by a 7 y.o. child ranges from 7 to 70M tokens [34]; training the networks with a naturalistic or conversational regimen for 3-10 epochs would therefore result in a comparable exposure. We trained the networks using a torch.optim.lr_scheduler (step_size=5, gamma=0.1) and the Adam optimizer (lr=0.001), with 16-bit automatic mixed precision to speed up the (parallel) training, for a maximum of 100 epochs. The networks were implemented in PyTorch (v2.3.1) and wrapped in Transformers structures (v4.42.4) to maximize compatibility with the lm-eval (v0.4.3) environment.</figDesc></figure>
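<div xmlns="http://www.tei-c.org/ns/1.0"><p>A minimal sketch of the three batching regimens described above (tokenization and tensor packing omitted):</p><code>
def naturalistic(lines):
    # (i) one example per corpus line: about 3M tokens per epoch
    return list(lines)

def conversational(lines):
    # (ii) overlapping two-line windows: [line 1, line 2], [line 2, line 3], ...
    return [lines[i] + " " + lines[i + 1] for i in range(len(lines) - 1)]

def fixed_length(token_ids, window=60):
    # (iii) stride-1 sliding window over the whole token stream (about 180M tokens/epoch)
    return [token_ids[i:i + window]
            for i in range(len(token_ids) - window + 1)]
</code></div>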
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_1"><head>Table 4</head><label>4</label><figDesc>Network architectures and their performance on training (Loss/Accuracy) and on the COnVERSA test</figDesc><table><row><cell>Model</cell><cell>Loss/Accuracy</cell><cell>COnVERSA</cell></row><row><cell>GroNLP GPT-2s</cell><cell>n/a (pretrained)</cell><cell>0.73 (±0.02)</cell></row><row><cell>BERT</cell><cell>4.5488/0.65471</cell><cell>0.43 (±0.02)</cell></row><row><cell>LSTMx2</cell><cell>0.7849/0.8283</cell><cell>0.48 (±0.03)</cell></row><row><cell>LSTMx1</cell><cell>0.8784/0.8103</cell><cell>0.52 (±0.03)</cell></row><row><cell>eMG-RNN</cell><cell>0.9491/0.7815</cell><cell>0.61 (±0.01)</cell></row></table></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_6"><head>Figure 3 :</head><label>3</label><figDesc>Figure 3: Performance of the 2 best RNN network variants (LSTMx1 and eMG-RNN with BPE) on COnVERSA compared to 7 y.o. children. [Radar chart, scale 0-1; axes cover the COnVERSA dimensions: DP/Subj-AP/Subj-V agreement (with and without attractors), past participle agreement, auxiliary selection, theta roles, psych verbs, clitic pronouns, reflexives, wh-/why-/polar questions, and person rotation.]</figDesc></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head>Acknowledgments</head><p>This project is partially supported by T-GRA2L: Testing GRAdeness and GRAmmaticality in Linguistics, a PRIN 2022 project funded by Next Generation EU (202223PL4N). National coordinator: CC.</p></div>
			</div>


			<div type="funding">
<div xmlns="http://www.tei-c.org/ns/1.0"><p>0000-0002-5389-8884 (A. Fusco); 0009-0007-7986-2365 (M. Barbini); 0009-0005-8116-3358 (M. L. Piccini Bianchessi); 0000-0003-3072-7967 (V. Bressan); 0009-0003-5456-0556 (S. Neri); 0009-0007-2525-2457 (S. Rossi); 0000-0003-1375-1359 (T. Sgrizzi); 0000-0003-1935-1348 (C. Chesi)</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<monogr>
		<author>
			<persName><forename type="first">A</forename><surname>Warstadt</surname></persName>
		</author>
		<ptr target="https://aclanthology.org/2023.conll-babylm.0" />
		<title level="m">Proceedings of the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning</title>
				<meeting>the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning<address><addrLine>Singapore</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
	<note>: Association for Computational Linguistics</note>
</biblStruct>

<biblStruct xml:id="b1">
	<monogr>
		<title level="m" type="main">Why large language models are poor theories of human linguistic cognition</title>
		<author>
			<persName><forename type="first">R</forename><surname>Katzir</surname></persName>
		</author>
		<idno>lingbuzz/007190</idno>
		<imprint>
			<date type="published" when="2023">2023. 2023</date>
		</imprint>
	</monogr>
	<note>A reply to Piantadosi</note>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">Modern language models refute Chomsky&apos;s approach to language</title>
		<author>
			<persName><forename type="first">S</forename><surname>Piantadosi</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Lingbuzz Preprint, lingbuzz</title>
		<imprint>
			<biblScope unit="volume">7180</biblScope>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<monogr>
		<title level="m" type="main">COnVERSA: Test di Comprensione delle Opposizioni morfo-sintattiche VERbali attraverso la ScritturA</title>
		<author>
			<persName><forename type="first">C</forename><surname>Chesi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Ghersi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">V</forename><surname>Musella</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Musola</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2024">2024</date>
			<publisher>Hogrefe</publisher>
			<pubPlace>Firenze</pubPlace>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<monogr>
		<title level="m" type="main">Attention Is All You Need</title>
		<author>
			<persName><forename type="first">A</forename><surname>Vaswani</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1706.03762</idno>
		<ptr target="http://arxiv.org/abs/1706.03762" />
		<imprint>
			<date type="published" when="2017-12">Dec. 2017. Mar. 26, 2022</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">Long short-term memory</title>
		<author>
			<persName><forename type="first">S</forename><surname>Hochreiter</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Schmidhuber</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Neural computation</title>
		<imprint>
			<biblScope unit="volume">9</biblScope>
			<biblScope unit="issue">8</biblScope>
			<biblScope unit="page" from="1735" to="1780" />
			<date type="published" when="1997">1997</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Not all layers are equally as important: Every Layer Counts BERT</title>
		<author>
			<persName><forename type="first">L</forename><forename type="middle">G G</forename><surname>Charpentier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Samuel</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2023.conll-babylm.20</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning</title>
				<meeting>the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning<address><addrLine>Singapore</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="210" to="224" />
		</imprint>
	</monogr>
	<note>: Association for Computational Linguistics</note>
</biblStruct>

<biblStruct xml:id="b7">
	<monogr>
		<title level="m" type="main">FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness</title>
		<author>
			<persName><forename type="first">T</forename><surname>Dao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><forename type="middle">Y</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Ermon</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Rudra</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Ré</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2205.14135</idno>
		<ptr target="http://arxiv.org/abs/2205.14135" />
		<imprint>
			<date type="published" when="2022-06-23">Jun. 23, 2022. Jun. 12, 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Large GPTlike Models are Bad Babies: A Closer Look at the Relationship between Linguistic Competence and Psycholinguistic Measures</title>
		<author>
			<persName><forename type="first">J</forename><surname>Steuer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Mosbach</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Klakow</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2023.conll-babylm.12</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning</title>
				<meeting>the BabyLM Challenge at the 27th Conference on Computational Natural Language Learning<address><addrLine>Singapore</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2023">2023</date>
			<biblScope unit="page" from="114" to="129" />
		</imprint>
	</monogr>
	<note>: Association for Computational Linguistics</note>
</biblStruct>

<biblStruct xml:id="b9">
	<monogr>
		<title level="m" type="main">When Do You Need Billions of Words of Pretraining Data?</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Warstadt</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H.-S</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">R</forename><surname>Bowman</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2011.04946</idno>
		<ptr target="http://arxiv.org/abs/2011.04946" />
		<imprint>
			<date type="published" when="2020-11-10">Nov. 10, 2020. Jan. 10, 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Structure Dependence in Grammar Formation</title>
		<author>
			<persName><forename type="first">S</forename><surname>Crain</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Nakayama</surname></persName>
		</author>
		<idno type="DOI">10.2307/415004</idno>
	</analytic>
	<monogr>
		<title level="j">Language</title>
		<imprint>
			<biblScope unit="volume">63</biblScope>
			<biblScope unit="issue">3</biblScope>
			<biblScope unit="page">522</biblScope>
			<date type="published" when="1987-09">Sep. 1987</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">BLiMP: The Benchmark of Linguistic Minimal Pairs for English</title>
		<author>
			<persName><forename type="first">A</forename><surname>Warstadt</surname></persName>
		</author>
		<idno type="DOI">10.1162/tacl_a_00321</idno>
	</analytic>
	<monogr>
		<title level="j">Transactions of the Association for Computational Linguistics</title>
		<imprint>
			<biblScope unit="volume">8</biblScope>
			<biblScope unit="page" from="377" to="392" />
			<date type="published" when="2020-12">Dec. 2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">VOLIP: a corpus of spoken Italian and a virtuous example of reuse of linguistic resources</title>
		<author>
			<persName><forename type="first">I</forename><surname>Alfano</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Cutugno</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><forename type="middle">De</forename><surname>Rosa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Iacobini</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Savy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Voghera</surname></persName>
		</author>
		<ptr target="http://www.lrec-conf.org/proceedings/lrec2014/pdf/906_Paper.pdf" />
	</analytic>
	<monogr>
		<title level="m">Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC&apos;14</title>
				<editor>
			<persName><forename type="first">N</forename><surname>Calzolari</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">K</forename><surname>Choukri</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">T</forename><surname>Declerck</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">H</forename><surname>Loftsson</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">B</forename><surname>Maegaard</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Mariani</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">A</forename><surname>Moreno</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">J</forename><surname>Odijk</surname></persName>
		</editor>
		<editor>
			<persName><forename type="first">S</forename><surname>Piperidis</surname></persName>
		</editor>
		<meeting>the Ninth International Conference on Language Resources and Evaluation (LREC&apos;14<address><addrLine>Reykjavik, Iceland</addrLine></address></meeting>
		<imprint>
			<publisher>ELRA</publisher>
			<date type="published" when="2014-05">May 2014</date>
			<biblScope unit="page" from="3897" to="3901" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<monogr>
		<title level="m" type="main">Language Models are Few-Shot Learners</title>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">B</forename><surname>Brown</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2005.14165</idno>
		<ptr target="http://arxiv.org/abs/2005.14165" />
		<imprint>
			<date type="published" when="2020-07">Jul. 2020. Apr. 21, 2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">A new algorithm for data compression</title>
		<author>
			<persName><forename type="first">P</forename><surname>Gage</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">C Users Journal</title>
		<imprint>
			<biblScope unit="volume">12</biblScope>
			<biblScope unit="issue">2</biblScope>
			<biblScope unit="page" from="23" to="38" />
			<date type="published" when="1994">1994</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<monogr>
		<title level="m" type="main">Bert: Pre-training of deep bidirectional transformers for language understanding</title>
		<author>
			<persName><forename type="first">J</forename><surname>Devlin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M.-W</forename><surname>Chang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Lee</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Toutanova</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1810.04805</idno>
		<imprint>
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b16">
	<monogr>
		<title level="m" type="main">The price of linguistic productivity: how children learn to break the rules of language</title>
		<author>
			<persName><forename type="first">C</forename><forename type="middle">D</forename><surname>Yang</surname></persName>
		</author>
		<imprint>
			<date type="published" when="2016">2016</date>
			<publisher>MIT Press</publisher>
			<pubPlace>Cambridge, MA</pubPlace>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<monogr>
		<title level="m" type="main">MorphPiece : A Linguistic Tokenizer for Large Language Models</title>
		<author>
			<persName><forename type="first">H</forename><surname>Jabbar</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2307.07262</idno>
		<ptr target="http://arxiv.org/abs/2307.07262" />
		<imprint>
			<date type="published" when="2024-02-03">Feb. 03, 2024. Jun. 23, 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">Trie memory</title>
		<author>
			<persName><forename type="first">E</forename><surname>Fredkin</surname></persName>
		</author>
		<idno type="DOI">10.1145/367390.367400</idno>
	</analytic>
	<monogr>
		<title level="j">Commun. ACM</title>
		<imprint>
			<biblScope unit="volume">3</biblScope>
			<biblScope unit="issue">9</biblScope>
			<biblScope unit="page" from="490" to="499" />
			<date type="published" when="1960-09">Sep. 1960</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<monogr>
		<title level="m" type="main">Fast WordPiece Tokenization</title>
		<author>
			<persName><forename type="first">X</forename><surname>Song</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Salcianu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Song</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Dopson</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Zhou</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2012.15524</idno>
		<ptr target="http://arxiv.org/abs/2012.15524" />
		<imprint>
			<date type="published" when="2021-10-05">Oct. 05, 2021. Jun. 13, 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<monogr>
		<title level="m" type="main">Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation</title>
		<author>
			<persName><forename type="first">K</forename><surname>Cho</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1406.1078</idno>
		<ptr target="http://arxiv.org/abs/1406.1078" />
		<imprint>
			<date type="published" when="2014-09-02">Sep. 02, 2014. Jun. 12, 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">Recurrent nets that time and count</title>
		<author>
			<persName><forename type="first">F</forename><forename type="middle">A</forename><surname>Gers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Schmidhuber</surname></persName>
		</author>
		<idno type="DOI">10.1109/IJCNN.2000.861302</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE-INNS-ENNS International Joint Conference on Neural Networks. IJCNN 2000. Neural Computing: New Challenges and Perspectives for the New Millennium</title>
				<meeting>the IEEE-INNS-ENNS International Joint Conference on Neural Networks. IJCNN 2000. Neural Computing: New Challenges and Perspectives for the New Millennium<address><addrLine>Como, Italy</addrLine></address></meeting>
		<imprint>
			<publisher>IEEE</publisher>
			<date type="published" when="2000">2000</date>
			<biblScope unit="volume">3</biblScope>
			<biblScope unit="page" from="189" to="194" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<analytic>
		<title level="a" type="main">LSTM: A Search Space Odyssey</title>
		<author>
			<persName><forename type="first">K</forename><surname>Greff</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">K</forename><surname>Srivastava</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Koutník</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><forename type="middle">R</forename><surname>Steunebrink</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Schmidhuber</surname></persName>
		</author>
		<idno type="DOI">10.1109/TNNLS.2016.2582924</idno>
	</analytic>
	<monogr>
		<title level="j">IEEE Trans. Neural Netw. Learning Syst</title>
		<imprint>
			<biblScope unit="volume">28</biblScope>
			<biblScope unit="issue">10</biblScope>
			<biblScope unit="page" from="2222" to="2232" />
			<date type="published" when="2017-10">Oct. 2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<monogr>
		<title level="m" type="main">Were RNNs All We Needed?</title>
		<author>
			<persName><forename type="first">L</forename><surname>Feng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Tung</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><forename type="middle">O</forename><surname>Ahmed</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Bengio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Hajimirsadeghi</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2410.01201</idno>
		<ptr target="http://arxiv.org/abs/2410.01201" />
		<imprint>
			<date type="published" when="2024-10-04">Oct. 04, 2024. Oct. 18, 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b24">
	<monogr>
		<title level="m" type="main">GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding</title>
		<author>
			<persName><forename type="first">A</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Singh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Michael</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Hill</surname></persName>
		</author>
		<author>
			<persName><forename type="first">O</forename><surname>Levy</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><forename type="middle">R</forename><surname>Bowman</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1804.07461</idno>
		<ptr target="http://arxiv.org/abs/1804.07461" />
		<imprint>
			<date type="published" when="2019-02-22">Feb. 22, 2019. Jul. 20, 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b25">
	<monogr>
		<title level="m" type="main">Mamba: Linear-Time Sequence Modeling with Selective State Spaces</title>
		<author>
			<persName><forename type="first">A</forename><surname>Gu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Dao</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2312.00752</idno>
		<ptr target="http://arxiv.org/abs/2312.00752" />
		<imprint>
			<date type="published" when="2024-05-31">May 31, 2024. Oct. 20, 2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b26">
	<monogr>
		<title level="m" type="main">The syntactic domain of anaphora</title>
		<author>
			<persName><forename type="first">T</forename><surname>Reinhart</surname></persName>
		</author>
		<imprint>
			<date type="published" when="1976">1976</date>
			<publisher>Massachusetts Institute of Technology</publisher>
			<pubPlace>Cambridge (MA</pubPlace>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b27">
	<monogr>
		<title level="m" type="main">Constraints on variables in syntax</title>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">R</forename><surname>Ross</surname></persName>
		</author>
		<imprint>
			<date type="published" when="1967">1967</date>
			<pubPlace>Cambridge (MA</pubPlace>
		</imprint>
		<respStmt>
			<orgName>MIT</orgName>
		</respStmt>
	</monogr>
</biblStruct>

<biblStruct xml:id="b28">
	<analytic>
		<title level="a" type="main">A Comparative Analysis of Left and Right Dislocation in Romance</title>
		<author>
			<persName><forename type="first">C</forename><surname>Cecchetto</surname></persName>
		</author>
		<idno type="DOI">10.1111/1467-9582.00039</idno>
	</analytic>
	<monogr>
		<title level="j">Studia Linguistica</title>
		<imprint>
			<biblScope unit="volume">53</biblScope>
			<biblScope unit="issue">1</biblScope>
			<biblScope unit="page" from="40" to="67" />
			<date type="published" when="1999-04">Apr. 1999</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b29">
	<monogr>
		<title level="m" type="main">Merge and the Strong Minimalist Thesis</title>
		<author>
			<persName><forename type="first">N</forename><surname>Chomsky</surname></persName>
		</author>
		<idno type="DOI">10.1017/9781009343244</idno>
		<imprint>
			<date type="published" when="2023">2023</date>
			<publisher>Cambridge University Press</publisher>
		</imprint>
	</monogr>
	<note>1st ed.</note>
</biblStruct>

<biblStruct xml:id="b30">
	<monogr>
		<title level="m" type="main">Expectation-based Minimalist</title>
		<author>
			<persName><forename type="first">C</forename><surname>Chesi</surname></persName>
		</author>
		<imprint/>
	</monogr>
</biblStruct>

<biblStruct xml:id="b31">
	<monogr>
		<idno type="arXiv">arXiv:2109.13871</idno>
		<ptr target="http://arxiv.org/abs/2109.13871" />
		<title level="m">Grammars</title>
				<imprint>
			<date type="published" when="2021-02">Sep. 2021. Nov. 02, 2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b32">
	<analytic>
		<title level="a" type="main">Different Ways to Forget: Linguistic Gates in Recurrent Neural Networks</title>
		<author>
			<persName><forename type="first">C</forename><surname>Chesi</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the BabyLM Challenge at the 28th Conference on Computational Natural Language Learning</title>
		<meeting>the BabyLM Challenge at the 28th Conference on Computational Natural Language Learning</meeting>
		<imprint>
			<date type="published" when="2024">2024</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b33">
	<analytic>
		<title level="a" type="main">A framework for few-shot language model evaluation</title>
		<author>
			<persName><forename type="first">L</forename><surname>Gao</surname></persName>
		</author>
		<idno type="DOI">10.5281/zenodo.10256836</idno>
	</analytic>
	<monogr>
		<title level="j">Zenodo</title>
		<imprint>
			<date type="published" when="2023-12">Dec. 2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b34">
	<analytic>
		<title level="a" type="main">American parenting of language-learning children: Persisting differences in family-child interactions observed in natural home environments</title>
		<author>
			<persName><forename type="first">B</forename><surname>Hart</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><forename type="middle">R</forename><surname>Risley</surname></persName>
		</author>
		<idno type="DOI">10.1037/0012-1649.28.6.1096</idno>
	</analytic>
	<monogr>
		<title level="j">Developmental Psychology</title>
		<imprint>
			<biblScope unit="volume">28</biblScope>
			<biblScope unit="issue">6</biblScope>
			<biblScope unit="page" from="1096" to="1105" />
			<date type="published" when="1992-11">Nov. 1992</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b35">
	<analytic>
		<title level="a" type="main">Colorless Green Recurrent Networks Dream Hierarchically</title>
		<author>
			<persName><forename type="first">K</forename><surname>Gulordava</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Bojanowski</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Grave</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Linzen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Baroni</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/N18-1108</idno>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</title>
		<title level="s">Long Papers</title>
		<meeting>the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies<address><addrLine>New Orleans, Louisiana</addrLine></address></meeting>
		<imprint>
			<date type="published" when="2018-06">Jun. 2018</date>
			<biblScope unit="volume">1</biblScope>
			<biblScope unit="page" from="1195" to="1205" />
		</imprint>
	</monogr>
	<note>Association for Computational Linguistics</note>
</biblStruct>

<biblStruct xml:id="b36">
	<analytic>
		<title level="a" type="main">As Good as New. How to Successfully Recycle English GPT-2 to Make Models for Other Languages</title>
		<author>
			<persName><forename type="first">W</forename><surname>Vries</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Nissim</surname></persName>
		</author>
		<idno type="DOI">10.18653/v1/2021.findings-acl.74</idno>
	</analytic>
	<monogr>
		<title level="m">Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021</title>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="836" to="846" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b37">
	<monogr>
		<title level="m" type="main">To Repeat or Not To Repeat: Insights from Scaling LLM under Token-Crisis</title>
		<author>
			<persName><forename type="first">F</forename><surname>Xue</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Zheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>You</surname></persName>
		</author>
		<idno type="DOI">10.48550/ARXIV.2305.13230</idno>
		<imprint>
			<date type="published" when="2023">2023</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b38">
	<analytic>
		<title level="a" type="main">Using Computational Models to Test Syntactic Learnability</title>
		<author>
			<persName><forename type="first">E</forename><surname>Wilcox</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Futrell</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Levy</surname></persName>
		</author>
		<idno type="DOI">10.1162/ling_a_00491</idno>
	</analytic>
	<monogr>
		<title level="m">Linguistic Inquiry</title>
				<imprint>
			<date type="published" when="2023-04">Apr. 2023</date>
			<biblScope unit="page" from="1" to="44" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b39">
	<analytic>
		<title level="a" type="main">The growth of language: Universal Grammar, experience, and principles of computation</title>
		<author>
			<persName><forename type="first">C</forename><surname>Yang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Crain</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><forename type="middle">C</forename><surname>Berwick</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Chomsky</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><forename type="middle">J</forename><surname>Bolhuis</surname></persName>
		</author>
		<idno type="DOI">10.1016/j.neubiorev.2016.12.023</idno>
	</analytic>
	<monogr>
		<title level="j">Neuroscience &amp; Biobehavioral Reviews</title>
		<imprint>
			<biblScope unit="volume">81</biblScope>
			<biblScope unit="page" from="103" to="119" />
			<date type="published" when="2017-10">Oct. 2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b40">
	<monogr>
		<author>
			<persName><forename type="first">A</forename></persName>
		</author>
		<ptr target="https://github.com/cristianochesi/babylm-2024" />
		<title level="m">Online Resources Resources (corpus information, tokenizer, network architectures and lm_eval tasks)</title>
				<imprint/>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
